├── .dokx
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── INSTALL.md
├── LICENSE
├── PATENTS
├── README.md
├── cmake
│   ├── FindFolly.cmake
│   ├── FindGlog.cmake
│   ├── FindTHPP.cmake
│   └── MultiLevelIncludes.cmake
├── docs.sh
├── fbcunn
│   ├── AbstractParallel.lua
│   ├── BatchNormalization.lua
│   ├── CuBLASWrapper.lua
│   ├── DataParallel.lua
│   ├── FFTCDefs.lua
│   ├── FFTWrapper.lua
│   ├── FeatureLPPooling.lua
│   ├── HalfPrecision.lua
│   ├── LookupTableGPU.lua
│   ├── ModelParallel.lua
│   ├── OneBitDataParallel.lua
│   ├── OneBitQuantization.lua
│   ├── OneBitSGD.lua
│   ├── SpatialBatchNormalization.lua
│   ├── SpatialConvolution.lua
│   ├── SpatialConvolutionCuFFT.lua
│   ├── SpatialConvolutionFBFFT.lua
│   ├── SpatialConvolutionFBFFTGemm.lua
│   ├── SpatialConvolutionFFT.lua
│   ├── SpatialConvolutionFFTTiled.lua
│   ├── SpatialConvolutionFFTTiledAsync.lua
│   ├── SpatialConvolutionFFTTiledIterated.lua
│   ├── SpatialConvolutionFFTTiledSync.lua
│   ├── TemporalConvolutionFB.lua
│   ├── TemporalKMaxPooling.lua
│   └── init.lua
├── rocks
│   └── fbcunn-scm-1.rockspec
├── src
│   ├── BLASParameters.cpp
│   ├── BLASParameters.h
│   ├── BatchNormalization.cu
│   ├── ConvolutionBias.cu
│   ├── ConvolutionBias.cuh
│   ├── CrossMapNormalization.cu
│   ├── CrossMapNormalization.cuh
│   ├── CrossMapNormalizationHost.cpp
│   ├── CuBLASWrapper.cpp
│   ├── CuBLASWrapper.h
│   ├── CuBLASWrapperLua.cpp
│   ├── CudaTensorUtils.cpp
│   ├── CudaTensorUtils.h
│   ├── DeviceTensorUtils-inl.h
│   ├── DeviceTensorUtils.h
│   ├── FeatureLPPooling.cu
│   ├── FeatureLPPooling.cuh
│   ├── FeatureLPPoolingHost.cpp
│   ├── HSM.cu
│   ├── HSMHost.cpp
│   ├── HalfPrec.cpp
│   ├── HalfPrec.h
│   ├── HalfPrecKernels.cu
│   ├── HalfPrecTest.cpp
│   ├── Includes.h
│   ├── InitCuda.cpp
│   ├── LocallyConnected.cuh
│   ├── LocallyConnectedAccGradParameters.cu
│   ├── LocallyConnectedHost.cpp
│   ├── LocallyConnectedUpdateGradInput.cu
│   ├── LocallyConnectedUpdateOutput.cu
│   ├── LookupTableGPU.cu
│   ├── LookupTableGPUHost.cpp
│   ├── LuaUtils.h
│   ├── MM.cu
│   ├── MM.h
│   ├── OneBitQuantization.cu
│   ├── OneBitQuantization.cuh
│   ├── OneBitQuantizationHost.cpp
│   ├── SparseNLLCriterion.cu
│   ├── SparseNLLCriterion.cuh
│   ├── SparseNLLCriterionHost.cpp
│   ├── SpatialBatchNormalization.cu
│   ├── Storage.h
│   ├── TemporalConvolutionFBHost.cpp
│   ├── TemporalConvolutionTBC.cu
│   ├── TemporalConvolutionTBC.cuh
│   ├── TemporalConvolutionTBCHost.cpp
│   ├── TemporalKMaxPooling.cu
│   ├── TemporalKMaxPooling.cuh
│   ├── TemporalKMaxPoolingHost.cpp
│   ├── TemporalMaxPooling.cu
│   ├── Tensor.h
│   ├── Utils.cpp
│   ├── Utils.h
│   ├── WeightedLookupTable.cu
│   ├── WeightedLookupTableHost.cpp
│   ├── fft
│   │   ├── CuFFTConvolution.cpp
│   │   ├── CuFFTConvolution.cuh
│   │   ├── CuFFTConvolution_AccGradParameters.cu
│   │   ├── CuFFTConvolution_AccGradParameters.cuh
│   │   ├── CuFFTConvolution_UpdateGradInput.cu
│   │   ├── CuFFTConvolution_UpdateGradInput.cuh
│   │   ├── CuFFTConvolution_UpdateOutput.cu
│   │   ├── CuFFTConvolution_UpdateOutput.cuh
│   │   ├── CuFFTStrategy.h
│   │   ├── CuFFTWrapper.cu
│   │   ├── CuFFTWrapper.cuh
│   │   ├── FBFFTDevice.cu
│   │   ├── FBFFTHost.cpp
│   │   ├── FBFFTHost.h
│   │   ├── FFTIteratedConvolution.cu
│   │   ├── FFTWrapperLua.cpp
│   │   ├── SpatialConvolutionCuFFT.cpp
│   │   ├── SpatialConvolutionCuFFT.h
│   │   ├── SpatialConvolutionCuFFTHost.cpp
│   │   ├── SpatialConvolutionCuFFTTuner.cpp
│   │   ├── SpatialConvolutionCuFFTTuner.h
│   │   ├── Utils-inl.h
│   │   ├── Utils.cuh
│   │   └── Utils.h
│   ├── init.cu
│   └── util
│       ├── AsyncCopier.cpp
│       ├── AsyncCopier.h
│       ├── GlobalAsyncCopier.cpp
│       ├── GlobalAsyncCopier.h
│       ├── Misc.cpp
│       ├── Misc.h
│       ├── Transform.cu
│       └── Transform.cuh
└── test
    ├── BiasTest.cpp
    ├── ConvolutionTest.cpp
    ├── CuBLASTest.cpp
    ├── CudaTensorTest.cpp
    ├── CudaTensorTestKernels.cu
    ├── CudaTensorTestKernels.cuh
    ├── FFTTest.cpp
    ├── InputCentricConvolution_UpdateOutput.cu
    ├── InputCentricConvolution_UpdateOutput.cuh
    ├── ReferenceConvolutions.cpp
    ├── ReferenceConvolutions.h
    ├── TestUtils.cpp
    ├── TestUtils.h
    ├── benchmark_fft.lua
    ├── fb_test.lua
    ├── run_test_HSM_seed.sh
    ├── test.lua
    ├── test_BatchNormalization.lua
    ├── test_ClassHierarchicalNLLCriterion.lua
    ├── test_CuBLAS.lua
    ├── test_CuFFT.lua
    ├── test_DataParallel.lua
    ├── test_FBFFTTiling.lua
    ├── test_FFT.lua
    ├── test_FFTModule.lua
    ├── test_FeatureLPPooling.lua
    ├── test_GroupKMaxPooling.lua
    ├── test_HSM.lua
    ├── test_HSM_cuda.lua
    ├── test_HSM_speed.lua
    ├── test_KMaxPooling.lua
    ├── test_LinearNB.lua
    ├── test_LocallyConnected.lua
    ├── test_LookupTableGPU.lua
    ├── test_ModelParallel.lua
    ├── test_OneBitDataParallel.lua
    ├── test_OneBitQuantization.lua
    ├── test_OneBitSGD.lua
    ├── test_SequentialCriterion.lua
    ├── test_SparseModules.lua
    ├── test_SparseNLLCriterion.lua
    ├── test_SpatialConvolutionTuned.lua
    ├── test_TemporalConvolutionFB.lua
    ├── test_TemporalKMaxPooling.lua
    ├── test_Threshold.lua
    ├── test_Util.lua
    ├── test_VolumetricMaxPooling.lua
    ├── test_WeightedLookupTable.lua
    └── volumetric_average_pooling_test.lua

/.dokx:
--------------------------------------------------------------------------------
1 | return {
2 |   packageName = 'fbcunn',
3 |   tocLevel = 'class',
4 |   githubURL = 'facebook/fbcunn',
5 |   exclude = {'**/init.lua', '**/test/*', '**/nn_layers.lua'}
6 | }
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | TARGETS
2 | facebook
3 | build
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "cuda"]
2 | 	path = cuda
3 | 	url = https://github.com/facebook/fbcuda.git
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | - Find a machine with [Ubuntu 14.04+](http://www.ubuntu.com/) and an NVIDIA GPU with compute capability 3.5 or above
2 | 
3 | Then, install everything that is needed by using the instructions below:
4 | 
5 | Install CUDA
6 | =============
7 | ```bash
8 | sudo apt-get install build-essential
9 | ```
10 | 
11 | If you are using a Virtual Machine (like Amazon EC2 instances), install:
12 | ```bash
13 | sudo apt-get update
14 | sudo apt-get install linux-generic
15 | ```
16 | 
17 | Download the CUDA .deb file for Linux Ubuntu 14.04 64-bit from this page: https://developer.nvidia.com/cuda-downloads
18 | It will be a file with a name similar to this: cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
19 | Now, install it using:
20 | ```bash
21 | sudo dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
22 | sudo apt-get update
23 | sudo apt-get install cuda
24 | echo "export PATH=/usr/local/cuda/bin/:\$PATH; export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:\$LD_LIBRARY_PATH; " >>~/.bashrc && source ~/.bashrc
25 | ```
26 | 
27 | Restart your computer.
28 | 
29 | Install cuDNN
30 | - Go to https://developer.nvidia.com/cuDNN and use the Download button (you have to register and log in to download; there is no way around that).
31 | - Download cuDNN R3 for Linux.
You will download a file cudnn-7.0-linux-x64-v3.0-prod.tgz 32 | then use the commands: 33 | ```bash 34 | tar -xvf cudnn-7.0-linux-x64-v3.0-prod.tgz 35 | sudo cp cuda/include/*.h /usr/local/cuda/include 36 | sudo cp cuda/lib64/*.so* /usr/local/cuda/lib64 37 | ``` 38 | 39 | Install Torch Dependencies 40 | ========================== 41 | ```bash 42 | curl -sk https://raw.githubusercontent.com/torch/ezinstall/master/install-deps | bash -e 43 | ``` 44 | 45 | Install Torch in a local folder 46 | ================================ 47 | ```bash 48 | git clone https://github.com/torch/distro.git ~/torch --recursive 49 | cd ~/torch; ./install.sh 50 | ``` 51 | 52 | If you want to uninstall torch, you can use the command: `rm -rf ~/torch` 53 | 54 | Install Folly, fbthrift, thpp and fblualib 55 | ============================================ 56 | ```bash 57 | curl -sk https://raw.githubusercontent.com/soumith/fblualib/master/install_all.sh | bash -e 58 | ``` 59 | 60 | Install fbcunn 61 | ============== 62 | ```bash 63 | git clone https://github.com/torch/nn && ( cd nn && git checkout getParamsByDevice && luarocks make rocks/nn-scm-1.rockspec ) 64 | 65 | git clone https://github.com/facebook/fbtorch.git && ( cd fbtorch && luarocks make rocks/fbtorch-scm-1.rockspec ) 66 | 67 | git clone https://github.com/facebook/fbnn.git && ( cd fbnn && luarocks make rocks/fbnn-scm-1.rockspec ) 68 | 69 | # go get a coffee 70 | git clone https://github.com/facebook/fbcunn.git && ( cd fbcunn && luarocks make rocks/fbcunn-scm-1.rockspec ) 71 | ``` 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For fbcunn software 4 | 5 | Copyright (c) 2014, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the fbcunn software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `fbcunn` - Deep Learning CUDA Extensions from Facebook AI Research. 2 | 3 | ## What? 4 | [Deep Learning](http://en.wikipedia.org/wiki/Deep_learning) is a popular kid in machine learning these days. 5 | At [Facebook AI Research](http://research.facebook.com/ai/) we've been doing quite a bit of deep learning research. 6 | This repository contains our highly engineered deep learning modules for GPUs, to accelerate your own deep learning endeavors. 7 | It plugs into the [Torch-7](https://github.com/torch/torch7/wiki/Cheatsheet) framework and installs seamlessly via `luarocks`, 8 | and is fully compatible with torch's [nn](https://github.com/torch/nn) package. 9 | 10 | In summary, we're releasing fast nn modules for Convnets and neural networks in general: 11 | - Fast spatial convolution modules that use FFT to accelerate convolutions. [We wrote a paper about them](http://arxiv.org/abs/1412.7580) if you'd like to read more. 12 | - Fast Temporal convolutions that are 1.5x to 10x faster compared to Torch's cunn implementations. 
13 | - nn.DataParallel and nn.ModelParallel containers. Plug your model in them and see it accelerate over multiple GPUs.
14 | - Wrappers to use FFT/IFFT as nn modules.
15 | - Fast LookupTable that is used for Neural Language Models and word embeddings. Much faster than the one in torch/nn.
16 | - Hierarchical SoftMax module; classifying 1 million classes is now a practically viable strategy.
17 | - LP and Max Pooling over feature maps (usable for MaxOut).
18 | - More goodies. Full documentation and spec are here: https://facebook.github.io/fbcunn/fbcunn/
19 | 
20 | Examples:
21 | - Training an ImageNet-based classifier in Torch-7 using multiple GPUs (showcasing our FFT convolutions as well as our ModelParallel container)
22 | 
23 | ## Why?
24 | We know that science and technology progress faster when researchers exchange ideas and tools. Making significant progress in AI will take the participation of the entire research community, and we want to do what we can to make the field progress faster. That is why we love open science and open source. We publish our research with open access, very often on [arXiv](http://arxiv.org), on [our members' web sites](http://research.facebook.com/ai), and eventually on the [FAIR publications page](https://research.facebook.com/publications/ai/). And we share our code right here!
25 | 
26 | ## Who is this for?
27 | This will help you if you want to train large-scale deep learning systems (particularly convolutional nets) for image recognition, NLP, or other applications. This will help you particularly well if you already are a Torch user.
28 | 
29 | ## How to install them?
30 | You will find step-by-step, detailed installation instructions in **[INSTALL.md](INSTALL.md)**
31 | 
32 | We've worked hard to make the install as pain-free as possible. If you have an issue, use GitHub issues; we'll try our best to help.
33 | 
34 | ## How to use them?
35 | 
36 | - The DataParallel and ModelParallel modules are super-simple to use. The unit test doubles as both an example and a test. There is also a practical example of ModelParallel in examples/imagenet. If you want more examples, please do ask.
37 | ```lua
38 | m = nn.DataParallel():add(nn.SpatialConvolution(...)):add(nn.ReLU()) -- see, so simple
39 | ```
40 | 
41 | - Convolution modules are even simpler to use. They are fully API compatible with their [nn equivalents](https://github.com/torch/nn/blob/master/doc/convolution.md). For an example, look at examples/imagenet and at the short sketch at the end of this README.
42 | ```lua
43 | conv = nn.SpatialConvolutionCuFFT(...) -- fast spatial convolutions!
44 | conv = nn.TemporalConvolutionFB(...) -- fast temporal convolutions!
45 | ```
46 | 
47 | - LookupTable is named `nn.LookupTableGPU` and Hierarchical SoftMax `nn.HSM`; they are super-simple to use as well, so check the docs out:
48 | 
49 | https://facebook.github.io/fbcunn/fbcunn/
50 | 
51 | The unit tests in the test/ folder also double as examples! If you have a question, do ask.
52 | 
53 | 
54 | ## I want exact details of everything...
55 | 
56 | API docs, generated with [torch-dokx](https://github.com/deepmind/torch-dokx), are available at http://facebook.github.io/fbcunn/fbcunn/
57 | 
58 | Some of the unit tests need [fbnn](https://github.com/facebook/fbnn).
59 | 
60 | ## License
61 | 
62 | `fbcunn` is BSD-licensed. We also provide an additional patent
63 | grant.
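
### A tiny usage sketch

A minimal, hedged sketch of the temporal convolution module referenced above; it is a drop-in for `nn.TemporalConvolution` per the API-compatibility note, and all sizes below are made up for illustration:

```lua
require 'fbcunn'

-- 128 input frame features, 256 output features, kernel width 3.
local conv = nn.TemporalConvolutionFB(128, 256, 3):cuda()

-- A batch of 16 sequences, each with 100 frames of 128 features.
local input = torch.CudaTensor(16, 100, 128):normal()

-- Same output-shape rule as nn.TemporalConvolution:
-- (100 - 3) / 1 + 1 = 98 output frames.
local output = conv:forward(input) -- 16 x 98 x 256
```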
64 | 
--------------------------------------------------------------------------------
/cmake/FindFolly.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | #
8 | # - Try to find folly
9 | # This will define
10 | # FOLLY_FOUND
11 | # FOLLY_INCLUDE_DIR
12 | # FOLLY_LIBRARIES
13 | 
14 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR)
15 | 
16 | INCLUDE(FindPackageHandleStandardArgs)
17 | 
18 | FIND_LIBRARY(FOLLY_LIBRARY folly)
19 | FIND_PATH(FOLLY_INCLUDE_DIR "folly/String.h")
20 | 
21 | SET(FOLLY_LIBRARIES ${FOLLY_LIBRARY})
22 | 
23 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(Folly
24 |   REQUIRED_VARS FOLLY_INCLUDE_DIR FOLLY_LIBRARIES)
25 | 
--------------------------------------------------------------------------------
/cmake/FindGlog.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | #
8 | # GLOG_FOUND
9 | # GLOG_INCLUDE_DIR
10 | # GLOG_LIBRARIES
11 | 
12 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR)
13 | 
14 | INCLUDE(FindPackageHandleStandardArgs)
15 | 
16 | FIND_LIBRARY(GLOG_LIBRARY glog)
17 | FIND_PATH(GLOG_INCLUDE_DIR "glog/logging.h")
18 | 
19 | SET(GLOG_LIBRARIES ${GLOG_LIBRARY})
20 | 
21 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(
22 |   Glog
23 |   REQUIRED_VARS GLOG_INCLUDE_DIR GLOG_LIBRARY)
24 | 
--------------------------------------------------------------------------------
/cmake/FindTHPP.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | #
8 | # - Try to find thpp
9 | # This will define
10 | # THPP_FOUND
11 | # THPP_INCLUDE_DIR
12 | # THPP_LIBRARIES
13 | 
14 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR)
15 | 
16 | INCLUDE(FindPackageHandleStandardArgs)
17 | 
18 | FIND_LIBRARY(THPP_LIBRARY thpp)
19 | FIND_PATH(THPP_INCLUDE_DIR "thpp/Tensor.h")
20 | 
21 | SET(THPP_LIBRARIES ${THPP_LIBRARY})
22 | 
23 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(THPP
24 |   REQUIRED_VARS THPP_INCLUDE_DIR THPP_LIBRARIES)
25 | 
--------------------------------------------------------------------------------
/cmake/MultiLevelIncludes.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | 
8 | # Some projects are installed individually as part of a larger tree, but
9 | # include paths always reference the full include path in the tree.
This 10 | # module makes it easier to do so. 11 | # 12 | # Suppose you have a source tree fblualib/thrift/submodule, which is built at 13 | # the submodule level (so you have fblualib/thrift/submodule/CMakeLists.txt) 14 | # Files inside submodule include each other (and files from other sibling 15 | # directories) with the full path: 16 | # 17 | # #include 18 | # #include 19 | # #include 20 | # #include 21 | # 22 | # MLI_SET_DEPTH(2) at the root of your CMakeLists.txt would set "../.." 23 | # as the include path (so fblualib is a subdirectory of that), making 24 | # the includes work. Also, it will set MLI_INCLUDE_OUTPUT_DIR and 25 | # MLI_INCLUDE_RELATIVE_OUTPUT_DIR to directories inside the build tree 26 | # where any generators should output header files so they can be found 27 | # via #include. (we recreate the lowest 2 levels of the hierarchy underneath 28 | # ${CMAKE_BINARY_DIR}) 29 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR) 30 | 31 | FUNCTION(MLI_SET_DEPTH level) 32 | SET(dirs) 33 | SET(dir ${CMAKE_SOURCE_DIR}) 34 | SET(relinc) 35 | FOREACH(i RANGE 1 ${level}) 36 | GET_FILENAME_COMPONENT(bn ${dir} NAME) 37 | GET_FILENAME_COMPONENT(dir ${dir} PATH) 38 | LIST(APPEND dirs ${bn}) 39 | SET(relinc "${relinc}/..") 40 | ENDFOREACH() 41 | LIST(REVERSE dirs) 42 | STRING(REPLACE ";" "/" relpath "${dirs}") 43 | SET(MLI_INCLUDE_OUTPUT_DIR 44 | "${CMAKE_BINARY_DIR}/${relpath}" 45 | PARENT_SCOPE) 46 | SET(MLI_INCLUDE_RELATIVE_OUTPUT_DIR 47 | "${relpath}" 48 | PARENT_SCOPE) 49 | INCLUDE_DIRECTORIES( 50 | "${CMAKE_SOURCE_DIR}/${relinc}" 51 | "${CMAKE_BINARY_DIR}") 52 | ENDFUNCTION() 53 | -------------------------------------------------------------------------------- /docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set -e 4 | DIR=/tmp/fbcunn 5 | rm -rf $DIR && mkdir -p $DIR 6 | CURDIR=$(pwd) 7 | echo $CURDIR 8 | ( 9 | cd $DIR 10 | dokx-build-package-docs -o . $CURDIR 11 | # Fix relative links in HTML to point to the CDN 12 | replace "../_highlight/highlight.pack.js" "//cdnjs.cloudflare.com/ajax/libs/highlight.js/8.4/highlight.min.js" 13 | replace "../_highlight/styles/github.css" "//cdnjs.cloudflare.com/ajax/libs/highlight.js/8.4/styles/github.min.css" 14 | 15 | git init 16 | git checkout -b gh-pages 17 | git add . 18 | git commit -m "Documentation" 19 | git remote add origin git@github.com:facebook/fbcunn.git 20 | git push --set-upstream origin gh-pages -f 21 | ) 22 | -------------------------------------------------------------------------------- /fbcunn/CuBLASWrapper.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 
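-- Added usage sketch (not part of the original file; the shapes and the
-- dimension tables below are illustrative only). `matmult` treats the
-- leading `iterDims` dimensions as sequential outer loops and the next
-- `batchDims` dimensions as a single batched GEMM call:
--
--   local blas = nn.CuBLASWrapper()
--   local A = torch.CudaTensor(16, 128, 64) -- 16 matrices of size 128 x 64
--   local B = torch.CudaTensor(16, 64, 32)
--   local C = torch.CudaTensor(16, 128, 32)
--   blas:matmult(A, B, C, {}, {1}) -- no iterated dims, one batched dim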
2 | 3 | local CuBLASWrapper = torch.class('nn.CuBLASWrapper') 4 | 5 | function CuBLASWrapper:__init(timed) 6 | self.iterDims = 0 7 | self.batchDims = 0 8 | self.handles = 0 9 | self.streams = 0 10 | self.timed = timed or false 11 | end 12 | 13 | function CuBLASWrapper:matmult( 14 | A, B, C, iterDims, batchDims, transA, transB, scale) 15 | self.transA = transA or 'n' 16 | self.transB = transB or 'n' 17 | self.iterDims = table.getn(iterDims) or 0 18 | self.batchDims = table.getn(batchDims) or 0 19 | self.scale = scale or 1.0 20 | A.nn.CuBLASWrapper_matmult(self, A, B, C) 21 | end 22 | 23 | function CuBLASWrapper:matmultComplex( 24 | A, B, C, iterDims, batchDims, transA, transB, scale) 25 | self.transA = transA or 'n' 26 | self.transB = transB or 'n' 27 | self.iterDims = table.getn(iterDims) or 0 28 | self.batchDims = table.getn(batchDims) or 0 29 | self.scale = scale or 1.0 30 | A.nn.CuBLASWrapper_matmultComplex(self, A, B, C) 31 | end 32 | 33 | function CuBLASWrapper:transpose( 34 | A, B, separator, transposeMetaData, handle, stream) 35 | self.separator = separator or 0 36 | self.transposeMetaData = transposeMetaData or false 37 | self.handle = handle or 1 -- always handle 1 by default 38 | self.stream = stream or 0 39 | A.nn.CuBLASWrapper_transpose(self, A, B) 40 | end 41 | 42 | function CuBLASWrapper:transposeComplex( 43 | A, B, separator, transposeMetaData, handle, stream) 44 | self.separator = separator or 0 45 | self.transposeMetaData = transposeMetaData or false 46 | self.handle = handle or 1 -- always handle 1 by default 47 | self.stream = stream or 0 48 | A.nn.CuBLASWrapper_transposeComplex(self, A, B) 49 | end 50 | -------------------------------------------------------------------------------- /fbcunn/FFTCDefs.lua: -------------------------------------------------------------------------------- 1 | local ffi = require 'ffi' 2 | 3 | ffi.cdef[[ 4 | void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); 5 | void accGradParametersBiasFFI( 6 | THCState*, THCudaTensor*, THCudaTensor*, float scale); 7 | void transposeMMFFI(THCState*, 8 | THCudaTensor* tA, 9 | THCudaTensor* tB, 10 | THCudaTensor* tC, 11 | float invNorm, 12 | bool conjugateTransposeA, 13 | bool conjugateTransposeB, 14 | bool accumulate); 15 | typedef struct { 16 | static const int FFT_UpdateOutput = 0; 17 | static const int FFT_UpdateGradInput = 1; 18 | static const int FFT_AccGradParameters = 2; 19 | int pass; 20 | } FFTConvolutionPassFFI; 21 | typedef struct { 22 | THCudaTensor* tensor; 23 | int padL; 24 | int padU; 25 | } TiledDeviceTensorFFI; 26 | void convolveIteratedFFI(THCState* state, 27 | TiledDeviceTensorFFI* input, 28 | THCudaTensor* weight, 29 | TiledDeviceTensorFFI* output, 30 | int numTiles, 31 | int fftSize, 32 | FFTConvolutionPassFFI pass, 33 | float scale); 34 | ]] 35 | -------------------------------------------------------------------------------- /fbcunn/FeatureLPPooling.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 
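-- Added note (not part of the original file): LP pooling over the feature
-- dimension computes, for each window of `width` consecutive features
-- x_1 .. x_w taken every `stride` features,
--
--   out = (x_1^p + x_2^p + ... + x_w^p)^(1/p)
--
-- where p is the `power` constructor argument; as p grows this approaches
-- max pooling over feature maps (the MaxOut-style use from the README).
-- A hedged usage sketch, with illustrative sizes:
--
--   local pool = nn.FeatureLPPooling(4, 2, 2.0, true):cuda()
--   local input = torch.CudaTensor(16, 64, 32) -- batch x features x opt dim
--   local output = pool:forward(input) -- 16 x 31 x 32, since (64-4)/2+1 = 31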
2 | 3 | require 'cutorch' 4 | require 'nn' 5 | 6 | local FeatureLPPooling, parent = 7 | torch.class('nn.FeatureLPPooling', 'nn.Module') 8 | 9 | --[[ 10 | Possible inputs that we handle: 11 | 12 | #### `batch_mode = false` 13 | The dimensionality of the input chooses between the following modes: 14 | 15 | ``` 16 | [feature dim] 17 | [feature dim][opt dim 1] 18 | [feature dim][opt dim 1][opt dim 2] 19 | ``` 20 | 21 | #### `batch_mode = true` 22 | The dimensionality of the input chooses between the following modes: 23 | ``` 24 | [batch dim][feature dim] 25 | [batch dim][feature dim][opt dim 1] 26 | [batch dim][feature dim][opt dim 1][opt dim 2] 27 | ``` 28 | 29 | The output has the same number of dimensions as the input, except the feature 30 | dimension size is reduced to ((`input` - `width`) / `stride`) + 1 31 | ]] 32 | function FeatureLPPooling:__init(width, stride, power, batch_mode) 33 | parent.__init(self) 34 | 35 | if (width < 2 or width > 16) then 36 | error('width must be within 2 to 16 at the moment') 37 | end 38 | 39 | if (stride < 1 or stride > 4) then 40 | error('stride must be within 1 to 4 at the moment') 41 | end 42 | 43 | self.width = width 44 | self.stride = stride 45 | self.power = power 46 | self.batch_mode = batch_mode 47 | 48 | self.output = torch.Tensor() 49 | self.gradInput = torch.Tensor() 50 | end 51 | 52 | function FeatureLPPooling:updateOutput(input) 53 | if torch.type(input) == 'torch.CudaTensor' then 54 | input.nn.FeatureLPPooling_updateOutput(self, input) 55 | else 56 | error('CUDA only supported at the moment') 57 | end 58 | return self.output 59 | end 60 | 61 | function FeatureLPPooling:updateGradInput(input, gradOutput) 62 | if torch.type(input) == 'torch.CudaTensor' then 63 | input.nn.FeatureLPPooling_updateGradInput(self, input, gradOutput) 64 | else 65 | error('CUDA only supported at the moment') 66 | end 67 | return self.gradInput 68 | end 69 | -------------------------------------------------------------------------------- /fbcunn/HalfPrecision.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | local libhalfprec = require('libhalfprec') 4 | 5 | local function truncate(floats) 6 | return libhalfprec.toFloatCUDA(libhalfprec.toHalfCUDA(floats)) 7 | end 8 | 9 | local HalfPrecision, parent = 10 | torch.class('nn.HalfPrecision', 'nn.Module') 11 | 12 | function HalfPrecision:__init() 13 | parent.__init(self) 14 | self.output = torch.CudaTensor() 15 | self.gradInput = torch.CudaTensor() 16 | end 17 | 18 | function HalfPrecision:updateOutput(input) 19 | input = input:contiguous():cuda() 20 | self.output = truncate(input) 21 | self.output:resizeAs(input) 22 | return self.output 23 | end 24 | 25 | function HalfPrecision:updateGradInput(input, gradOutput) 26 | self.gradInput = truncate(gradOutput) 27 | self.gradInput:resizeAs(gradOutput) 28 | return self.gradInput 29 | end 30 | -------------------------------------------------------------------------------- /fbcunn/LookupTableGPU.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'cunn' 5 | 6 | --[[ 7 | Fast lookup table, supporting both CPU and GPU modes. 
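An added sketch (sizes are illustrative, not from the original docs): the
weight matrix is `nInput` x `nOutput`, one embedding row per index, and
`forward` takes a 1-D or 2-D tensor of indices:

    local lt = nn.LookupTableGPU(10000, 128):cuda() -- 10000 rows, dim 128
    local ids = torch.Tensor(32, 20):random(10000):cuda()
    local out = lt:forward(ids) -- 32 x 20 x 128 (featuresInDim2 = false)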
8 | ]]
9 | local LookupTableGPU, parent = torch.class('nn.LookupTableGPU', 'nn.Module')
10 | 
11 | 
12 | --[[
13 | If `featuresInDim2` is `true`, an input of dimension `batchSize` ${\times}$ `N` will produce an output of size `batchSize` ${\times}$ `nOutput`
14 | ${\times}$ `N`. If it is set to `false` (default) it will produce an output
15 | of size `batchSize` ${\times}$ `N` ${\times}$ `nOutput`.
16 | ]]
17 | function LookupTableGPU:__init(nInput, nOutput, featuresInDim2)
18 |    parent.__init(self)
19 |    self.nInput = nInput
20 |    self.nOutput = nOutput
21 |    self.featuresInDim2 = featuresInDim2 or false
22 |    -- Careful: weight is transposed from nn.Linear
23 |    self.weight = torch.Tensor(nInput, nOutput)
24 |    self.gradWeight = torch.Tensor(nInput, nOutput)
25 |    self.output = torch.Tensor()
26 | 
27 |    self:reset()
28 | end
29 | 
30 | function LookupTableGPU:reset(stdv)
31 |    stdv = stdv or 1
32 |    self.weight:normal(0, stdv)
33 | end
34 | 
35 | function LookupTableGPU:parameters()
36 |    return {self.weight}, {self.gradWeight}
37 | end
38 | 
39 | -- input should be a 1d (size N) or 2d (size batchSize x N)
40 | -- tensor of byte or long on CPU, cudaTensor on GPU.
41 | -- It contains the indices of the lookup.
42 | function LookupTableGPU:updateOutput(input)
43 |    if input:dim() == 2 then
44 |       if self.featuresInDim2 then
45 |          self.output:resize(input:size(1), self.nOutput, input:size(2))
46 |       else
47 |          self.output:resize(input:size(1), input:size(2), self.nOutput)
48 |       end
49 |    else
50 |       self.output:resize(input:size(1), self.nOutput)
51 |    end
52 | 
53 |    if input:type() == 'torch.CudaTensor' then
54 |       input.nn.LookupTableGPU_updateOutput(input, self.weight, self.output,
55 |                                            self.featuresInDim2)
56 |    else
57 |       if input:dim() == 2 then
58 |          -- batch mode
59 |          local this_output
60 |          for batch = 1, input:size(1) do
61 |             for i = 1, input:size(2) do
62 |                if self.featuresInDim2 then
63 |                   this_output = self.output[{batch, {}, i}]
64 |                else
65 |                   this_output = self.output[{batch, i}]
66 |                end
67 |                if self.unk_index and (input[batch][i] == self.unk_index) then
68 |                   this_output:zero()
69 |                else
70 |                   this_output:copy(self.weight[input[batch][i]])
71 |                end
72 |             end
73 |          end
74 |       else
75 |          -- non-batch mode
76 |          if input:size(1) == 1 then
77 |             if self.unk_index and (input[1] == self.unk_index) then
78 |                self.zeros = self.zeros or torch.zeros(1, self.nOutput)
79 |                self.output = self.zeros
80 |             else
81 |                self.output = self.weight[input[1]]:reshape(1, self.nOutput)
82 |             end
83 |          else
84 |             self.output:resize(input:size(1), self.nOutput)
85 |             for i = 1,input:size(1) do
86 |                if self.unk_index and (input[i] == self.unk_index) then
87 |                   self.output[i]:zero()
88 |                else
89 |                   self.output[i]:copy(self.weight[input[i]])
90 |                end
91 |             end
92 |          end
93 |       end
94 |    end
95 | 
96 |    return self.output
97 | end
98 | 
99 | function LookupTableGPU:updateGradInput(input, gradOutput)
100 |    --print("Should not be used") --TODO
101 | end
102 | 
103 | function LookupTableGPU:accGradParameters(input, gradOutput, scale)
104 |    scale = scale or 1
105 |    if input:type() == 'torch.CudaTensor' then
106 |       input.nn.LookupTableGPU_accGradParameters(input, gradOutput,
107 |                                                 self.gradWeight, scale,
108 |                                                 self.featuresInDim2)
109 |    else
110 |       if input:dim() == 2 then
111 |          -- batch mode
112 |          for batch = 1, input:size(1) do
113 |             for i = 1, input:size(2) do
114 |                if (self.unk_index == nil) or
115 |                (input[batch][i] ~= self.unk_index) then
116 |                   if self.featuresInDim2 then
117 |                      self.gradWeight[input[batch][i]]
118 |                         :add(scale, gradOutput[{batch, {}, i}])
119 | 
else 120 | self.gradWeight[input[batch][i]] 121 | :add(scale, gradOutput[batch][i]) 122 | end 123 | end 124 | end 125 | end 126 | else 127 | -- non-batch mode 128 | for i = 1,input:size(1) do 129 | if (self.unk_index == nil) or 130 | (input[i] ~= self.unk_index) then 131 | self.gradWeight[input[i] ]:add(scale, gradOutput[i]) 132 | end 133 | end 134 | end 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /fbcunn/OneBitDataParallel.lua: -------------------------------------------------------------------------------- 1 | require('cutorch') 2 | require 'fbnn' 3 | local util = require('fb.util') 4 | local OBSGD = require('fbcunn.OneBitSGD') 5 | local withDevice = cutorch.withDevice 6 | 7 | --[[ OneBitDataParallel implements the "1-Bit Stochastic Gradient 8 | Descent and Application to Data-Parallel Distributed Training of 9 | Speech DNNs" paper of Frank Seide, Hao Fu, Jasha Droppo, Gang Li, and 10 | Dong Yu. 11 | 12 | The implementation is similar to a vanilla DataParallel module, except we replace the averaging gradient step with a quantize-copy-merge-broadcast procedure. 13 | 14 | 15 | ]] 16 | local OneBitDataParallel, parent = 17 | torch.class('nn.OneBitDataParallel', 'nn.DataParallel') 18 | 19 | function OneBitDataParallel:__init(dimension, config) 20 | parent.__init(self, dimension) 21 | self.config = {} 22 | self.config.min_elements = config.min_elements or 20 23 | self.config.adagrad_learning_rate = config.learningRate or 1.0 24 | self.config.momentum_rate = config.momentum or 0 25 | 26 | -- Aggregators for each [row][gradient] 27 | self._aggregators = util.defaultdict(function() return end) 28 | self.home_device = 1 29 | end 30 | 31 | 32 | function OneBitDataParallel:_should_run_one_bit_sgd(gradients) 33 | -- TODO(tulloch) - flesh this test out 34 | assert(gradients) 35 | assert(#gradients >= 1) 36 | return gradients[1]:nDimension() == 2 and 37 | gradients[1]:nElement() > self.config.min_elements 38 | end 39 | 40 | function OneBitDataParallel:_combine_gradients(row_idx, gradients) 41 | assert(#gradients >= 1) 42 | if not self:_should_run_one_bit_sgd(gradients) then 43 | return parent._combine_gradients(self, row_idx, gradients) 44 | end 45 | 46 | if not self._aggregators[row_idx] then 47 | local g = gradients[1] 48 | self._aggregators[row_idx] = OBSGD.OneBitAggregator( 49 | self.config, 50 | function() return torch.Tensor():typeAs(g):resizeAs(g):zero() end, 51 | function(dest, source) return self:gpuSend(dest, source) end, 52 | self.home_device 53 | ) 54 | self.home_device = self.home_device % cutorch.getDeviceCount() + 1 55 | end 56 | self._aggregators[row_idx]:run(gradients) 57 | end 58 | -------------------------------------------------------------------------------- /fbcunn/OneBitQuantization.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require 'cutorch' 4 | require 'nn' 5 | 6 | --[[ 7 | CUDA implementation of the quantize/unquantize methods used by `nn.OneBitDataParallel`. 
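An added sketch of the scheme (following the Seide et al. 1-bit SGD paper
cited in OneBitDataParallel; the variable names below are illustrative):
`quantize` reduces every entry of a 2-D gradient to a single sign bit, and
`dequantize` reconstructs positive entries from the stored positive
averages (`avg_pos`) and negative entries from `avg_neg`. The residual is
kept in `quantization_error` and folded into later calls, so quantization
error is fed back rather than lost:

    local q = nn.OneBitQuantization()
    local quantized, err, avg_pos, avg_neg = q:quantize(grad) -- grad is 2-D
    local approx = q:dequantize(quantized, avg_pos, avg_neg, grad:size(2))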
8 | ]] 9 | local OneBitQuantization = torch.class('nn.OneBitQuantization') 10 | 11 | function OneBitQuantization:__init() 12 | self.quantized = torch.CudaTensor() 13 | self.non_quantized = torch.CudaTensor() 14 | self.quantization_error = nil 15 | self.avg_pos = torch.CudaTensor() 16 | self.avg_neg = torch.CudaTensor() 17 | end 18 | 19 | function OneBitQuantization:reset() 20 | self.quantization_error = nil 21 | end 22 | 23 | function OneBitQuantization:quantize(non_quantized_input) 24 | -- When starting a new quantization chain, we start with zero error 25 | if not self.quantization_error then 26 | self.quantization_error = non_quantized_input:clone() 27 | self.quantization_error:zero() 28 | end 29 | 30 | non_quantized_input.nn.OneBitQuantization_quantize(self, non_quantized_input) 31 | return self.quantized, self.quantization_error, self.avg_pos, self.avg_neg 32 | end 33 | 34 | function OneBitQuantization:dequantize(quantized_input, 35 | avg_pos, avg_neg, num_orig_cols) 36 | quantized_input.nn.OneBitQuantization_dequantize( 37 | self, quantized_input, avg_pos, avg_neg, num_orig_cols) 38 | return self.non_quantized 39 | end 40 | -------------------------------------------------------------------------------- /fbcunn/TemporalConvolutionFB.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require 'nn' 4 | 5 | local TemporalConvolutionFB, parent = 6 | torch.class('nn.TemporalConvolutionFB', 'nn.Module') 7 | 8 | function TemporalConvolutionFB:__init(inputFrameSize, outputFrameSize, kW, dW) 9 | parent.__init(self) 10 | 11 | dW = dW or 1 12 | 13 | self.inputFrameSize = inputFrameSize 14 | self.outputFrameSize = outputFrameSize 15 | self.kW = kW 16 | self.dW = dW 17 | 18 | self.weight = torch.Tensor(outputFrameSize, kW, inputFrameSize) 19 | self.bias = torch.Tensor(outputFrameSize) 20 | self.gradWeight = torch.Tensor(outputFrameSize, kW, inputFrameSize) 21 | self.gradBias = torch.Tensor(outputFrameSize) 22 | 23 | self:reset() 24 | end 25 | 26 | function TemporalConvolutionFB:reset(stdv) 27 | if stdv then 28 | stdv = stdv * math.sqrt(3) 29 | else 30 | stdv = 1/math.sqrt(self.kW*self.inputFrameSize) 31 | end 32 | if nn.oldSeed then 33 | self.weight:apply(function() 34 | return torch.uniform(-stdv, stdv) 35 | end) 36 | self.bias:apply(function() 37 | return torch.uniform(-stdv, stdv) 38 | end) 39 | else 40 | self.weight:uniform(-stdv, stdv) 41 | self.bias:uniform(-stdv, stdv) 42 | end 43 | end 44 | 45 | function TemporalConvolutionFB:updateOutput(input) 46 | input.nn.TemporalConvolutionFB_updateOutput(self, input) 47 | return self.output 48 | end 49 | 50 | function TemporalConvolutionFB:updateGradInput(input, gradOutput) 51 | if self.gradInput then 52 | return input.nn.TemporalConvolutionFB_updateGradInput( 53 | self, input, gradOutput) 54 | end 55 | end 56 | 57 | function TemporalConvolutionFB:accGradParameters(input, gradOutput, scale) 58 | scale = scale or 1 59 | input.nn.TemporalConvolutionFB_accGradParameters( 60 | self, input, gradOutput, scale) 61 | end 62 | 63 | function TemporalConvolutionFB:sharedAccUpdateGradParameters(input, gradOutput, lr) 64 | -- we do not need to accumulate parameters when sharing: 65 | self:defaultAccUpdateGradParameters(input, gradOutput, lr) 66 | end 67 | -------------------------------------------------------------------------------- /fbcunn/TemporalKMaxPooling.lua: -------------------------------------------------------------------------------- 1 | -- 
Copyright 2004-present Facebook. All Rights Reserved.
2 | 
3 | -- TemporalKMaxPooling
4 | -- Input : (bsize x) width x height
5 | -- Output : (bsize x) k_out x height
6 | -- with k_out = max(k_out_prop, inputSeqLen)
7 | -- where k_out_prop = max(k, ceil(k_dynamic*inputSeqLen))
8 | 
9 | require 'cutorch'
10 | require 'nn'
11 | 
12 | local TemporalKMaxPooling, parent =
13 |    torch.class('nn.TemporalKMaxPooling', 'nn.Module')
14 | 
15 | function TemporalKMaxPooling:__init(k, k_dynamic)
16 |    parent.__init(self)
17 | 
18 |    self.k = k
19 |    if k_dynamic then
20 |       assert(k_dynamic <= 1 and k_dynamic >= 0,
21 |              'k_dynamic must be between 0 and 1')
22 |    end
23 |    self.k_dynamic = k_dynamic or -1
24 | 
25 |    -- k_dynamic is an optional scalar parameter between 0 and 1
26 |    -- that makes k a fraction of the input sequence size.
27 | 
28 |    -- To follow Kalchbrenner et al.'s architecture on Dynamic k-Max Pooling:
29 |    -- Use (k = k_top, kDynamic = (L - l)/L), with
30 |    -- L : total number of conv layers,
31 |    -- l : current convolutional layer to which the pooling is applied,
32 |    -- k_top : fixed pooling parameter for the topmost convolutional layer.
33 | 
34 |    self.output = torch.CudaTensor()
35 |    self.gradInput = torch.CudaTensor()
36 |    self.indices = torch.CudaTensor()
37 | end
38 | 
39 | function TemporalKMaxPooling:updateOutput(input)
40 |    input = input:contiguous()
41 |    input.nn.TemporalKMaxPooling_updateOutput(self, input)
42 |    return self.output
43 | end
44 | 
45 | function TemporalKMaxPooling:updateGradInput(input, gradOutput)
46 |    input = input:contiguous()
47 |    gradOutput = gradOutput:contiguous()
48 | 
49 |    input.nn.TemporalKMaxPooling_updateGradInput(self, input, gradOutput)
50 |    return self.gradInput
51 | end
52 | 
--------------------------------------------------------------------------------
/fbcunn/init.lua:
--------------------------------------------------------------------------------
1 | require 'nn'
2 | require 'fbnn'
3 | require 'cunn'
4 | require 'libfbcunn'
5 | require 'fbcunn.cuda_ext'
6 | 
7 | include('AbstractParallel.lua')
8 | include('BatchNormalization.lua')
9 | include('CuBLASWrapper.lua')
10 | include('DataParallel.lua')
11 | include('FeatureLPPooling.lua')
12 | include('FFTWrapper.lua')
13 | include('HalfPrecision.lua')
14 | include('LookupTableGPU.lua')
15 | include('ModelParallel.lua')
16 | include('OneBitDataParallel.lua')
17 | include('OneBitQuantization.lua')
18 | include('OneBitSGD.lua')
19 | include('FFTCDefs.lua')
20 | include('SpatialBatchNormalization.lua')
21 | -- include('SpatialConvolutionFFT.lua')
22 | -- include('SpatialConvolutionCuFFT.lua')
23 | -- include('SpatialConvolutionFBFFT.lua')
24 | -- include('SpatialConvolutionFBFFTGemm.lua')
25 | -- include('SpatialConvolutionFFTTiled.lua')
26 | -- include('SpatialConvolutionFFTTiledSync.lua')
27 | -- include('SpatialConvolutionFFTTiledAsync.lua')
28 | -- include('SpatialConvolutionFFTTiledIterated.lua')
29 | -- include('SpatialConvolution.lua')
30 | include('TemporalConvolutionFB.lua')
31 | include('TemporalKMaxPooling.lua')
32 | 
33 | -- Monkey-patch module to include getParametersByDevice
34 | -- Get the params of the module separated by device.
35 | -- Returns the pair:
36 | -- {0 = flat tensor containing CPU weights,
37 | -- 1 = flat tensor containing weights from device 1,
38 | -- ...
39 | -- N = ... containing weights from device N},
40 | -- {0 = flat tensor containing CPU grads,
41 | -- 1 = ...
containing grads from device 1, ...} 42 | function nn.Module:getParametersByDevice() 43 | local n_dev = cutorch.getDeviceCount() 44 | local d2weights = {} -- Device => { tensor1, tensor2, ..., tensorN } 45 | local d2grads = {} -- Device => { tensor1, tensor2, ..., tensorN } 46 | 47 | local function tensor_to_dev(tensor) 48 | local tnm = torch.typename(tensor) 49 | if tnm == 'torch.CudaTensor' then 50 | return tensor:getDevice() 51 | end 52 | return 0 53 | end 54 | 55 | local params, grads = self:parameters() 56 | assert(#params == #grads) 57 | -- Herd each tensor into appropriate row of weights,grads 58 | for i = 1,#params do 59 | local p = params[i] 60 | local g = grads[i] 61 | local d = tensor_to_dev(p) 62 | if d ~= tensor_to_dev(g) then 63 | error(("Improbable module; params,grads on devices %d,%d"): 64 | format(d, tensor_to_dev(g))) 65 | end 66 | if not d2weights[d] then 67 | d2weights[d] = {} 68 | d2grads[d] = {} 69 | end 70 | table.insert(d2weights[d], p) 71 | table.insert(d2grads[d], g) 72 | end 73 | 74 | local function gather(dev, params, grads) 75 | if not params or #params == 0 then 76 | return nil 77 | end 78 | if dev == 0 then 79 | return nn.Module.flatten(params), nn.Module.flatten(grads) 80 | end 81 | return cutorch.withDevice(dev, 82 | function() return nn.Module.flatten(params), 83 | nn.Module.flatten(grads) 84 | end) 85 | end 86 | 87 | local ret_params = { } 88 | local ret_grads = { } 89 | for d = 0,n_dev do -- sic 90 | ret_params[d], ret_grads[d] = gather(d, d2weights[d], d2grads[d]) 91 | end 92 | 93 | return ret_params, ret_grads 94 | end 95 | -------------------------------------------------------------------------------- /rocks/fbcunn-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "fbcunn" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/facebook/fbcunn.git", 6 | } 7 | 8 | description = { 9 | summary = "Facebook's extensions to torch/cunn. ", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/facebook/fbcunn", 13 | license = "BSD" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | "nn >= 1.0", 19 | "cutorch >= 1.0", 20 | "multikey", 21 | "fbnn", 22 | "fbtorch" 23 | } 24 | 25 | build = { 26 | type = "command", 27 | build_command = [[ 28 | git submodule init 29 | git submodule update 30 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" 31 | ]], 32 | install_command = "cd build && $(MAKE) install" 33 | } 34 | -------------------------------------------------------------------------------- /src/BLASParameters.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "BLASParameters.h" 4 | 5 | using namespace std; 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | 9 | std::ostream& operator<<(ostream& os, const BLASParameters& params) { 10 | os << " ITD = " << params.iterDims; 11 | os << " BTD = " << params.batchDims; 12 | os << " RIX = " << params.resourceIndex; 13 | os << " CPLX = " << params.asComplex; 14 | os << " batchStepA = " << params.batchStepA; 15 | os << " batchStepB = " << params.batchStepB; 16 | os << " batchStepC = " << params.batchStepC; 17 | os << " #handles = " << params.handles.size(); 18 | os << " #streams = " << params.streams.size(); 19 | os << " transposeA = " << ((params.transposeA == CUBLAS_OP_T) ? 
"t " : 20 | (params.transposeA == CUBLAS_OP_C) ? "c " : "n"); 21 | os << " transposeB = " << ((params.transposeB == CUBLAS_OP_T) ? "t " : 22 | (params.transposeB == CUBLAS_OP_C) ? "c " : "n"); 23 | os << " scale = (" << params.scaleRe << ", " << params.scaleIm << ")"; 24 | return os; 25 | } 26 | 27 | }}} 28 | -------------------------------------------------------------------------------- /src/BLASParameters.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | // Column major: columns are contiguous in memory 11 | // Cmxn <- Amxk * Bkxn becomes 12 | // C'nxm <- A'kxm * B'nxk and so C'nxm <- B'nxk * A'kxm 13 | struct BLASParameters { 14 | BLASParameters() : 15 | iterDims(0), 16 | batchDims(0), 17 | resourceIndex(0), 18 | batchStepA(1), 19 | batchStepB(1), 20 | batchStepC(1), 21 | scaleRe(1.0f), 22 | scaleIm(0.0f), 23 | asComplex(false), 24 | accumulate(false), 25 | handles(), 26 | streams(), 27 | transposeA(CUBLAS_OP_N), 28 | transposeB(CUBLAS_OP_N) {} 29 | 30 | // Outermost dimensions to be treated as individual iterations in enclosing 31 | // for loops. 32 | BLASParameters& withIterDims(int i) { 33 | iterDims = i; 34 | return *this; 35 | } 36 | 37 | // After iterDims, remaining outermost dimensions to be treated as batch 38 | // dimensions, for instance, in a gemmbatched call. 39 | BLASParameters& withBatchDims(int i) { 40 | batchDims = i; 41 | return *this; 42 | } 43 | 44 | // Force running on a particular handle / stream index in the handle / 45 | // stream vectors. The actual handle / stream we will end up running on is 46 | // recovered by modulo indexing into the vector, default handle / stream if 47 | // the vectors are empty. 48 | BLASParameters& withResourceIndex(int i) { 49 | resourceIndex = i; 50 | return *this; 51 | } 52 | 53 | // Distance between two batches of A, used in batched mode, in case we want 54 | // to compute one entry every k. Step of zerom means the same matrix A will 55 | // be read over and over again. 56 | BLASParameters& withBatchStepA(int i) { 57 | batchStepA = i; 58 | return *this; 59 | } 60 | 61 | // Distance between two batches of B, used in batched mode, in case we want 62 | // to compute one entry every k. Step of zerom means the same matrix B will 63 | // be read over and over again. 64 | BLASParameters& withBatchStepB(int i) { 65 | batchStepB = i; 66 | return *this; 67 | } 68 | 69 | // Distance between two batches of C, used in batched mode, in case we want 70 | // to compute one entry every k. Step of zerom means the same matrix C will 71 | // be written over and over again. 72 | BLASParameters& withBatchStepC(int i) { 73 | batchStepC = i; 74 | return *this; 75 | } 76 | 77 | // Sets real scale in C += alpha * C + scale * A * B 78 | BLASParameters& withScaleReal(float f) { 79 | scaleRe = f; 80 | return *this; 81 | } 82 | 83 | // Sets imaginary scale in C += alpha * C + scale * A * B 84 | BLASParameters& withScaleImaginary(float f) { 85 | scaleIm = f; 86 | return *this; 87 | } 88 | 89 | // Use cgemm instead of sgemm 90 | BLASParameters& withComplex(bool b) { 91 | asComplex = b; 92 | return *this; 93 | } 94 | 95 | // If true, computes C += scale * A * B. Default is C = scale * A * B. 
96 |   BLASParameters& withAccumulate(bool b) {
97 |     accumulate = b;
98 |     return *this;
99 |   }
100 | 
101 |   // Set vector of handle resources
102 |   BLASParameters& withHandles(const std::vector<cublasHandle_t>& h) {
103 |     handles = h;
104 |     return *this;
105 |   }
106 | 
107 |   // Set vector of stream resources
108 |   BLASParameters& withStreams(const std::vector<cudaStream_t>& s) {
109 |     streams = s;
110 |     return *this;
111 |   }
112 | 
113 |   // Transpose A
114 |   BLASParameters& withTransposeA(cublasOperation_t t) {
115 |     transposeA = t;
116 |     return *this;
117 |   }
118 | 
119 |   // Transpose B
120 |   BLASParameters& withTransposeB(cublasOperation_t t) {
121 |     transposeB = t;
122 |     return *this;
123 |   }
124 | 
125 |   // Transpose A
126 |   BLASParameters& withTransposeA(char c) {
127 |     transposeA = (c == 't') ? CUBLAS_OP_T :
128 |       ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N);
129 |     return *this;
130 |   }
131 | 
132 |   // Transpose B
133 |   BLASParameters& withTransposeB(char c) {
134 |     transposeB = (c == 't') ? CUBLAS_OP_T :
135 |       ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N);
136 |     return *this;
137 |   }
138 | 
139 |   unsigned int iterDims;
140 |   unsigned int batchDims;
141 |   unsigned int resourceIndex;
142 |   unsigned int batchStepA;
143 |   unsigned int batchStepB;
144 |   unsigned int batchStepC;
145 |   float scaleRe;
146 |   float scaleIm;
147 |   bool asComplex;
148 |   bool accumulate;
149 |   std::vector<cublasHandle_t> handles;
150 |   std::vector<cudaStream_t> streams;
151 |   cublasOperation_t transposeA;
152 |   cublasOperation_t transposeB;
153 | };
154 | 
155 | std::ostream& operator<<(std::ostream& os, const BLASParameters& params);
156 | 
157 | }}}
158 | 
--------------------------------------------------------------------------------
/src/ConvolutionBias.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2004-present Facebook. All Rights Reserved.
2 | #pragma once
3 | 
4 | struct THCudaTensor;
5 | struct THCState;
6 | 
7 | namespace facebook { namespace deeplearning { namespace torch {
8 | namespace bias {
9 | 
10 | /// Applies an additive bias to all output elements, pointwise, one
11 | /// bias per output plane
12 | /// Performs the operation output[b][o][y][x] += bias[o]
13 | void updateOutputBias(THCState* state,
14 |                       THCudaTensor* outputTH,
15 |                       THCudaTensor* biasTH);
16 | 
17 | /// Applies an additive bias to all output elements, pointwise, one
18 | /// bias per kernel column.
19 | /// Performs the operation output[b][o][x] += bias[x]
20 | void updateOutputTemporalBias(THCState* state,
21 |                               THCudaTensor* outputTH,
22 |                               THCudaTensor* biasTH);
23 | 
24 | /// Updates the gradient bias with the scaled sum of the output per
25 | /// output plane
26 | /// Performs the operation gradBias[o] += biasScale * output[b][o][x][y]
27 | void accGradParametersBias(THCState* state,
28 |                            THCudaTensor* outputTH,
29 |                            THCudaTensor* gradBiasTH,
30 |                            float biasScale);
31 | 
32 | /// Updates the gradient bias with the scaled sum of the output per
33 | /// kernel column
34 | /// Performs the operation gradBias[x] += biasScale * output[b][o][x]
35 | void accGradParametersTemporalBias(THCState* state,
36 |                                    THCudaTensor* outputTH,
37 |                                    THCudaTensor* gradBiasTH,
38 |                                    float biasScale);
39 | 
40 | } } } }
41 | 
--------------------------------------------------------------------------------
/src/CrossMapNormalization.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2014 Facebook
3 |  * @author Tudor Bosman (tudorb@fb.com)
4 |  */
5 | #pragma once
6 | 
7 | namespace facebook { namespace deeplearning { namespace torch {
8 | namespace detail {
9 | 
10 | struct CrossMapNormalizationParam {
11 |   int batchSize;
12 |   int numFeatures;
13 |   int featureSize;
14 |   int kernelSize;
15 |   int kernelRadius;
16 |   float scale;
17 |   float power;
18 | };
19 | 
20 | void launchCrossMapNormalizationUpdateOutputKernel(
21 |     cudaStream_t stream,
22 |     const float* input,
23 |     float* output,
24 |     float* squaredSum,
25 |     CrossMapNormalizationParam params);
26 | 
27 | void launchCrossMapNormalizationUpdateGradInputKernel(
28 |     cudaStream_t stream,
29 |     const float* input,
30 |     const float* gradOutput,
31 |     const float* squaredSum,
32 |     float* gradInput,
33 |     CrossMapNormalizationParam params);
34 | 
35 | }}}} // namespaces
36 | 
--------------------------------------------------------------------------------
/src/CrossMapNormalizationHost.cpp:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2014 Facebook
3 |  * @author Tudor Bosman (tudorb@fb.com)
4 |  */
5 | 
6 | #include "THC.h"
7 | #include "src/CrossMapNormalization.cuh"
8 | #include "src/Utils.h"
9 | #include <lua.hpp>
10 | #include <luaT.h>
11 | 
12 | namespace facebook { namespace deeplearning { namespace torch {
13 | 
14 | namespace {
15 | 
16 | // Forward pass
17 | int updateOutput(lua_State* L) {
18 |   THCState* state = getCutorchState(L);
19 |   auto input = static_cast<THCudaTensor*>(
20 |       luaT_checkudata(L, 2, "torch.CudaTensor"));
21 |   auto output = static_cast<THCudaTensor*>(
22 |       luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"));
23 | 
24 | 
25 |   int outputIdx = lua_gettop(L);
26 |   auto squaredSum = static_cast<THCudaTensor*>(
27 |       luaT_getfieldcheckudata(L, 1, "squaredSum", "torch.CudaTensor"));
28 | 
29 |   THAssert(THCudaTensor_checkGPU(state, 3, input, output, squaredSum));
30 | 
31 |   detail::CrossMapNormalizationParam param;
32 |   param.kernelSize = luaT_getfieldcheckint(L, 1, "size");
33 |   param.kernelRadius = param.kernelSize / 2;
34 |   param.scale = luaT_getfieldchecknumber(L, 1, "scale");
35 |   param.power = luaT_getfieldchecknumber(L, 1, "power");
36 | 
37 |   int ndims = THCudaTensor_nDimension(state, input);
38 |   if (ndims != 3 && ndims != 4) {
39 |     luaL_error(L, "Invalid input tensor dimension");
40 |   }
41 | 
42 |   if (param.kernelSize % 2 == 0) {
43 |     luaL_error(L, "Kernel size must be odd");
44 |   }
45 | 
46 |   // Make tensors contiguous
47 |   input = THCudaTensor_newContiguous(state, input);
48 |   output = THCudaTensor_newContiguous(state, output);
49 | 
50 |   // Resize derived tensors based on input
51 |   THCudaTensor_resizeAs(state, output, input);
52 |   THCudaTensor_resizeAs(state, squaredSum, input);
53 | 
54 |   param.batchSize = 1;
55 |   int firstDim = 0;
56 |   if (ndims == 4) {
57 |     param.batchSize = THCudaTensor_size(state, input, 0);
58 |     firstDim = 1;
59 |   }
60 | 
61 |   param.numFeatures = THCudaTensor_size(state, input, firstDim);
62 |   param.featureSize = THCudaTensor_stride(state, input, firstDim);
63 | 
64 |   detail::launchCrossMapNormalizationUpdateOutputKernel(
65 |       THCState_getCurrentStream(state),
66 |       THCudaTensor_data(state, input),
67 |       THCudaTensor_data(state, output),
68 |       THCudaTensor_data(state, squaredSum),
69 |       param);
70 |   lua_pushvalue(L, outputIdx);
71 |   THCudaTensor_free(state, input);
72 |   THCudaTensor_free(state, output);
73 | 
74 |   return 1;
75 | }
76 | 
77 | // Backprop
78 | int updateGradInput(lua_State* L) {
79 |   THCState* state = getCutorchState(L);
80 |   auto input = static_cast<THCudaTensor*>(
81 |       luaT_checkudata(L, 2, "torch.CudaTensor"));
82 |   auto gradOutput = static_cast<THCudaTensor*>(
83 |       luaT_checkudata(L, 3, "torch.CudaTensor"));
84 |   auto gradInput = static_cast<THCudaTensor*>(
85 |       luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor"));
86 |   int gradInputIdx = lua_gettop(L);
87 |   auto squaredSum = static_cast<THCudaTensor*>(
88 |       luaT_getfieldcheckudata(L, 1, "squaredSum", "torch.CudaTensor"));
89 | 
90 |   THAssert(THCudaTensor_checkGPU(state, 4, input,
91 |                                  gradInput, gradOutput, squaredSum));
92 | 
93 |   detail::CrossMapNormalizationParam param;
94 |   param.kernelSize = luaT_getfieldcheckint(L, 1, "size");
95 |   param.kernelRadius = param.kernelSize / 2;
96 |   param.scale = luaT_getfieldchecknumber(L, 1, "scale");
97 |   param.power = luaT_getfieldchecknumber(L, 1, "power");
98 | 
99 |   int ndims = THCudaTensor_nDimension(state, input);
100 |   if (ndims != 3 && ndims != 4) {
101 |     luaL_error(L, "Invalid input tensor dimension");
102 |   }
103 | 
104 |   if (param.kernelSize % 2 == 0) {
105 |     luaL_error(L, "Kernel size must be odd");
106 |   }
107 | 
108 |   // Make tensors contiguous
109 |   input = THCudaTensor_newContiguous(state, input);
110 |   gradOutput = THCudaTensor_newContiguous(state, gradOutput);
111 |   gradInput = THCudaTensor_newContiguous(state, gradInput);
112 | 
113 |   // Resize derived tensors based on input
114 |   THCudaTensor_resizeAs(state, gradInput, input);
115 | 
116 |   param.batchSize = 1;
117 |   int firstDim = 0;
118 |   if (ndims == 4) {
119 |     param.batchSize = THCudaTensor_size(state, input, 0);
120 |     firstDim = 1;
121 |   }
122 | 
123 | 
124 |   param.numFeatures = THCudaTensor_size(state, input, firstDim);
125 |   param.featureSize = THCudaTensor_stride(state, input, firstDim);
126 | 
127 |   detail::launchCrossMapNormalizationUpdateGradInputKernel(
128 |       THCState_getCurrentStream(state),
129 |       THCudaTensor_data(state, input),
130 |       THCudaTensor_data(state, gradOutput),
131 |       THCudaTensor_data(state, squaredSum),
132 |       THCudaTensor_data(state, gradInput),
133 |       param);
134 | 
135 |   lua_pushvalue(L, gradInputIdx);
136 |   THCudaTensor_free(state, gradOutput);
137 |   THCudaTensor_free(state, gradInput);
138 |   THCudaTensor_free(state, input);
139 |   return 1;
140 | }
141 | 
142 | const luaL_Reg functions[] = {
143 |   {"CrossMapNormalization_updateOutput", updateOutput},
144 |   {"CrossMapNormalization_updateGradInput", updateGradInput},
145 |   {nullptr, nullptr},
146 | };
147 | 
148 | } // namespace
149 | 
150 | void initCrossMapNormalizationCuda(lua_State* L) {
151 |   luaT_pushmetatable(L, "torch.CudaTensor");
152 | 
luaT_registeratname(L, functions, "nn"); 153 | lua_pop(L, 1); 154 | } 155 | 156 | }}} // namespaces 157 | -------------------------------------------------------------------------------- /src/CuBLASWrapper.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | 6 | #include "src/BLASParameters.h" 7 | 8 | #include "cublas_v2.h" 9 | #include 10 | #include 11 | #include 12 | 13 | namespace facebook { namespace deeplearning { namespace torch { 14 | 15 | // 16 | // This transposition wrapper implements quick device-side transpositions. 17 | // Consider tensor dimensions are collapsed into a 2-D 'y'-by-'x'. 18 | // The wrapper takes a separator integer and considers dimensions 19 | // (0 .. separator - 1) as being collapsed to form the 'y' 20 | // dimension. Dimensions (separator .. Dim - 1) are collapsed to form the 'x' 21 | // dimension. 22 | // 23 | // The complex case is a bit trickier since Torch does not natively support 24 | // complex numbers, we emulate them with float[2]. In that case, 'x' is 25 | // special in that it has to be exactly [x/2][2] and the inner 2 floats can 26 | // never be transposed. 27 | // 28 | // The invariant is that in and out are sized identically on entry and that 29 | // out is permuted to account for the transposition on exit. 30 | // 31 | // This wrapper requires non-padded tensors since it calls CUBLAS 32 | // under the hood. It could support padding along 1 dimension if needed. 33 | // 34 | template 35 | void transpose(const cuda::DeviceTensor& in, 36 | cuda::DeviceTensor& out, 37 | int separator, 38 | bool asComplex = false, 39 | bool transposeMetaData = true, 40 | cublasHandle_t handle = NULL, 41 | cudaStream_t stream = NULL); 42 | 43 | template 44 | void transposeAsComplex(const cuda::DeviceTensor& in, 45 | cuda::DeviceTensor& out, 46 | int separator, 47 | bool transposeMetaData = true, 48 | cublasHandle_t handle = NULL, 49 | cudaStream_t stream = NULL); 50 | 51 | // Single matmult, not batched, not iterated, complex or real 52 | template 53 | void matmult(cuda::DeviceTensor& C, 54 | const cuda::DeviceTensor& A, 55 | const cuda::DeviceTensor& B, 56 | const BLASParameters& params); 57 | 58 | 59 | // Batched matmult from device pointers and model tensors serve to derive 60 | // problem sizes. This is exposed for convenience to perform fancier batched 61 | // sgemm calls. 62 | void matmultBatched(thrust::host_vector& CPtrVec, 63 | thrust::host_vector& APtrVec, 64 | thrust::host_vector& BPtrVec, 65 | const cuda::DeviceTensor& modelC, 66 | const cuda::DeviceTensor& modelA, 67 | const cuda::DeviceTensor& modelB, 68 | const BLASParameters& params); 69 | 70 | // Batched matmult from device pointers and model tensors serve to derive 71 | // problem sizes. This is exposed for convenience to perform fancier batched 72 | // sgemm calls. 
73 | void matmultBatched(thrust::host_vector& CPtrVec, 74 | thrust::host_vector& APtrVec, 75 | thrust::host_vector& BPtrVec, 76 | const cuda::DeviceTensor& modelC, 77 | const cuda::DeviceTensor& modelA, 78 | const cuda::DeviceTensor& modelB, 79 | const BLASParameters& params); 80 | 81 | // Batched matmult, not iterated, complex or real 82 | // batchDims are outermost dimensions of the tensor iterated in parallel 83 | template 84 | void matmultBatched(cuda::DeviceTensor& C, 85 | cuda::DeviceTensor& A, 86 | cuda::DeviceTensor& B, 87 | const BLASParameters& params); 88 | 89 | // Iterated matmult, batch or not, complex or real 90 | // iterDims are outermost dimensions of the tensor iterated sequentially 91 | // batchDims are outermost dimensions of the tensor iterated in parallel 92 | template 93 | void matmultIter(cuda::DeviceTensor& C, 94 | cuda::DeviceTensor& A, 95 | cuda::DeviceTensor& B, 96 | const BLASParameters& params); 97 | 98 | } } } // namespace 99 | -------------------------------------------------------------------------------- /src/CudaTensorUtils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "THCTensor.h" 5 | #include 6 | #include "thpp/Tensor.h" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | struct THCState; 13 | 14 | // unique_ptr destructor for THCudaTensor 15 | struct THCudaTensor; 16 | struct CudaTensorDeleter { 17 | explicit CudaTensorDeleter(THCState* s) : state(s) {} 18 | CudaTensorDeleter() : state(nullptr) {} 19 | 20 | void operator()(THCudaTensor*); 21 | THCState* state; 22 | }; 23 | 24 | // unique_ptr destructor for device-malloc'd memory 25 | struct CudaDeleter { 26 | void operator()(void* p) { 27 | if (p) { 28 | cudaFree(p); 29 | } 30 | } 31 | }; 32 | 33 | namespace facebook { namespace deeplearning { namespace torch { 34 | 35 | /// Constructs a new THCudaTensor initialized to 0 with the given 36 | /// sizes and strides. 37 | /// See D1581014, this method allocates a full tensor whose storage capacity is 38 | /// greater than strictly requested by torch. 39 | std::unique_ptr 40 | makeTHCudaTensorFull(THCState* state, 41 | const std::vector& sizes, 42 | const folly::Optional>& strides = 43 | folly::none); 44 | 45 | /// Constructs a new THCudaTensor which is a view of the aliased 46 | /// THCudaTensor with the given sizes and strides. 47 | /// The requested size (strides(0) * sizes(0)) must fit within the input 48 | /// Tensor otherwise overflows would occur. 49 | std::unique_ptr 50 | makeAliasedTHCudaTensorFull(THCState* state, 51 | THCudaTensor* in, 52 | const std::vector& sizes, 53 | const folly::Optional>& strides = 54 | folly::none); 55 | 56 | /// See D1581014, this method allocates a full tensor whose storage capacity is 57 | /// greater than strictly requested by torch. 58 | std::unique_ptr 59 | makeTHCudaTensorFull(THCState* state, 60 | std::initializer_list sizes, 61 | std::initializer_list strides = 62 | std::initializer_list()); 63 | 64 | /// Copy a THCudaTensor to a new host-resident Tensor. Does not modify 'tensor'. 65 | thpp::Tensor copyFromCuda(THCState* state, 66 | const THCudaTensor* tensor); 67 | 68 | /// Copy a Tensor to a new THCudaTensor. Does not modify 'tensor'. 
69 | std::unique_ptr 70 | copyToCuda(THCState* state, thpp::Tensor& tensor); 71 | 72 | template 73 | std::unique_ptr cudaAlloc(size_t size) { 74 | T* ptr = nullptr; 75 | const auto err = cudaMalloc(&ptr, size); 76 | 77 | if (!ptr || err == cudaErrorMemoryAllocation) { 78 | throw std::bad_alloc(); 79 | } 80 | 81 | return std::unique_ptr(ptr, CudaDeleter()); 82 | } 83 | 84 | } } } // namespace 85 | -------------------------------------------------------------------------------- /src/DeviceTensorUtils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "THCTensor.h" 6 | 7 | #include 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { 10 | 11 | /// Constructs a DeviceTensor initialized from a THCudaTensor. Will 12 | /// throw if the dimensionality does not match. 13 | template class PtrTraits> 15 | cuda::DeviceTensor 16 | torchToDeviceTensor(THCState* state, THCudaTensor* t); 17 | 18 | template 19 | cuda::DeviceTensor 20 | torchToDeviceTensor(THCState* state, THCudaTensor* t) { 21 | return torchToDeviceTensor(state, t); 22 | } 23 | 24 | template 25 | cuda::DeviceTensor 26 | torchToDeviceTensor(THCState* state, THCudaTensor* t) { 27 | return torchToDeviceTensor(state, t); 28 | } 29 | 30 | /// Constructs a DeviceTensor initialized from a THCudaTensor by 31 | /// upcasting or downcasting the tensor to that of a different 32 | /// dimension. 33 | template class PtrTraits> 35 | cuda::DeviceTensor 36 | torchToDeviceTensorCast(THCState* state, THCudaTensor* t); 37 | 38 | template 39 | cuda::DeviceTensor 40 | torchToDeviceTensorCast(THCState* state, THCudaTensor* t) { 41 | return 42 | torchToDeviceTensorCast(state, t); 43 | } 44 | 45 | template 46 | cuda::DeviceTensor 47 | torchToDeviceTensorCast(THCState* state, THCudaTensor* t) { 48 | return 49 | torchToDeviceTensorCast(state, t); 50 | } 51 | 52 | } } } // namespace 53 | 54 | #include "src/DeviceTensorUtils-inl.h" 55 | -------------------------------------------------------------------------------- /src/FeatureLPPooling.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include 5 | 6 | namespace facebook { namespace deeplearning { namespace torch { 7 | 8 | bool 9 | runFeatureLPPoolingUpdateOutput( 10 | cudaStream_t stream, 11 | const cuda::DeviceTensor& input, 12 | cuda::DeviceTensor& output, 13 | float power, int width, int stride); 14 | 15 | bool 16 | runFeatureLPPoolingUpdateGradInput( 17 | cudaStream_t stream, 18 | const cuda::DeviceTensor& gradOutput, 19 | const cuda::DeviceTensor& input, 20 | const cuda::DeviceTensor& output, 21 | cuda::DeviceTensor& gradInput, 22 | float power, int width, int stride); 23 | 24 | } } } 25 | -------------------------------------------------------------------------------- /src/HalfPrec.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 
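A usage sketch for the RAII device-allocation helpers declared in CudaTensorUtils.h above; the function and the element count n are illustrative, not part of the library:

```cpp
#include <cstddef>
#include "src/CudaTensorUtils.h"

void deviceScratchExample(size_t n) {
  // cudaAlloc<T> takes a size in bytes, throws std::bad_alloc on failure,
  // and returns a unique_ptr whose CudaDeleter calls cudaFree() at scope exit.
  auto scratch =
      facebook::deeplearning::torch::cudaAlloc<float>(n * sizeof(float));
  float* raw = scratch.get();  // raw device pointer, usable in kernel launches
  (void)raw;                   // no explicit cudaFree() needed anywhere
}
```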
2 | 3 | #include "src/HalfPrec.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include "src/Utils.h" 9 | #include "src/Tensor.h" 10 | #include "src/LuaUtils.h" 11 | #include "THC.h" 12 | 13 | using namespace std; 14 | using namespace facebook::deeplearning::torch; 15 | 16 | namespace { 17 | 18 | // Would be nice to use thrust, but its header files are full of things 19 | // that trip our Werr settings for unused typedefs. 20 | struct HalfTensor { 21 | explicit HalfTensor(THCState* state, THCudaTensor* floats) { 22 | auto sz = THCudaTensor_nElement(state, floats); 23 | auto err = cudaMalloc(&devPtr_, sz * sizeof(half_t)); 24 | if (err != cudaSuccess) { 25 | throw std::runtime_error("failed to cudamalloc HalfTensor"); 26 | } 27 | size_ = sz; 28 | halfprec_ToHalf(THCState_getCurrentStream(state), 29 | THCudaTensor_data(state, floats), devPtr_, size_); 30 | } 31 | 32 | HalfTensor() 33 | : devPtr_(nullptr) 34 | , size_(0) { 35 | } 36 | 37 | ~HalfTensor() { 38 | cudaFree(devPtr_); 39 | } 40 | 41 | void toFloat(THCState* state, THCudaTensor* dest) { 42 | THCudaTensor_resize1d(state, dest, size_); 43 | assert(size_ > 0); 44 | halfprec_ToFloat(THCState_getCurrentStream(state), 45 | devPtr_, THCudaTensor_data(state, dest), size_); 46 | } 47 | 48 | private: 49 | half_t *devPtr_; 50 | size_t size_; 51 | }; 52 | 53 | const char* kLibName = "HalfPrec"; 54 | 55 | int HalfPrec_new(lua_State* l) { 56 | auto dv = new HalfTensor(); 57 | luaT_pushudata(l, dv, kLibName); 58 | return 1; 59 | } 60 | 61 | int HalfPrec_destroy(lua_State* l) { 62 | delete static_cast(luaT_checkudata(l, 1, kLibName)); 63 | return 0; 64 | }; 65 | 66 | int HalfPrec_toHalfCUDA(lua_State* l) { 67 | THCState* state = getCutorchState(l); 68 | auto input = (THCudaTensor*)luaT_checkudata(l, 1, "torch.CudaTensor"); 69 | THAssert(THCudaTensor_checkGPU(state, 1, input)); 70 | auto cinput = THCudaTensor_newContiguous(state, input); 71 | auto dest = new HalfTensor(state, cinput); 72 | 73 | luaT_pushudata(l, dest, kLibName); 74 | THCudaTensor_free(state, cinput); 75 | return 1; 76 | } 77 | 78 | int HalfPrec_toFloatCUDA(lua_State* l) { 79 | THCState* state = getCutorchState(l); 80 | auto input = (HalfTensor*)luaT_checkudata(l, 1, kLibName); 81 | auto dest = THCudaTensor_new(state); 82 | input->toFloat(state, dest); 83 | luaT_pushudata(l, dest, "torch.CudaTensor"); 84 | return 1; 85 | } 86 | 87 | const struct luaL_reg manifest[] = { 88 | {"new", HalfPrec_new}, 89 | {"toHalfCUDA", HalfPrec_toHalfCUDA}, 90 | {"toFloatCUDA", HalfPrec_toFloatCUDA}, 91 | {"free", HalfPrec_destroy}, 92 | {nullptr, nullptr}, 93 | }; 94 | 95 | } 96 | 97 | extern "C" int luaopen_libhalfprec(lua_State* L) { 98 | luaT_newmetatable(L, kLibName, nullptr, 99 | HalfPrec_new, // ctor 100 | HalfPrec_destroy, // dtor 101 | nullptr); 102 | lua_newtable(L); 103 | luaL_register(L, nullptr, manifest); 104 | return 1; 105 | } 106 | -------------------------------------------------------------------------------- /src/HalfPrec.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 
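HalfPrec.cpp above funnels all conversions through the two entry points declared below in HalfPrec.h. A round-trip sketch, assuming devIn, devHalf, and devOut are device pointers with room for n elements:

```cpp
#include "src/HalfPrec.h"

void halfRoundTrip(cudaStream_t stream, const float* devIn, half_t* devHalf,
                   float* devOut, size_t n) {
  halfprec_ToHalf(stream, devIn, devHalf, n);    // fp32 -> fp16 bit patterns
  halfprec_ToFloat(stream, devHalf, devOut, n);  // fp16 bit patterns -> fp32
  // The conversion is lossy: HalfPrecTest.cpp further down accepts a relative
  // error of about 1/1000 on round-tripped values.
}
```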
2 | 3 | #pragma once 4 | #include 5 | #include 6 | #include 7 | 8 | typedef uint16_t half_t; 9 | 10 | void halfprec_ToHalf(cudaStream_t stream, 11 | const float* input, 12 | half_t* output, 13 | size_t n); 14 | 15 | void halfprec_ToFloat(cudaStream_t stream, 16 | const half_t* input, 17 | float* output, 18 | size_t n); 19 | -------------------------------------------------------------------------------- /src/HalfPrecKernels.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "src/HalfPrec.h" 8 | #include "src/util/Transform.cuh" 9 | 10 | using namespace facebook::cuda; 11 | void halfprec_ToHalf(cudaStream_t stream, 12 | const float* input, 13 | half_t* output, 14 | size_t n) { 15 | transform(stream, input, output, n); 16 | } 17 | 18 | void halfprec_ToFloat(cudaStream_t stream, 19 | const half_t* input, 20 | float* output, 21 | size_t n) { 22 | transform(stream, input, output, n); 23 | } 24 | -------------------------------------------------------------------------------- /src/HalfPrecTest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void cudaCheck(cudaError_t e) { 10 | auto toStr = [&] { 11 | return std::string(cudaGetErrorString(e)); 12 | }; 13 | if (e != cudaSuccess) { 14 | throw std::runtime_error("cuda failure: " + toStr()); 15 | } 16 | e = cudaDeviceSynchronize(); 17 | if (e != cudaSuccess) { 18 | throw std::runtime_error("cuda failure @ synchronize: " + toStr()); 19 | } 20 | } 21 | 22 | template 23 | class CUDA { 24 | public: 25 | explicit CUDA(size_t n) 26 | : n_(n) { 27 | cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T))); 28 | cudaCheck(cudaMemset(vals_, 0, n_ * sizeof(T))); 29 | } 30 | 31 | CUDA(const T* base, size_t n) : 32 | n_(n) { 33 | cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T))); 34 | cudaCheck(cudaMemcpy(vals_, base, n_ * sizeof(T), cudaMemcpyHostToDevice)); 35 | } 36 | 37 | void toHost(T* base) const { 38 | cudaCheck(cudaMemcpy(base, vals_, n_ * sizeof(T), cudaMemcpyDeviceToHost)); 39 | } 40 | 41 | size_t size() const { 42 | return n_; 43 | } 44 | 45 | ~CUDA() { 46 | cudaCheck(cudaFree(vals_)); 47 | } 48 | 49 | T* data() { 50 | return vals_; 51 | } 52 | 53 | private: 54 | T* vals_; 55 | size_t n_; 56 | }; 57 | 58 | TEST(HalfPrec, cuda) { 59 | float hostFloats[] = { 60 | -1, 61 | -100, 62 | 2.3, 63 | 0.0, 64 | 1.0, 65 | 3867.2, 66 | }; 67 | const auto N = sizeof(hostFloats) / sizeof(float); 68 | CUDA devFloats(hostFloats, N); 69 | CUDA devHalfs(N); 70 | 71 | halfprec_ToHalf(nullptr, devFloats.data(), devHalfs.data(), devFloats.size()); 72 | cudaCheck(cudaDeviceSynchronize()); 73 | 74 | { 75 | uint16_t cpuHalfs[N] = { 666 }; 76 | facebook::math::Float16::encode(cpuHalfs, hostFloats, N); 77 | 78 | half_t convertedHalfs[N]; 79 | devHalfs.toHost(convertedHalfs); 80 | for (int i = 0; i < N; i++) { 81 | // The CPU and GPU disagree by a digit sometimes because the GPU 82 | // is using a different rounding mode. 
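// (Hence the +/-1 tolerance on the raw encoded bit patterns in the EXPECT_NEAR below.)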
83 | EXPECT_NEAR(cpuHalfs[i], convertedHalfs[i], 1); 84 | } 85 | } 86 | 87 | CUDA exploded(N); 88 | halfprec_ToFloat(nullptr, devHalfs.data(), exploded.data(), N); 89 | float postExpl[N]; 90 | exploded.toHost(postExpl); 91 | for (int i = 0; i < N; i++) { 92 | auto thousandth = fabs(hostFloats[i] / 1000.0); 93 | EXPECT_NEAR(postExpl[i], hostFloats[i], thousandth); 94 | } 95 | } 96 | 97 | int halfSign(half_t h) { 98 | return (h & (1 << 15)) >> 15; 99 | } 100 | 101 | int halfExp(half_t h) { 102 | return (h >> 10) & 31; 103 | } 104 | 105 | int halfMant(half_t h) { 106 | return h & 1023; 107 | } 108 | 109 | TEST(HalfPrec, exhaustive) { 110 | const auto N = 1 << 16; 111 | 112 | half_t hostHalfs[N]; 113 | float hostFloats[N]; 114 | for (int i = 0; i < N; i++) { 115 | hostHalfs[i] = i; 116 | } 117 | facebook::math::Float16::decode(hostFloats, hostHalfs, N); 118 | 119 | CUDA devHalfs(hostHalfs, N); 120 | CUDA devFloats(N); 121 | float devOut[N]; 122 | halfprec_ToFloat(nullptr, devHalfs.data(), devFloats.data(), N); 123 | devFloats.toHost(devOut); 124 | for (int i = 0; i < N; i++) { 125 | if (halfExp(i) == 0) continue; // subnormals 126 | if (halfExp(i) == 31) continue; // inf/nan 127 | if (hostFloats[i] != devOut[i]) { 128 | printf("failure: %d %x s/e/m %01x %03x %04x\n", 129 | i, i, halfSign(i), halfExp(i), halfMant(i)); 130 | EXPECT_EQ(hostFloats[i], devOut[i]); 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/Includes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "thpp/Tensor.h" 4 | #include "thpp/Storage.h" 5 | #include "fblualib/LuaUtils.h" 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | 9 | using namespace thpp; 10 | using namespace fblualib; 11 | 12 | }}} // namespaces 13 | -------------------------------------------------------------------------------- /src/InitCuda.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #include 7 | 8 | #ifdef FB_INTERNAL 9 | #else 10 | #define LUAOPEN(x) luaopen_fbcunn_cuda_ext(x) 11 | #endif 12 | 13 | namespace facebook { namespace deeplearning { namespace torch { 14 | 15 | void initCrossMapNormalizationCuda(lua_State* L); 16 | void initLocallyConnectedCuda(lua_State* L); 17 | void initLookupTableGPUCuda(lua_State* L); 18 | void initHSMCuda(lua_State* L); 19 | void initTemporalConvolutionFB(lua_State *L); 20 | void initTemporalKMaxPoolingCuda(lua_State* L); 21 | void initOneBitQuantizationCuda(lua_State* L); 22 | void initSparseNLLCriterionCuda(lua_State* L); 23 | void initTemporalConvolutionTBCCuda(lua_State* L); 24 | void initFeatureLPPoolingCuda(lua_State* L); 25 | void initCuBLASWrapper(lua_State *L); 26 | // void initFFTWrapper(lua_State *L); 27 | // void initSpatialConvolutionCuFFT(lua_State *L); 28 | void initWeightedLookupTableCuda(lua_State *L); 29 | 30 | }}} // namespace 31 | 32 | using namespace facebook::deeplearning::torch; 33 | 34 | extern "C" int LUAOPEN(lua_State* L) { 35 | initCrossMapNormalizationCuda(L); 36 | initLocallyConnectedCuda(L); 37 | initLookupTableGPUCuda(L); 38 | initTemporalConvolutionFB(L); 39 | initTemporalKMaxPoolingCuda(L); 40 | initHSMCuda(L); 41 | initOneBitQuantizationCuda(L); 42 | initSparseNLLCriterionCuda(L); 43 | initTemporalConvolutionTBCCuda(L); 44 | initFeatureLPPoolingCuda(L); 45 | initCuBLASWrapper(L); 46 | // 
initFFTWrapper(L); 47 | // initSpatialConvolutionCuFFT(L); 48 | initWeightedLookupTableCuda(L); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /src/LookupTableGPU.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/CudaUtils.cuh" 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "cuda/WarpReductions.cuh" 6 | 7 | using namespace facebook::cuda; 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { 10 | namespace detail { 11 | 12 | namespace { 13 | 14 | // for updateOutput 15 | 16 | __device__ __forceinline__ int getBatch() { 17 | return blockIdx.x; 18 | } 19 | 20 | __device__ __forceinline__ int getLookupElement() { 21 | return blockIdx.y; 22 | } 23 | 24 | // // for accGradParameters 25 | 26 | __device__ __forceinline__ int getFeatureDim() { 27 | // Each warp runs effectively independently, but there is slightly 28 | // better utilization if each block has at least 4 warps. 29 | int warpId = threadIdx.x / 32; 30 | return blockIdx.x * 4 + warpId; 31 | } 32 | 33 | // Feature dimension is always innermost. Depending on tensor layout, 34 | // it may or may not be contiguous. 35 | __global__ void updateOutputKernel(DeviceTensor input, 36 | DeviceTensor weight, 37 | DeviceTensor output) { 38 | int weightIndex = (int)(input[getBatch()][getLookupElement()] - 0.5f); 39 | 40 | for (int i = threadIdx.x; i < weight.getSize(1); i += blockDim.x) { 41 | output[getBatch()][getLookupElement()][i] = weight[weightIndex][i]; 42 | } 43 | } 44 | 45 | __global__ void accGradParametersKernel(DeviceTensor input, 46 | DeviceTensor gradOutput, 47 | DeviceTensor gradWeight, 48 | float scale) { 49 | const int featureDim = getFeatureDim(); 50 | if (featureDim >= gradWeight.getSize(1)) { 51 | return; 52 | } 53 | 54 | // The strategy here is that each warp handles a single feature 55 | // dimension. 56 | // Within that feature dimension, points in the [batch][element] 57 | // dimension can overlap, and we need to determine if threads want 58 | // to add to the gradient in a colliding manner. 59 | // Typically one would use floating-point atomicAdd() to resolve 60 | // these collisions, but that is non-deterministic if there are 61 | // collisions. Non-determinism for this code is really bad, 62 | // especially in RNNs, and is prone to snowballing error. 63 | // In order to get a deterministic order of execution, we handle 64 | // non-colliding updates separately from colliding ones. Colliding 65 | // updates are serialized in their order of execution by using the 66 | // warp-wide collision detector `warpHasCollision`. 
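// (Each lane starts at its lane id and strides by WARP_SIZE over the flattened
// [batch][element] grid, so the traversal order, and with it the summation
// order, is identical on every run.)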
67 | unsigned int maxLinearIndex = input.getSize(0) * input.getSize(1); 68 | for (unsigned int i = getLaneId(); i < maxLinearIndex; i += WARP_SIZE) { 69 | unsigned int batch = i / input.getSize(1); 70 | unsigned int lookupElement = i % input.getSize(1); 71 | 72 | int weightIndex = (int) (input[batch][lookupElement].ldg() - 0.5f); 73 | float update = gradOutput[batch][lookupElement][featureDim] * scale; 74 | 75 | // Check for collision 76 | if (warpHasCollision(weightIndex)) { 77 | // Run all lanes sequentially; warp divergence 78 | for (int i = 0; i < WARP_SIZE; ++i) { 79 | if (getLaneId() == i) { 80 | gradWeight[weightIndex][featureDim] += update; 81 | } 82 | } 83 | } else { 84 | // No collision; warp coherence 85 | gradWeight[weightIndex][featureDim] += update; 86 | } 87 | } 88 | } 89 | 90 | } // namespace 91 | 92 | typedef DeviceTensor DeviceTensor2; 93 | typedef DeviceTensor DeviceTensor3; 94 | 95 | void launchLookupTableGPUUpdateOutputKernel(cudaStream_t stream, 96 | DeviceTensor2& input, 97 | DeviceTensor2& weight, 98 | DeviceTensor3& output) { 99 | const dim3 grid(input.getSize(0), input.getSize(1)); 100 | const dim3 block(min(weight.getSize(1), 1024)); 101 | 102 | updateOutputKernel<<>>(input, weight, output); 103 | } 104 | 105 | void launchLookupTableGPUAccGradParametersKernel(cudaStream_t stream, 106 | DeviceTensor2& input, 107 | DeviceTensor3& gradOutput, 108 | DeviceTensor2& gradWeight, 109 | float scale) { 110 | // Target 4 warps/block for better utilization. Even if the input 111 | // doesn't have that many dimensions, the blocks/warps not 112 | // participating will just exit immediately. 113 | const dim3 grid(ceil(gradOutput.getSize(2), 4)); 114 | const dim3 block(32 * 4); 115 | 116 | accGradParametersKernel<<>>( 117 | input, gradOutput, gradWeight, scale); 118 | } 119 | 120 | }}}} // namespaces 121 | -------------------------------------------------------------------------------- /src/LuaUtils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "fblualib/LuaUtils.h" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | using namespace fblualib; 8 | 9 | }}} // namespaces 10 | -------------------------------------------------------------------------------- /src/MM.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
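The collision-handling idiom from accGradParametersKernel in LookupTableGPU.cu above, isolated as a sketch; warpHasCollision() and getLaneId() are the fbcuda helpers that kernel already uses:

```cpp
#include "cuda/CudaUtils.cuh"       // fbcuda helpers, as included by
#include "cuda/WarpReductions.cuh"  // LookupTableGPU.cu above

using namespace facebook::cuda;

// Each lane adds 'update' to its own slot 'addr', where 'key' identifies the
// slot. Colliding lanes are serialized in lane order, so the floating-point
// summation order, and therefore the result, is deterministic.
__device__ void deterministicWarpAdd(float* addr, int key, float update) {
  if (warpHasCollision(key)) {
    for (int lane = 0; lane < 32; ++lane) {
      if (getLaneId() == lane) {
        *addr += update;  // lanes take turns; fixed order, reproducible sum
      }
    }
  } else {
    *addr += update;  // no two lanes share 'key'; direct update is safe
  }
}
```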
2 | 3 | #include "DeviceTensorUtils.h" 4 | #include "THCTensor.h" 5 | 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "cuda/MM.cuh" 8 | 9 | 10 | using namespace facebook::cuda; 11 | 12 | namespace facebook { namespace deeplearning { namespace torch { 13 | 14 | template 15 | 16 | void transposeMM(DeviceTensor& A, 17 | DeviceTensor& B, 18 | DeviceTensor& C, 19 | float invNorm, 20 | cudaStream_t s = 0) { 21 | facebook::cuda::transposeMM 22 | ( 23 | A, B, C, invNorm, s); 24 | } 25 | 26 | #define INSTANTIATE_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ 27 | template void transposeMM( \ 28 | DeviceTensor& A, \ 29 | DeviceTensor& B, \ 30 | DeviceTensor& C, \ 31 | float invNorm, \ 32 | cudaStream_t s); 33 | 34 | INSTANTIATE_TRANSPOSE_MM(5, true, false, true); 35 | INSTANTIATE_TRANSPOSE_MM(5, false, true, true); 36 | INSTANTIATE_TRANSPOSE_MM(5, false, false, true); 37 | INSTANTIATE_TRANSPOSE_MM(5, true, false, false); 38 | INSTANTIATE_TRANSPOSE_MM(5, false, true, false); 39 | INSTANTIATE_TRANSPOSE_MM(5, false, false, false); 40 | 41 | #define CALL_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ 42 | if (THCudaTensor_nDimension(state, tA) == DIM && \ 43 | conjugateTransposeA == CONJA && \ 44 | conjugateTransposeB == CONJB && \ 45 | accumulate == ACC) { \ 46 | DeviceTensor A = torchToDeviceTensor(state, tA); \ 47 | DeviceTensor B = torchToDeviceTensor(state, tB); \ 48 | DeviceTensor C = torchToDeviceTensor(state, tC); \ 49 | facebook::deeplearning::torch::transposeMM( \ 50 | A, B, C, invNorm, THCState_getCurrentStream(state)); \ 51 | return; \ 52 | } 53 | 54 | extern "C" void transposeMMFFI(THCState* state, 55 | THCudaTensor* tA, 56 | THCudaTensor* tB, 57 | THCudaTensor* tC, 58 | float invNorm, 59 | bool conjugateTransposeA, 60 | bool conjugateTransposeB, 61 | bool accumulate) { 62 | CHECK_EQ(THCudaTensor_nDimension(state, tA), 63 | THCudaTensor_nDimension(state, tB)); 64 | CHECK_EQ(THCudaTensor_nDimension(state, tA), 65 | THCudaTensor_nDimension(state, tC)); 66 | 67 | CALL_TRANSPOSE_MM(5, true, false, true); 68 | CALL_TRANSPOSE_MM(5, false, true, true); 69 | CALL_TRANSPOSE_MM(5, false, false, true); 70 | CALL_TRANSPOSE_MM(5, true, false, false); 71 | CALL_TRANSPOSE_MM(5, false, true, false); 72 | CALL_TRANSPOSE_MM(5, false, false, false); 73 | } 74 | 75 | #undef INSTANTIATE_TRANSPOSE_MM 76 | 77 | }}} 78 | -------------------------------------------------------------------------------- /src/MM.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include "cuda/DeviceTensor.cuh" 6 | 7 | #include 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { 10 | 11 | template 12 | 13 | void transposeMM(facebook::cuda::DeviceTensor& A, 14 | facebook::cuda::DeviceTensor& B, 15 | facebook::cuda::DeviceTensor& C, 16 | float invNorm, 17 | cudaStream_t s = 0); 18 | 19 | }}} 20 | -------------------------------------------------------------------------------- /src/OneBitQuantization.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
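For reference, one CALL_TRANSPOSE_MM expansion in MM.cu above behaves like the explicit branch sketched here. The template arguments are an assumption reconstructed from the INSTANTIATE_TRANSPOSE_MM list, since the dump dropped every <...> parameter list:

```cpp
#include "THC.h"
#include "src/DeviceTensorUtils.h"
#include "src/MM.h"

void transposeMMBranchExample(THCState* state, THCudaTensor* tA,
                              THCudaTensor* tB, THCudaTensor* tC,
                              float invNorm) {
  // Guard on the runtime flags, then dispatch to one static instantiation;
  // this branch corresponds to CALL_TRANSPOSE_MM(5, true, false, true).
  if (THCudaTensor_nDimension(state, tA) == 5) {
    auto A = facebook::deeplearning::torch::torchToDeviceTensor<float, 5>(
        state, tA);
    auto B = facebook::deeplearning::torch::torchToDeviceTensor<float, 5>(
        state, tB);
    auto C = facebook::deeplearning::torch::torchToDeviceTensor<float, 5>(
        state, tC);
    // conjugateTransposeA = true, conjugateTransposeB = false, accumulate = true
    facebook::deeplearning::torch::transposeMM<5, true, false, true>(
        A, B, C, invNorm, THCState_getCurrentStream(state));
  }
}
```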
2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | void 8 | runQuantize1Bit(cudaStream_t stream, 9 | const cuda::DeviceTensor& in, 10 | cuda::DeviceTensor& out, 11 | cuda::DeviceTensor& quantizationError, 12 | cuda::DeviceTensor& avgPos, 13 | cuda::DeviceTensor& avgNeg); 14 | 15 | void 16 | runDequantize1Bit(cudaStream_t stream, 17 | const cuda::DeviceTensor& in, 18 | const cuda::DeviceTensor& avgPos, 19 | const cuda::DeviceTensor& avgNeg, 20 | cuda::DeviceTensor& out); 21 | 22 | } } } 23 | -------------------------------------------------------------------------------- /src/OneBitQuantizationHost.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include "src/DeviceTensorUtils.h" 5 | #include "src/Utils.h" 6 | #include "THC.h" 7 | #include "THCTensor.h" 8 | #include "src/OneBitQuantization.cuh" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace facebook::cuda; 17 | 18 | namespace facebook { namespace deeplearning { namespace torch { 19 | 20 | namespace { 21 | 22 | constexpr int kNumBits = sizeof(unsigned) * 8; 23 | 24 | constexpr int toQuantizedSize(int size) { 25 | return (size + kNumBits - 1) / kNumBits; 26 | } 27 | 28 | int quantize(lua_State *L) { 29 | THCState* state = getCutorchState(L); 30 | auto nonQuantizedTH = (THCudaTensor*)luaT_checkudata( 31 | L, 2, "torch.CudaTensor"); 32 | auto quantizedTH = (THCudaTensor*)luaT_getfieldcheckudata( 33 | L, 1, "quantized", "torch.CudaTensor"); 34 | auto quantizationErrorTH = (THCudaTensor*)luaT_getfieldcheckudata( 35 | L, 1, "quantization_error", "torch.CudaTensor"); 36 | auto avgPosTH = (THCudaTensor*)luaT_getfieldcheckudata( 37 | L, 1, "avg_pos", "torch.CudaTensor"); 38 | auto avgNegTH = (THCudaTensor*)luaT_getfieldcheckudata( 39 | L, 1, "avg_neg", "torch.CudaTensor"); 40 | 41 | THAssert(THCudaTensor_checkGPU(state, 5, nonQuantizedTH, quantizedTH, 42 | quantizationErrorTH, avgPosTH, avgNegTH)); 43 | // The input should be two-dimensional 44 | luaL_argcheck(L, THCudaTensor_nDimension(state, nonQuantizedTH) == 2, 2, 45 | "non_quantized_input should be 2d"); 46 | 47 | const auto rows = THCudaTensor_size(state, nonQuantizedTH, 0); 48 | const auto cols = THCudaTensor_size(state, nonQuantizedTH, 1); 49 | 50 | // Make sure that the outputs are properly sized 51 | THCudaTensor_resize2d(state, quantizedTH, rows, toQuantizedSize(cols)); 52 | THCudaTensor_resize2d(state, quantizationErrorTH, rows, cols); 53 | THCudaTensor_resize1d(state, avgPosTH, rows); 54 | THCudaTensor_resize1d(state, avgNegTH, rows); 55 | 56 | DeviceTensor nonQuantized = 57 | torchToDeviceTensor(state, nonQuantizedTH); 58 | DeviceTensor quantized = 59 | torchToDeviceTensor(state, quantizedTH); 60 | DeviceTensor quantizationError = 61 | torchToDeviceTensor(state, quantizationErrorTH); 62 | DeviceTensor avgPos = 63 | torchToDeviceTensor(state, avgPosTH); 64 | DeviceTensor avgNeg = 65 | torchToDeviceTensor(state, avgNegTH); 66 | 67 | runQuantize1Bit(THCState_getCurrentStream(state), 68 | nonQuantized, quantized, quantizationError, avgPos, avgNeg); 69 | 70 | return 0; 71 | } 72 | 73 | int dequantize(lua_State *L) { 74 | THCState* state = getCutorchState(L); 75 | auto quantizedTH = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 76 | auto avgPosTH = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 77 | auto avgNegTH = 
(THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); 78 | auto nonQuantizedCols = luaL_checkint(L, 5); 79 | auto nonQuantizedTH = (THCudaTensor*)luaT_getfieldcheckudata( 80 | L, 1, "non_quantized", "torch.CudaTensor"); 81 | 82 | THAssert(THCudaTensor_checkGPU(state, 4, nonQuantizedTH, quantizedTH, 83 | avgPosTH, avgNegTH)); 84 | // The input should be two-dimensional 85 | luaL_argcheck(L, THCudaTensor_nDimension(state, quantizedTH) == 2, 2, 86 | "input should be 2d"); 87 | 88 | const auto rows = THCudaTensor_size(state, quantizedTH, 0); 89 | const auto quantizedCols = THCudaTensor_size(state, quantizedTH, 1); 90 | 91 | // The input should be within appropriate quantization sizes 92 | luaL_argcheck(L, quantizedCols == toQuantizedSize(nonQuantizedCols), 5, 93 | "num_orig_cols does not match quantized_input cols"); 94 | luaL_argcheck(L, THCudaTensor_size(state, avgPosTH, 0) == rows, 3, 95 | "avg_pos size doesn't match quantized_input rows"); 96 | luaL_argcheck(L, THCudaTensor_size(state, avgNegTH, 0) == rows, 4, 97 | "avg_neg size doesn't match quantized_input rows"); 98 | 99 | // Make sure that the outputs are properly sized 100 | THCudaTensor_resize2d(state, nonQuantizedTH, rows, nonQuantizedCols); 101 | 102 | DeviceTensor quantized = 103 | torchToDeviceTensor(state, quantizedTH); 104 | DeviceTensor avgPos = 105 | torchToDeviceTensor(state, avgPosTH); 106 | DeviceTensor avgNeg = 107 | torchToDeviceTensor(state, avgNegTH); 108 | DeviceTensor nonQuantized = 109 | torchToDeviceTensor(state, nonQuantizedTH); 110 | 111 | runDequantize1Bit(THCState_getCurrentStream(state), 112 | quantized, avgPos, avgNeg, nonQuantized); 113 | 114 | return 0; 115 | } 116 | 117 | const luaL_Reg functions [] = { 118 | {"OneBitQuantization_quantize", quantize}, 119 | {"OneBitQuantization_dequantize", dequantize}, 120 | {nullptr, nullptr} 121 | }; 122 | 123 | } // namespace 124 | 125 | void initOneBitQuantizationCuda(lua_State *L) { 126 | luaT_pushmetatable(L, "torch.CudaTensor"); 127 | luaT_registeratname(L, functions, "nn"); 128 | lua_pop(L,1); 129 | } 130 | 131 | }}} // namespaces 132 | -------------------------------------------------------------------------------- /src/SparseNLLCriterion.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Michael Mathieu (myrhev@fb.com) 4 | */ 5 | 6 | #include "cuda/CudaUtils.cuh" 7 | #include "cuda/WarpReductions.cuh" 8 | #include "cuda/util/CachedDeviceProperties.h" 9 | 10 | #include "SparseNLLCriterion.cuh" 11 | 12 | using namespace facebook::cuda; 13 | 14 | namespace facebook { namespace deeplearning { namespace torch { 15 | namespace detail { 16 | 17 | namespace { 18 | 19 | // only one block. 
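// (the criterion reduces to a single scalar through shared memory, so one
// block suffices)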
20 | // threadIdx.x splits K (ideally, is equal to K) 21 | // threadIdx.y splits batchSize 22 | __global__ void 23 | updateOutput(const DeviceTensor targetIdx, 24 | const DeviceTensor targetP, 25 | const DeviceTensor input, 26 | DeviceTensor output, 27 | const int batchSize, 28 | const int K) { 29 | extern __shared__ float buffer[]; 30 | 31 | // map (sum the correct input multiplied by the probabilities) 32 | float local_sum = 0.f; 33 | for (int i = threadIdx.y; i < batchSize; i += blockDim.y) { 34 | for (int j = threadIdx.x; j < K; j += blockDim.x) { 35 | local_sum += input[i][(int)(targetIdx[i][j] - 1)] * targetP[i][j]; 36 | } 37 | } 38 | 39 | // reduce (sum all) 40 | local_sum = cuda::warpReduceSum(local_sum); 41 | if (cuda::getLaneId() == 0) 42 | buffer[cuda::getWarpId()] = local_sum; 43 | __syncthreads(); 44 | if ((threadIdx.x == 0) && (threadIdx.y == 0)) { 45 | local_sum = 0.f; 46 | for (int i = 0; i < cuda::ceil(blockDim.x * blockDim.y, 32u); ++i) { 47 | local_sum += buffer[i]; 48 | } 49 | output[0] = -local_sum; 50 | } 51 | } 52 | 53 | // blockIdx.x * threadIdx.y splits batchSize 54 | // threadIdx.x splits K (ideally is equal to K) 55 | __global__ void 56 | updateGradInput(const DeviceTensor targetIdx, 57 | const DeviceTensor targetP, 58 | DeviceTensor gradInput, 59 | int batchSize, int K) { 60 | const int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; 61 | const int batch_dim = gridDim.x * blockDim.y; 62 | for (int i = batch_idx; i < batchSize; i += batch_dim) { 63 | for (int j = threadIdx.x; j < K; j += blockDim.x) { 64 | gradInput[i][(int)(targetIdx[i][j] - 0.5)] = - targetP[i][j]; 65 | } 66 | } 67 | } 68 | 69 | } // namespace 70 | 71 | void runSparseNLLCriterion_updateOutput( 72 | cudaStream_t stream, 73 | const DeviceTensor& targetIdx, 74 | const DeviceTensor& targetP, 75 | const DeviceTensor& input, 76 | DeviceTensor& output) { 77 | 78 | const cudaDeviceProp& deviceProperties = 79 | facebook::cuda::getCurrentDeviceProperties(); 80 | const int maxThreads = deviceProperties.maxThreadsPerBlock; 81 | 82 | const int batchSize = targetP.getSize(0); 83 | const int K = targetP.getSize(1); 84 | dim3 blocks(1, 1, 1); 85 | int threadsx = min(K, maxThreads); 86 | dim3 threads(threadsx, max(1, maxThreads/threadsx), 1); 87 | size_t sharedSize = cuda::ceil(threads.x * threads.y * sizeof(float), 88 | (size_t)deviceProperties.warpSize); 89 | updateOutput<<>>( 90 | targetIdx, targetP, input, output, batchSize, K); 91 | } 92 | 93 | void runSparseNLLCriterion_updateGradInput( 94 | cudaStream_t stream, 95 | const DeviceTensor& targetIdx, 96 | const DeviceTensor& targetP, 97 | DeviceTensor& gradInput) { 98 | 99 | const cudaDeviceProp& deviceProperties = 100 | facebook::cuda::getCurrentDeviceProperties(); 101 | 102 | const int batchSize = targetP.getSize(0); 103 | const int K = targetP.getSize(1); 104 | const int nClasses = gradInput.getSize(1); 105 | cudaMemsetAsync(gradInput.data(), 0, nClasses * batchSize * sizeof(float), stream); 106 | int threadsx = min(K, deviceProperties.maxThreadsPerBlock); 107 | int threadsy = (threadsx > 128) ? 
1 : (256 / threadsx); 108 | dim3 threads(threadsx, threadsy, 1); 109 | dim3 blocks(max(1, batchSize / threadsy), 1, 1); 110 | updateGradInput<<<blocks, threads, 0, stream>>>( 111 | targetIdx, targetP, gradInput, batchSize, K); 112 | } 113 | 114 | }}}} // namespaces 115 | -------------------------------------------------------------------------------- /src/SparseNLLCriterion.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | namespace detail { 7 | 8 | void runSparseNLLCriterion_updateOutput( 9 | cudaStream_t stream, 10 | const cuda::DeviceTensor<float, 2>& targetIdx, 11 | const cuda::DeviceTensor<float, 2>& targetP, 12 | const cuda::DeviceTensor<float, 2>& input, 13 | cuda::DeviceTensor<float, 1>& output); 14 | 15 | void runSparseNLLCriterion_updateGradInput( 16 | cudaStream_t stream, 17 | const cuda::DeviceTensor<float, 2>& targetIdx, 18 | const cuda::DeviceTensor<float, 2>& targetP, 19 | cuda::DeviceTensor<float, 2>& gradInput); 20 | 21 | }}}} 22 | -------------------------------------------------------------------------------- /src/SparseNLLCriterionHost.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include "src/Utils.h" 5 | #include "src/DeviceTensorUtils.h" 6 | #include "THC.h" 7 | #include "THCTensor.h" 8 | #include "src/SparseNLLCriterion.cuh" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace facebook::cuda; 17 | 18 | namespace facebook { namespace deeplearning { namespace torch { 19 | 20 | namespace { 21 | 22 | inline THCudaTensor* getFieldCudaTensor(lua_State* L, int arg, 23 | const char* name) { 24 | return static_cast<THCudaTensor*>(luaT_getfieldcheckudata( 25 | L, arg, name, "torch.CudaTensor")); 26 | } 27 | inline THCudaTensor* getCudaTensor(lua_State* L, int arg) { 28 | return static_cast<THCudaTensor*>(luaT_checkudata(L, arg, 29 | "torch.CudaTensor")); 30 | } 31 | 32 | int updateOutput(lua_State *L) { 33 | THCState* state = getCutorchState(L); 34 | auto output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", 35 | "torch.CudaTensor"); 36 | auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 37 | auto targetP = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 38 | auto targetIdx = (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); 39 | auto batchSize = targetP->size[0]; 40 | auto K = targetP->size[1]; 41 | 42 | THAssert(THCudaTensor_checkGPU(state, 4, input, output, targetP, targetIdx)); 43 | luaL_argcheck(L, (output->nDimension == 1) && (output->size[0] == 1), 44 | 1, "output has wrong dimension"); 45 | luaL_argcheck(L, (input->nDimension == 2) && (input->size[0] == batchSize) 46 | && (THCudaTensor_isContiguous(state, input)), 47 | 2, "input has wrong dimension"); 48 | luaL_argcheck(L, (targetP->nDimension == 2) 49 | && (THCudaTensor_isContiguous(state, targetP)), 50 | 3, "targetP has wrong dimension"); 51 | luaL_argcheck(L, (targetIdx->nDimension == 2) 52 | && (targetIdx->size[0] == batchSize) 53 | && (targetIdx->size[1] == K) 54 | && (THCudaTensor_isContiguous(state, targetIdx)), 55 | 4, "targetIdx has wrong dimension"); 56 | 57 | auto targetIdxDev = torchToDeviceTensor<float, 2>(state, targetIdx); 58 | auto targetPDev = torchToDeviceTensor<float, 2>(state, targetP); 59 | auto inputDev = torchToDeviceTensor<float, 2>(state, input); 60 | auto outputDev = torchToDeviceTensor<float, 1>(state, output); 61 | 62 | 
detail::runSparseNLLCriterion_updateOutput( 63 | THCState_getCurrentStream(state), 64 | targetIdxDev, targetPDev, 65 | inputDev, outputDev); 66 | 67 | return 0; 68 | } 69 | 70 | int updateGradInput(lua_State *L) { 71 | THCState* state = getCutorchState(L); 72 | auto gradInput = 73 | (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "gradInput", 74 | "torch.CudaTensor"); 75 | auto targetP = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 76 | auto targetIdx = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 77 | auto batchSize = targetP->size[0]; 78 | auto K = targetP->size[1]; 79 | 80 | THAssert(THCudaTensor_checkGPU(state, 3, gradInput, targetP, targetIdx)); 81 | luaL_argcheck(L, (gradInput->nDimension == 2) 82 | && (gradInput->size[0] == batchSize) 83 | && (THCudaTensor_isContiguous(state, gradInput)), 84 | 1, "gradInput has wrong dimension"); 85 | luaL_argcheck(L, (targetP->nDimension == 2) 86 | && (THCudaTensor_isContiguous(state, targetP)), 87 | 2, "targetP has wrong dimension"); 88 | luaL_argcheck(L, (targetIdx->nDimension == 2) 89 | && (targetIdx->size[0] == batchSize) 90 | && (targetIdx->size[1] == K) 91 | && (THCudaTensor_isContiguous(state, targetIdx)), 92 | 3, "targetIdx has wrong dimension"); 93 | 94 | auto targetIdxDev = torchToDeviceTensor(state, targetIdx); 95 | auto targetPDev = torchToDeviceTensor(state, targetP); 96 | auto gradInputDev = torchToDeviceTensor(state, gradInput); 97 | 98 | detail::runSparseNLLCriterion_updateGradInput( 99 | THCState_getCurrentStream(state), 100 | targetIdxDev, targetPDev, 101 | gradInputDev); 102 | 103 | return 0; 104 | } 105 | 106 | const luaL_Reg functions [] = { 107 | {"SparseNLLCriterion_updateOutput", updateOutput}, 108 | {"SparseNLLCriterion_updateGradInput", updateGradInput}, 109 | {nullptr, nullptr} 110 | }; 111 | 112 | } // namespace 113 | 114 | void initSparseNLLCriterionCuda(lua_State *L) { 115 | luaT_pushmetatable(L, "torch.CudaTensor"); 116 | luaT_registeratname(L, functions, "nn"); 117 | lua_pop(L, 1); 118 | } 119 | 120 | }}} // namespaces 121 | -------------------------------------------------------------------------------- /src/Storage.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "thpp/Storage.h" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | using namespace thpp; 8 | 9 | }}} // namespaces 10 | -------------------------------------------------------------------------------- /src/TemporalConvolutionTBC.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
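Reference semantics for the SparseNLLCriterion kernels above, written as a plain CPU loop; a sketch for exposition, not library code:

```cpp
// targetIdx holds 1-based class indices stored as floats; targetP holds the
// probability mass placed on each of the K sparse targets.
float sparseNLLForwardReference(const float* input,      // [batchSize][nClasses]
                                const float* targetIdx,  // [batchSize][K]
                                const float* targetP,    // [batchSize][K]
                                int batchSize, int nClasses, int K) {
  float sum = 0.f;
  for (int i = 0; i < batchSize; ++i) {
    for (int j = 0; j < K; ++j) {
      int cls = (int)(targetIdx[i * K + j] - 1);  // 1-based -> 0-based
      sum += input[i * nClasses + cls] * targetP[i * K + j];
    }
  }
  // The backward pass zero-fills gradInput and writes -targetP[i][j] at
  // [i][cls], matching updateGradInput above.
  return -sum;
}
```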
2 | 3 | #include "THCGeneral.h" 4 | #include "cuda/DeviceTensor.cuh" 5 | 6 | namespace facebook { 7 | namespace deeplearning { 8 | namespace torch { 9 | namespace detail { 10 | 11 | void runTemporalConvolutionTBC_updateOutput( 12 | THCState* state, 13 | const cuda::DeviceTensor& input, 14 | const cuda::DeviceTensor& output, 15 | const cuda::DeviceTensor& weight, 16 | const cuda::DeviceTensor& bias); 17 | 18 | void runTemporalConvolutionTBC_updateGradInput( 19 | THCState* state, 20 | const cuda::DeviceTensor& dInput, 21 | const cuda::DeviceTensor& dOutput, 22 | const cuda::DeviceTensor& weight); 23 | 24 | void runTemporalConvolutionTBC_accGradParameters( 25 | THCState* state, 26 | const cuda::DeviceTensor& input, 27 | const cuda::DeviceTensor& dOutput, 28 | const cuda::DeviceTensor& dWeight, 29 | const cuda::DeviceTensor& dBias, 30 | float scale); 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/TemporalConvolutionTBCHost.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Facebook 2 | 3 | #include "THC.h" 4 | #include "THCTensor.h" 5 | #include "cuda/DeviceTensor.cuh" 6 | #include "src/DeviceTensorUtils.h" 7 | #include "src/TemporalConvolutionTBC.cuh" 8 | #include "src/Utils.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace facebook::cuda; 17 | 18 | namespace facebook { 19 | namespace deeplearning { 20 | namespace torch { 21 | 22 | namespace { 23 | 24 | inline THCudaTensor* 25 | getFieldCudaTensor(lua_State* L, int arg, const char* name) { 26 | return static_cast<THCudaTensor*>( 27 | luaT_getfieldcheckudata(L, arg, name, "torch.CudaTensor")); 28 | } 29 | inline THCudaTensor* getCudaTensor(lua_State* L, int arg) { 30 | return static_cast<THCudaTensor*>( 31 | luaT_checkudata(L, arg, "torch.CudaTensor")); 32 | } 33 | 34 | int updateOutput(lua_State* L) { 35 | THCState* state = getCutorchState(L); 36 | auto output = (THCudaTensor*)luaT_getfieldcheckudata( 37 | L, 1, "output", "torch.CudaTensor"); 38 | auto weight = (THCudaTensor*)luaT_getfieldcheckudata( 39 | L, 1, "weight", "torch.CudaTensor"); 40 | auto bias = 41 | (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 42 | auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 43 | 44 | THAssert(THCudaTensor_checkGPU(state, 4, input, output, weight, bias)); 45 | 46 | auto inputDev = torchToDeviceTensor(state, input); 47 | auto outputDev = torchToDeviceTensor(state, output); 48 | auto weightDev = torchToDeviceTensor(state, weight); 49 | auto biasDev = torchToDeviceTensor(state, bias); 50 | 51 | detail::runTemporalConvolutionTBC_updateOutput( 52 | state, inputDev, outputDev, weightDev, biasDev); 53 | 54 | return 0; 55 | } 56 | 57 | int updateGradInput(lua_State* L) { 58 | THCState* state = getCutorchState(L); 59 | auto dInput = (THCudaTensor*)luaT_getfieldcheckudata( 60 | L, 1, "gradInput", "torch.CudaTensor"); 61 | auto weight = (THCudaTensor*)luaT_getfieldcheckudata( 62 | L, 1, "weight", "torch.CudaTensor"); 63 | auto dOutput = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 64 | 65 | THAssert(THCudaTensor_checkGPU(state, 3, dInput, dOutput, weight)); 66 | 67 | auto dInputDev = torchToDeviceTensor(state, dInput); 68 | auto dOutputDev = torchToDeviceTensor(state, dOutput); 69 | auto weightDev = torchToDeviceTensor(state, weight); 70 | 71 | detail::runTemporalConvolutionTBC_updateGradInput( 72 | state, dInputDev, dOutputDev, weightDev); 73 
| 74 | return 0; 75 | } 76 | 77 | int accGradParameters(lua_State* L) { 78 | THCState* state = getCutorchState(L); 79 | auto dWeight = (THCudaTensor*)luaT_getfieldcheckudata( 80 | L, 1, "gradWeight", "torch.CudaTensor"); 81 | auto dBias = (THCudaTensor*)luaT_getfieldcheckudata( 82 | L, 1, "gradBias", "torch.CudaTensor"); 83 | auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 84 | auto dOutput = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 85 | float scale = lua_tonumber(L, 4); 86 | 87 | THAssert(THCudaTensor_checkGPU(state, 4, input, dOutput, dWeight, dBias)); 88 | 89 | auto inputDev = torchToDeviceTensor(state, input); 90 | auto dOutputDev = torchToDeviceTensor(state, dOutput); 91 | auto dWeightDev = torchToDeviceTensor(state, dWeight); 92 | auto dBiasDev = torchToDeviceTensor(state, dBias); 93 | 94 | detail::runTemporalConvolutionTBC_accGradParameters( 95 | state, inputDev, dOutputDev, dWeightDev, dBiasDev, scale); 96 | 97 | return 0; 98 | } 99 | 100 | const luaL_Reg functions[] = { 101 | {"TemporalConvolutionTBC_updateOutput", updateOutput}, 102 | {"TemporalConvolutionTBC_updateGradInput", updateGradInput}, 103 | {"TemporalConvolutionTBC_accGradParameters", accGradParameters}, 104 | {nullptr, nullptr}}; 105 | 106 | } // namespace 107 | 108 | void initTemporalConvolutionTBCCuda(lua_State* L) { 109 | luaT_pushmetatable(L, "torch.CudaTensor"); 110 | luaT_registeratname(L, functions, "nn"); 111 | lua_pop(L, 1); 112 | } 113 | } 114 | } 115 | } // namespaces 116 | -------------------------------------------------------------------------------- /src/TemporalKMaxPooling.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include "cuda/TopKElements.cuh" 5 | #include "cuda/DeviceTensor.cuh" 6 | #include "cuda/util/CachedDeviceProperties.h" 7 | #include "THC.h" 8 | 9 | using namespace facebook::cuda; 10 | 11 | namespace facebook { namespace deeplearning { namespace torch { 12 | 13 | namespace { 14 | 15 | __device__ __forceinline__ 16 | int getUpdateOutputBatch(const DeviceTensor& input) { 17 | return blockIdx.y; 18 | } 19 | 20 | __device__ __forceinline__ 21 | int getUpdateOutputFeature(const DeviceTensor& input) { 22 | return blockIdx.x * blockDim.y + threadIdx.y; 23 | } 24 | 25 | // input: [batch][frame][embedding] 26 | // output: [batch][K frames s.t. 
(f)(embedding) is the highest][embedding] 27 | // ordered in original [frame] order 28 | 29 | __global__ void 30 | temporalKMaxPoolingUpdateOutput(DeviceTensor input, 31 | DeviceTensor indices, 32 | DeviceTensor output, 33 | int k) { 34 | const int batch = getUpdateOutputBatch(input); 35 | const int feature = getUpdateOutputFeature(input); 36 | 37 | if (feature >= input.getSize(2)) { 38 | return; 39 | } 40 | 41 | DeviceTensor input1d(&input[batch][0][feature], 42 | (const int[1]){ input.getSize(1) }, 43 | (const int[1]){ input.getSize(2) }); 44 | DeviceTensor output1d(&output[batch][0][feature], 45 | (const int[1]){ k }, 46 | (const int[1]){ output.getSize(2) }); 47 | DeviceTensor indices1d(&indices[batch][0][feature], 48 | (const int[1]){ k }, 49 | (const int[1]){ indices.getSize(2) }); 50 | 51 | warpFindTopKElementsIndexOrder(input1d, output1d, indices1d, k); 52 | } 53 | 54 | __device__ __forceinline__ 55 | int getUpdateGradInputBatch() { 56 | return blockIdx.x; 57 | } 58 | 59 | __device__ __forceinline__ 60 | int getUpdateGradInputOutputFrame() { 61 | return blockIdx.y; 62 | } 63 | 64 | __global__ void 65 | temporalKMaxPoolingUpdateGradInput(DeviceTensor gradOutput, 66 | DeviceTensor indices, 67 | DeviceTensor gradInput, 68 | int k) { 69 | const int batch = getUpdateGradInputBatch(); 70 | const int outputFrame = getUpdateGradInputOutputFrame(); 71 | 72 | for (int feature = threadIdx.x; 73 | feature < gradInput.getSize(2); 74 | feature += blockDim.x) { 75 | int index = (int) indices[batch][outputFrame][feature]; 76 | 77 | atomicAdd(&gradInput[batch][index][feature], 78 | gradOutput[batch][outputFrame][feature]); 79 | } 80 | } 81 | 82 | } 83 | 84 | void 85 | runTemporalKMaxPoolingUpdateOutput(cudaStream_t stream, 86 | const DeviceTensor& input, 87 | const DeviceTensor& indices, 88 | DeviceTensor& output, 89 | int k) { 90 | const cudaDeviceProp& deviceProperties = 91 | facebook::cuda::getCurrentDeviceProperties(); 92 | 93 | // We aim to run with 4 warps. 94 | const int numWarps = std::min(input.getSize(2), 4); 95 | 96 | dim3 block(deviceProperties.warpSize, numWarps); 97 | dim3 grid(cuda::ceil(input.getSize(2), numWarps), input.getSize(0)); 98 | 99 | temporalKMaxPoolingUpdateOutput<<>>( 100 | input, indices, output, k); 101 | } 102 | 103 | void 104 | runTemporalKMaxPoolingUpdateGradInput(cudaStream_t stream, 105 | const DeviceTensor& gradOutput, 106 | const DeviceTensor& indices, 107 | DeviceTensor& gradInput, 108 | int k) { 109 | const cudaDeviceProp& deviceProperties = 110 | facebook::cuda::getCurrentDeviceProperties(); 111 | 112 | // We aim to run with 4 warps. 113 | const int numThreads = 114 | std::min(gradOutput.getSize(2), deviceProperties.warpSize * 4); 115 | 116 | dim3 block(numThreads); 117 | dim3 grid(gradOutput.getSize(0), 118 | gradOutput.getSize(1)); 119 | 120 | temporalKMaxPoolingUpdateGradInput<<>>( 121 | gradOutput, indices, gradInput, k); 122 | } 123 | 124 | } } } 125 | -------------------------------------------------------------------------------- /src/TemporalKMaxPooling.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
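Reference semantics for one (batch, feature) column of the k-max pooling kernels above, as a CPU sketch (assumes k <= frames):

```cpp
#include <algorithm>
#include <vector>

// Keep the k largest frames of one column, emitted in their original temporal
// order, and record the 0-based frame indices for the backward pass.
void kMaxPoolColumnReference(const float* in, int frames,
                             float* out, float* indices, int k) {
  std::vector<int> order(frames);
  for (int f = 0; f < frames; ++f) {
    order[f] = f;
  }
  // Select the k frames holding the largest values...
  std::partial_sort(order.begin(), order.begin() + k, order.end(),
                    [&](int a, int b) { return in[a] > in[b]; });
  // ...then restore temporal order, as warpFindTopKElementsIndexOrder does.
  std::sort(order.begin(), order.begin() + k);
  for (int j = 0; j < k; ++j) {
    out[j] = in[order[j]];
    indices[j] = (float)order[j];
  }
}
```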
2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | void 8 | runTemporalKMaxPoolingUpdateOutput( 9 | cudaStream_t stream, 10 | const cuda::DeviceTensor& input, 11 | const cuda::DeviceTensor& indices, 12 | cuda::DeviceTensor& output, 13 | int k); 14 | 15 | void 16 | runTemporalKMaxPoolingUpdateGradInput( 17 | cudaStream_t stream, 18 | const cuda::DeviceTensor& gradOutput, 19 | const cuda::DeviceTensor& indices, 20 | cuda::DeviceTensor& gradInput, 21 | int k); 22 | 23 | } } } 24 | -------------------------------------------------------------------------------- /src/Tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "thpp/Storage.h" 4 | #include "thpp/Tensor.h" 5 | 6 | namespace facebook { namespace deeplearning { namespace torch { 7 | 8 | using namespace thpp; 9 | 10 | }}} // namespaces 11 | -------------------------------------------------------------------------------- /src/Utils.cpp: -------------------------------------------------------------------------------- 1 | #include "Utils.h" 2 | 3 | namespace facebook { namespace deeplearning { namespace torch { 4 | 5 | THCState* getCutorchState(lua_State* L) { 6 | // Unfortunately cutorch lua headers aren't exported, so we have to 7 | // copy this. This is a copy from cunn. 8 | lua_getglobal(L, "cutorch"); 9 | lua_getfield(L, -1, "getState"); 10 | lua_call(L, 0, 1); 11 | THCState *state = (THCState*) lua_touserdata(L, -1); 12 | lua_pop(L, 2); 13 | return state; 14 | } 15 | 16 | } } } 17 | -------------------------------------------------------------------------------- /src/Utils.h: -------------------------------------------------------------------------------- 1 | #ifndef FBCUNN_UTILS_H 2 | #define FBCUNN_UTILS_H 3 | 4 | #include 5 | #include "THCGeneral.h" 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | 9 | THCState* getCutorchState(lua_State* L); 10 | 11 | } } } 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/WeightedLookupTable.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Facebook 3 | */ 4 | 5 | #include "cuda/CudaUtils.cuh" 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "cuda/WarpReductions.cuh" 8 | 9 | using namespace facebook::cuda; 10 | 11 | namespace facebook { namespace deeplearning { namespace torch { 12 | namespace detail { 13 | 14 | namespace { 15 | 16 | __global__ void scaleByWeight(DeviceTensor output, 17 | DeviceTensor input, 18 | DeviceTensor weights) { 19 | // Values computed per thread 20 | const int VT = 4; 21 | 22 | // Each block computes a 4x128 section of the output, with each 23 | // warp handling a 1x128 section. 
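// (blockDim is (32, 4): 32 lanes times VT = 4 columns per thread cover 128
// columns, and the four threadIdx.y warps supply the four rows)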
24 | 25 | int rowIdx = blockIdx.x * blockDim.y + threadIdx.y; 26 | if (rowIdx < weights.getSize(0)) { 27 | float weight = weights[rowIdx]; 28 | 29 | #pragma unroll 30 | for (int i = 0; i < VT; i++) { 31 | int colIdx = blockDim.x * (VT * blockIdx.y + i) + threadIdx.x; 32 | if (colIdx < input.getSize(1)) { 33 | output[rowIdx][colIdx] = input[rowIdx][colIdx] * weight; 34 | } 35 | } 36 | } 37 | } 38 | 39 | } 40 | 41 | void launchWeightedLookupTableScaleByWeightKernel(cudaStream_t stream, 42 | DeviceTensor& output, 43 | DeviceTensor& input, 44 | DeviceTensor& weight) { 45 | dim3 grid(cuda::ceil(output.getSize(0), 4), cuda::ceil(output.getSize(1), 128)); 46 | dim3 block(32, 4); 47 | 48 | scaleByWeight<<>>(output, input, weight); 49 | } 50 | 51 | }}}} 52 | -------------------------------------------------------------------------------- /src/WeightedLookupTableHost.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Facebook 3 | */ 4 | 5 | #include "cuda/DeviceTensor.cuh" 6 | #include "src/Utils.h" 7 | #include "src/DeviceTensorUtils.h" 8 | #include "THC.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace facebook::cuda; 15 | 16 | namespace facebook { namespace deeplearning { namespace torch { 17 | 18 | namespace detail { 19 | void launchWeightedLookupTableScaleByWeightKernel( 20 | cudaStream_t stream, 21 | DeviceTensor& output, 22 | DeviceTensor& input, 23 | DeviceTensor& weight); 24 | } 25 | 26 | namespace { 27 | 28 | int scaleByWeight(lua_State* L) { 29 | THCState* state = getCutorchState(L); 30 | auto output = (THCudaTensor*)luaT_checkudata(L, 1, "torch.CudaTensor"); 31 | const auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 32 | const auto weight = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 33 | 34 | DeviceTensor cudaOutput = torchToDeviceTensor(state, output); 35 | DeviceTensor cudaInput = torchToDeviceTensor(state, input); 36 | DeviceTensor cudaWeight = torchToDeviceTensor(state, weight); 37 | 38 | detail::launchWeightedLookupTableScaleByWeightKernel( 39 | THCState_getCurrentStream(state), 40 | cudaOutput, cudaInput, cudaWeight); 41 | 42 | return 0; 43 | } 44 | 45 | const luaL_Reg functions[] = { 46 | {"WeightedLookupTable_scaleByWeight", scaleByWeight}, 47 | {nullptr, nullptr}, 48 | }; 49 | 50 | } // namespace 51 | 52 | void initWeightedLookupTableCuda(lua_State* L) { 53 | luaT_pushmetatable(L, "torch.CudaTensor"); 54 | luaT_registeratname(L, functions, "nn"); 55 | lua_pop(L, 1); 56 | } 57 | 58 | }}} // namespaces 59 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_AccGradParameters.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
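Reference semantics for the scaleByWeight kernel above, as a CPU sketch: every output row is the input row scaled by that row's weight.

```cpp
void scaleByWeightReference(float* output, const float* input,
                            const float* weights, int rows, int cols) {
  // The kernel tiles this loop nest as one 4-row by 128-column tile per CUDA
  // block; the arithmetic per element is identical.
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      output[r * cols + c] = input[r * cols + c] * weights[r];
    }
  }
}
```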
2 | 3 | #pragma once 4 | 5 | struct THCudaTensor; 6 | struct THCState; 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | void CuFFTConvolution_ReferenceAccGradParameters(THCState* state, 11 | THCudaTensor* inputTH, 12 | THCudaTensor* kernelsTH, 13 | THCudaTensor* outputTH, 14 | THCudaTensor* gradBiasTH, 15 | float scale, 16 | THCudaTensor* inputComplexTH, 17 | THCudaTensor* kernelsComplexTH, 18 | THCudaTensor* outputComplexTH); 19 | 20 | void CuFFTConvolution_AccGradParameters(THCState* state, 21 | THCudaTensor* inputTH, 22 | THCudaTensor* kernelsTH, 23 | THCudaTensor* outputTH, 24 | THCudaTensor* gradBiasTH, 25 | float scale, 26 | THCudaTensor* inputComplexTH, 27 | THCudaTensor* kernelsComplexTH, 28 | THCudaTensor* outputComplexTH, 29 | THCudaTensor* inputComplexTTH, 30 | THCudaTensor* kernelsComplexTTH, 31 | THCudaTensor* outputComplexTTH); 32 | 33 | class CuFFTConvolution; 34 | 35 | 36 | // This version can be preconfigured with cublasHandle, cufftHandle and 37 | // cudaStreams. Use this one for performance and to reuse resources. 38 | void CuFFTConvolution_AccGradParameters(THCState* state, 39 | CuFFTConvolution* conv, 40 | THCudaTensor* gradOutputTH, 41 | THCudaTensor* gradBiasTH, 42 | float scale); 43 | } } } // namespace 44 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateGradInput.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" 4 | 5 | #include "cuda/CudaUtils.cuh" 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "src/DeviceTensorUtils.h" 8 | #include "THCTensor.h" 9 | #include "src/CuBLASWrapper.h" 10 | #include "src/fft/CuFFTWrapper.cuh" 11 | #include "src/fft/CuFFTConvolution.cuh" 12 | #include "src/fft/Utils.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace facebook::cuda; 20 | 21 | namespace facebook { namespace deeplearning { namespace torch { 22 | 23 | // Assumes complex is float[2] 24 | __global__ void referenceUpdateGradInput(DeviceTensor<float, 5> inputComplex, 25 | DeviceTensor<float, 5> weightComplex, 26 | DeviceTensor<float, 5> outputComplex) 27 | { 28 | // The input was originally real, so we have circular Hermitian symmetry: 29 | // X[k] = X*[-k mod N].
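// In the frequency domain, the gradient w.r.t. the input is a pointwise
// complex multiply-accumulate: for each (batch, inputPlane, row, col),
// gradInput = sum over filters of weight * gradOutput, which is what the
// cuCfmaf loop below computes (reference implementation, single thread).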
30 | const int Batches = inputComplex.getSize(0); 31 | const int Weight = weightComplex.getSize(0); 32 | const int InputRows = inputComplex.getSize(2); 33 | const int InputCols = inputComplex.getSize(3); 34 | for (int batch = 0; batch < Batches; ++batch) { 35 | for (int filter = 0; filter < Weight; ++filter) { 36 | for (int inputRow = 0; inputRow < InputRows; ++inputRow) { 37 | for (int inputCol = 0; inputCol < InputCols; ++inputCol) { 38 | for (int inputPlane = 0; inputPlane < inputComplex.getSize(1); 39 | ++inputPlane) { 40 | cuFloatComplex* inp = inputComplex[batch][inputPlane] 41 | [inputRow][inputCol].dataAs<cuFloatComplex>(); 42 | if (filter == 0) { 43 | inp->x = 0.0f; 44 | inp->y = 0.0f; 45 | } 46 | 47 | cuFloatComplex weight = weightComplex[filter][inputPlane] 48 | [inputRow][inputCol].ldgAs<cuFloatComplex>(); 49 | 50 | cuFloatComplex output = outputComplex[batch][filter][inputRow] 51 | [inputCol].ldgAs<cuFloatComplex>(); 52 | 53 | *inp = cuCfmaf(weight, output, *inp); 54 | } 55 | } 56 | } 57 | } 58 | } 59 | } 60 | 61 | void CuFFTConvolution_ReferenceUpdateGradInput(THCState* state, 62 | THCudaTensor* inputTH, 63 | THCudaTensor* weightTH, 64 | THCudaTensor* outputTH, 65 | THCudaTensor* inputComplexTH, 66 | THCudaTensor* weightComplexTH, 67 | THCudaTensor* outputComplexTH) { 68 | DeviceTensor<float, 4> weight = 69 | torchToDeviceTensor<float, 4>(state, weightTH); 70 | DeviceTensor<float, 4> input = 71 | torchToDeviceTensor<float, 4>(state, inputTH); 72 | DeviceTensor<float, 4> output = 73 | torchToDeviceTensor<float, 4>(state, outputTH); 74 | 75 | DeviceTensor<float, 5> inputComplex = 76 | torchToDeviceTensor<float, 5>(state, inputComplexTH); 77 | DeviceTensor<float, 5> outputComplex = 78 | torchToDeviceTensor<float, 5>(state, outputComplexTH); 79 | DeviceTensor<float, 5> weightComplex = 80 | torchToDeviceTensor<float, 5>(state, weightComplexTH); 81 | 82 | fft2d<2>(weight, weightComplex); 83 | fft2d<2>(output, outputComplex); 84 | 85 | dim3 grid(1); 86 | dim3 block(1); 87 | referenceUpdateGradInput<<<grid, block>>>( 88 | inputComplex, weightComplex, outputComplex); 89 | 90 | fft2d<2>(input, inputComplex, FFTParameters().inverse()); 91 | } 92 | 93 | void CuFFTConvolution_UpdateGradInput(THCState* state, 94 | THCudaTensor* inputTH, 95 | THCudaTensor* weightTH, 96 | THCudaTensor* outputTH, 97 | THCudaTensor* inputComplexTH, 98 | THCudaTensor* weightComplexTH, 99 | THCudaTensor* outputComplexTH, 100 | THCudaTensor* inputComplexTTH, 101 | THCudaTensor* weightComplexTTH, 102 | THCudaTensor* outputComplexTTH) { 103 | CuFFTConvolution conv((ConvolutionPass(ConvolutionPass::kUpdateGradInput))); 104 | conv.withInputAndBuffers(state, inputTH, inputComplexTH, inputComplexTTH) 105 | .withFiltersAndBuffers(state, weightTH, weightComplexTH, weightComplexTTH) 106 | .withOutputAndBuffers(state, outputTH, outputComplexTH, outputComplexTTH) 107 | .run(); 108 | } 109 | 110 | void CuFFTConvolution_UpdateGradInput(CuFFTConvolution* conv) { 111 | conv->run(); 112 | } 113 | 114 | } } } // namespace 115 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateGradInput.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved.
2 | 3 | #pragma once 4 | 5 | struct THCudaTensor; 6 | struct THCState; 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | void CuFFTConvolution_ReferenceUpdateGradInput(THCState* state, 11 | THCudaTensor* inputTH, 12 | THCudaTensor* kernelsTH, 13 | THCudaTensor* outputTH, 14 | THCudaTensor* inputComplexTH, 15 | THCudaTensor* kernelsComplexTH, 16 | THCudaTensor* outputComplexTH); 17 | 18 | // CuFFTConvolution calls require 2 sets of buffers for each 19 | // input / kernels / output tensor. 20 | // - The first set is used to perform FFTs. 21 | // - The second set is used to hold the transpose of the FFTs for the 22 | // subsequent gemm calls. 23 | // The first set must always be supplied; the second will be constructed if 24 | // passed NULL. 25 | void CuFFTConvolution_UpdateGradInput(THCState* state, 26 | THCudaTensor* inputTH, 27 | THCudaTensor* kernelsTH, 28 | THCudaTensor* outputTH, 29 | THCudaTensor* inputComplexTH, 30 | THCudaTensor* kernelsComplexTH, 31 | THCudaTensor* outputComplexTH, 32 | THCudaTensor* inputComplexTTH, 33 | THCudaTensor* kernelsComplexTTH, 34 | THCudaTensor* outputComplexTTH); 35 | 36 | class CuFFTConvolution; 37 | 38 | // This version can be preconfigured with cublasHandle, cufftHandle and 39 | // cudaStreams. Use this one for performance and to reuse resources. 40 | void CuFFTConvolution_UpdateGradInput(CuFFTConvolution* conv); 41 | 42 | } } } // namespace 43 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateOutput.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "src/fft/CuFFTConvolution_UpdateOutput.cuh" 4 | 5 | #include "cuda/CudaUtils.cuh" 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "src/DeviceTensorUtils.h" 8 | #include "THCTensor.h" 9 | #include "src/ConvolutionBias.cuh" 10 | #include "src/CuBLASWrapper.h" 11 | #include "src/fft/CuFFTWrapper.cuh" 12 | #include "src/fft/CuFFTConvolution.cuh" 13 | #include "src/fft/Utils.cuh" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace facebook::cuda; 21 | 22 | namespace facebook { namespace deeplearning { namespace torch { 23 | 24 | // Assumes complex is float[2] 25 | __global__ void referenceUpdateOutput(DeviceTensor<float, 5> inputComplex, 26 | DeviceTensor<float, 5> filtersComplex, 27 | DeviceTensor<float, 5> outputComplex) 28 | { 29 | // The input was originally real, so we have circular Hermitian symmetry: 30 | // X[k] = X*[-k mod N].
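// Pointwise in the frequency domain: for each (batch, filter, row, col),
// output = sum over input planes of input * conj(filter). The conjugation
// (cuConjf below) makes this a cross-correlation, which is what the
// convolution layers actually compute.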
31 | const int Batches = inputComplex.getSize(0); 32 | const int Filters = filtersComplex.getSize(0); 33 | const int OutputRows = outputComplex.getSize(2); 34 | const int OutputCols = outputComplex.getSize(3); 35 | for (int batch = 0; batch < Batches; ++batch) { 36 | for (int filter = 0; filter < Filters; ++filter) { 37 | for (int outputRow = 0; outputRow < OutputRows; ++outputRow) { 38 | for (int outputCol = 0; outputCol < OutputCols; ++outputCol) { 39 | cuFloatComplex* out = outputComplex[batch][filter] 40 | [outputRow][outputCol].dataAs<cuFloatComplex>(); 41 | out->x = 0.0f; 42 | out->y = 0.0f; 43 | for (int inputPlane = 0; inputPlane < inputComplex.getSize(1); 44 | ++inputPlane) { 45 | cuFloatComplex input = 46 | inputComplex[batch][inputPlane] 47 | [outputRow][outputCol].ldgAs<cuFloatComplex>(); 48 | 49 | cuFloatComplex filters = 50 | cuConjf(filtersComplex[filter][inputPlane] 51 | [outputRow][outputCol].ldgAs<cuFloatComplex>()); 52 | 53 | *out = cuCfmaf(input, filters, *out); 54 | } 55 | } 56 | } 57 | } 58 | } 59 | } 60 | 61 | void CuFFTConvolution_ReferenceUpdateOutput(THCState* state, 62 | THCudaTensor* inputTH, 63 | THCudaTensor* kernelsTH, 64 | THCudaTensor* outputTH, 65 | THCudaTensor* biasTH, 66 | THCudaTensor* inputComplexTH, 67 | THCudaTensor* kernelsComplexTH, 68 | THCudaTensor* outputComplexTH) { 69 | DeviceTensor<float, 4> filters = 70 | torchToDeviceTensor<float, 4>(state, kernelsTH); 71 | DeviceTensor<float, 4> input = 72 | torchToDeviceTensor<float, 4>(state, inputTH); 73 | DeviceTensor<float, 4> output = 74 | torchToDeviceTensor<float, 4>(state, outputTH); 75 | 76 | DeviceTensor<float, 5> inputComplex = 77 | torchToDeviceTensor<float, 5>(state, inputComplexTH); 78 | DeviceTensor<float, 5> outputComplex = 79 | torchToDeviceTensor<float, 5>(state, outputComplexTH); 80 | DeviceTensor<float, 5> filtersComplex = 81 | torchToDeviceTensor<float, 5>(state, kernelsComplexTH); 82 | 83 | fft2d<2>(input, inputComplex); 84 | fft2d<2>(filters, filtersComplex); 85 | 86 | dim3 grid(1); 87 | dim3 block(1); 88 | referenceUpdateOutput<<<grid, block>>>( 89 | inputComplex, filtersComplex, outputComplex); 90 | 91 | fft2d<2>(output, outputComplex, FFTParameters().inverse()); 92 | 93 | bias::updateOutputBias(state, outputTH, biasTH); 94 | } 95 | 96 | void CuFFTConvolution_UpdateOutput(THCState* state, 97 | THCudaTensor* inputTH, 98 | THCudaTensor* kernelsTH, 99 | THCudaTensor* outputTH, 100 | THCudaTensor* biasTH, 101 | THCudaTensor* inputComplexTH, 102 | THCudaTensor* kernelsComplexTH, 103 | THCudaTensor* outputComplexTH, 104 | THCudaTensor* inputComplexTTH, 105 | THCudaTensor* kernelsComplexTTH, 106 | THCudaTensor* outputComplexTTH) { 107 | CuFFTConvolution conv((ConvolutionPass(ConvolutionPass::kUpdateOutput))); 108 | conv.withInputAndBuffers(state, 109 | inputTH, inputComplexTH, inputComplexTTH) 110 | .withFiltersAndBuffers(state, 111 | kernelsTH, kernelsComplexTH, kernelsComplexTTH) 112 | .withOutputAndBuffers(state, 113 | outputTH, outputComplexTH, outputComplexTTH) 114 | .run(); 115 | 116 | bias::updateOutputBias(state, outputTH, biasTH); 117 | } 118 | 119 | void CuFFTConvolution_UpdateOutput(THCState* state, 120 | CuFFTConvolution* conv, 121 | THCudaTensor* outputTH, 122 | THCudaTensor* biasTH) { 123 | conv->run(); 124 | 125 | bias::updateOutputBias(state, outputTH, biasTH); 126 | } 127 | 128 | } } } // namespace 129 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateOutput.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved.
2 | 3 | #pragma once 4 | 5 | struct THCudaTensor; 6 | struct THCState; 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | void CuFFTConvolution_ReferenceUpdateOutput(THCState* state, 11 | THCudaTensor* inputTH, 12 | THCudaTensor* kernelsTH, 13 | THCudaTensor* outputTH, 14 | THCudaTensor* biasTH, 15 | THCudaTensor* inputComplexTH, 16 | THCudaTensor* kernelsComplexTH, 17 | THCudaTensor* outputComplexTH); 18 | 19 | // CuFFTConvolution calls require 2 sets of buffers for each 20 | // input / kernels / output tensor. 21 | // - The first set is used to perform FFTs 22 | // - The second set is used to hold the transpose of the FFTs for the 23 | // subsequent gemm calls. 24 | // The first set must always be supplied, the second will be constructed if 25 | // passed NULL. 26 | void CuFFTConvolution_UpdateOutput(THCState* state, 27 | THCudaTensor* inputTH, 28 | THCudaTensor* kernelsTH, 29 | THCudaTensor* outputTH, 30 | THCudaTensor* biasTH, 31 | THCudaTensor* inputComplexTH, 32 | THCudaTensor* kernelsComplexTH, 33 | THCudaTensor* outputComplexTH, 34 | THCudaTensor* inputComplexTTH, 35 | THCudaTensor* kernelsComplexTTH, 36 | THCudaTensor* outputComplexTTH); 37 | 38 | class CuFFTConvolution; 39 | 40 | // This version can be preconfigured with cublasHandle, cufftHandle and 41 | // cudaStreams. Use this one for performance and reuse resources. 42 | void CuFFTConvolution_UpdateOutput(THCState* state, 43 | CuFFTConvolution* conv, 44 | THCudaTensor* outputTH, 45 | THCudaTensor* biasTH); 46 | 47 | } } } // namespace 48 | -------------------------------------------------------------------------------- /src/fft/CuFFTWrapper.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "cuda/fbfft/FBFFT.cuh" 6 | #include "src/fft/Utils.cuh" 7 | 8 | #include 9 | 10 | namespace facebook { namespace deeplearning { namespace torch { 11 | 12 | // Can add layout stuff later if needed 13 | class FFTParameters { 14 | public: 15 | // Default is forward, normalized FFT. 16 | // Normalization occurs only in inverse FFT (by 1 / (M.N)) since CuFFT does 17 | // unnormalized FFTs by default 18 | FFTParameters() : 19 | version(cufft), direction_(true), normalize_(true), padLeft_(0), padUp_(0) 20 | {} 21 | 22 | operator facebook::cuda::fbfft::FBFFTParameters() const { 23 | facebook::cuda::fbfft::FBFFTParameters res; 24 | res = res.normalize(normalize_).withPadLeft(padLeft_).withPadUp(padUp_); 25 | return (direction_) ? 
res.forward() : res.inverse(); 26 | } 27 | 28 | FFTParameters& withCufft() { 29 | version = cufft; 30 | return *this; 31 | } 32 | 33 | FFTParameters& withFbfft() { 34 | version = fbfft; 35 | return *this; 36 | } 37 | 38 | FFTParameters& forward() { 39 | direction_ = true; 40 | return *this; 41 | } 42 | 43 | FFTParameters& inverse() { 44 | direction_ = false; 45 | return *this; 46 | } 47 | 48 | FFTParameters& normalize(bool n) { 49 | normalize_ = n; 50 | return *this; 51 | } 52 | 53 | FFTParameters& withPadLeft(int p) { 54 | padLeft_ = p; 55 | return *this; 56 | } 57 | 58 | FFTParameters& withPadUp(int p) { 59 | padUp_ = p; 60 | return *this; 61 | } 62 | 63 | bool forwardFFT() const { return direction_; } 64 | bool inverseFFT() const { return !direction_; } 65 | bool normalizeFFT() const { return normalize_; } 66 | bool cuFFT() const { return version == cufft; } 67 | bool fbFFT() const { return version == fbfft; } 68 | int padLeft() const { return padLeft_; } 69 | int padUp() const { return padUp_; } 70 | 71 | template 72 | std::vector makeComplexTensorSizes( 73 | long batch, long plane, long y, long x) { 74 | // Until fbfft supports rectangular ffts just assert it does not 75 | assert(cuFFT() || y == x); 76 | std::vector result(4); 77 | result[0] = batch; 78 | result[1] = plane; 79 | result[2] = (fbFFT() && Hermitian) ? numHermitian(y) : y; 80 | result[3] = (cuFFT() && Hermitian) ? numHermitian(x) : x; 81 | return result; 82 | } 83 | 84 | // Replaces cufft plans in the case of fbfft, only needed for sizes > 32. 85 | // For <= 32 we do everything in place. 86 | std::vector makeTmpBufferSizes( 87 | long batch, long plane, long y, long x) { 88 | assert(fbFFT()); 89 | // Until fbfft supports rectangular ffts just assert it does not 90 | assert(y == x); 91 | if (y <= 32) { 92 | std::vector result; 93 | return result; 94 | } 95 | std::vector result(4); 96 | result[0] = batch; 97 | result[1] = plane; 98 | if (forwardFFT()) { 99 | result[2] = numHermitian(y); 100 | } else { 101 | result[2] = y; 102 | } 103 | result[3] = x; 104 | return result; 105 | } 106 | 107 | enum FFTVersion { 108 | cufft = 0, 109 | fbfft = 1 110 | } version; 111 | 112 | private: 113 | bool direction_; 114 | bool normalize_; 115 | int padLeft_; 116 | int padUp_; 117 | }; 118 | 119 | template 120 | cufftHandle 121 | makeCuFFTPlan(const cuda::DeviceTensor& real, 122 | const cuda::DeviceTensor& complex, 123 | FFTParameters params = FFTParameters()); 124 | 125 | template 126 | void fft1d(cuda::DeviceTensor& real, 127 | cuda::DeviceTensor& complex, 128 | FFTParameters params = FFTParameters(), 129 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 130 | // encode lack of a plan 131 | cudaStream_t stream = NULL); 132 | 133 | template 134 | void fft2d(cuda::DeviceTensor& real, 135 | cuda::DeviceTensor& complex, 136 | FFTParameters params = FFTParameters(), 137 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 138 | // encode lack of a plan 139 | cudaStream_t stream = NULL); 140 | 141 | template 142 | void fft3d(cuda::DeviceTensor& real, 143 | cuda::DeviceTensor& complex, 144 | FFTParameters params = FFTParameters(), 145 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 146 | // encode lack of a plan 147 | cudaStream_t stream = NULL); 148 | 149 | template 150 | void fft(cuda::DeviceTensor& real, 151 | cuda::DeviceTensor& complex, 152 | FFTParameters params = FFTParameters(), 153 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 154 | // encode lack of a plan 155 | 
cudaStream_t stream = NULL); 156 | } } } // namespace 157 | -------------------------------------------------------------------------------- /src/fft/FBFFTDevice.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/fbfft/FBFFT.cuh" 4 | #include "cuda/fbfft/FBFFTCommon.cuh" 5 | 6 | namespace facebook { namespace cuda { namespace fbfft { 7 | 8 | template 9 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft1D<1>( 10 | DeviceTensor& real, 11 | DeviceTensor& complex, 12 | const int padL, 13 | cudaStream_t s); 14 | 15 | template 16 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2D<1>( 17 | DeviceTensor& real, 18 | DeviceTensor& complex, 19 | const int padL, 20 | const int padU, 21 | cudaStream_t s); 22 | 23 | template 24 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2D<1>( 25 | DeviceTensor& complexSrc, 26 | DeviceTensor& complexDst, 27 | cudaStream_t s); 28 | 29 | template 30 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft1D<1>( 31 | DeviceTensor& real, 32 | DeviceTensor& complex, 33 | const int padL, 34 | cudaStream_t s); 35 | 36 | template 37 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft2D<1>( 38 | DeviceTensor& srcComplexAsFloat, 39 | DeviceTensor& dstComplexAsFloat, 40 | cudaStream_t s); 41 | 42 | template 43 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft2D<1>( 44 | DeviceTensor& srcComplex, 45 | DeviceTensor& realDst, 46 | const int padL, 47 | const int padU, 48 | cudaStream_t s); 49 | 50 | }}} 51 | -------------------------------------------------------------------------------- /src/fft/FBFFTHost.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | 6 | namespace facebook { namespace deeplearning { namespace torch { 7 | 8 | template 9 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft1dHost( 10 | facebook::cuda::DeviceTensor& real, 11 | facebook::cuda::DeviceTensor& complexAsFloat, 12 | facebook::cuda::fbfft::FBFFTParameters params = 13 | facebook::cuda::fbfft::FBFFTParameters(), 14 | cudaStream_t s = 0); 15 | 16 | // If calling a 2D-fft of size > 32 we need a buffer to avoid a race condition 17 | // between reads and writes to device memory on the corner turn. 18 | template 19 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2dHost( 20 | facebook::cuda::DeviceTensor& real, 21 | facebook::cuda::DeviceTensor& complexAsFloat, 22 | facebook::cuda::DeviceTensor* bufferAsFloat, 23 | facebook::cuda::fbfft::FBFFTParameters params = 24 | facebook::cuda::fbfft::FBFFTParameters(), 25 | cudaStream_t s = 0); 26 | 27 | // If calling a 2D-fft of size > 32 we need a buffer to avoid a race condition 28 | // between reads and writes to device memory on the corner turn. 29 | template 30 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft( 31 | THCState* state, 32 | THCudaTensor* real, 33 | THCudaTensor* complex, 34 | THCudaTensor* buffer = nullptr, 35 | facebook::cuda::fbfft::FBFFTParameters params = 36 | facebook::cuda::fbfft::FBFFTParameters()); 37 | 38 | } } } // namespace 39 | -------------------------------------------------------------------------------- /src/fft/FFTIteratedConvolution.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
2 | 3 | #include "src/DeviceTensorUtils.h" 4 | #include "THCTensor.h" 5 | 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "cuda/fbfft/FFTIteratedConvolution.cuh" 8 | 9 | #include 10 | #include 11 | 12 | using namespace facebook::cuda; 13 | 14 | namespace facebook { namespace deeplearning { namespace torch { 15 | 16 | typedef struct { 17 | THCudaTensor* tensor; 18 | int padL; 19 | int padU; 20 | } TiledDeviceTensorFFI; 21 | 22 | #define LOG_TARGET LOG(INFO) 23 | 24 | #define INSTANTIATE_ITERATED_CONVOLUTION(DIM, FFT_SIZE) \ 25 | if (THCudaTensor_nDimension(state, weight) == DIM && \ 26 | fftSize == FFT_SIZE) { \ 27 | thrust::host_vector > \ 28 | tiledInputs; \ 29 | thrust::host_vector > \ 30 | tiledOutputs; \ 31 | for (int i = 0; i < numTiles; ++i) { \ 32 | DeviceTensor ti( \ 33 | torchToDeviceTensor(state, input[i].tensor)); \ 34 | fbfft::detail::TiledDeviceTensor inp( \ 35 | ti, \ 36 | input[i].padL, \ 37 | input[i].padU); \ 38 | /* TODO: emplace_back */ \ 39 | tiledInputs.push_back(inp); \ 40 | \ 41 | DeviceTensor to( \ 42 | torchToDeviceTensor(state, output[i].tensor)); \ 43 | fbfft::detail::TiledDeviceTensor out( \ 44 | to, \ 45 | output[i].padL, \ 46 | output[i].padU); \ 47 | /* TODO: emplace_back */ \ 48 | tiledOutputs.push_back(out); \ 49 | } \ 50 | \ 51 | thrust::device_vector > \ 52 | ins = tiledInputs; \ 53 | thrust::device_vector > \ 54 | outs = tiledOutputs; \ 55 | \ 56 | DeviceTensor wei( \ 57 | torchToDeviceTensor(state, weight)); \ 58 | bool res = \ 59 | fbfft::detail::FFTIteratedConvolution( \ 60 | thrust::raw_pointer_cast(&ins[0]), \ 61 | thrust::raw_pointer_cast(&outs[0]), \ 62 | wei, \ 63 | pass, \ 64 | scale, \ 65 | batchSize, \ 66 | ins.size(), \ 67 | THCState_getCurrentStream(state)); \ 68 | if (!res) { THError("Error in iterated convolution"); } \ 69 | } 70 | 71 | extern "C" void convolveIteratedFFI(THCState* state, 72 | TiledDeviceTensorFFI* input, 73 | THCudaTensor* weight, 74 | TiledDeviceTensorFFI* output, 75 | int numTiles, 76 | int fftSize, 77 | fbfft::detail::FFTConvolutionPassFFI pass, 78 | float scale) { 79 | // TODO: accGrad all on same stream, updateOutput / updateGradInput async 80 | int batchSize = THCudaTensor_size(state, input[0].tensor, 0); 81 | 82 | //////////////////////////////////////////////////////// 83 | // FFT of size 32 84 | //////////////////////////////////////////////////////// 85 | INSTANTIATE_ITERATED_CONVOLUTION(4, 32); 86 | 87 | //////////////////////////////////////////////////////// 88 | // FFT of size 16 89 | //////////////////////////////////////////////////////// 90 | INSTANTIATE_ITERATED_CONVOLUTION(4, 16); 91 | 92 | //////////////////////////////////////////////////////// 93 | // FFT of size 8 94 | //////////////////////////////////////////////////////// 95 | INSTANTIATE_ITERATED_CONVOLUTION(4, 8); 96 | } 97 | 98 | }}} 99 | -------------------------------------------------------------------------------- /src/fft/SpatialConvolutionCuFFT.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #pragma once 4 | 5 | #include "src/fft/CuFFTStrategy.h" 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | namespace detail { 9 | 10 | void updateOutputTH(THCState* state, 11 | const THParams& p, 12 | const ProblemSizes& originalSizes, 13 | const CuFFTStrategy& s); 14 | 15 | void updateGradInputTH(THCState* state, 16 | const THParams& p, 17 | const ProblemSizes& originalSizes, 18 | const CuFFTStrategy& s); 19 | 20 | void accGradParametersTH(THCState* 
state, 21 | const THParams& p, 22 | const ProblemSizes& originalSizes, 23 | const CuFFTStrategy& s); 24 | 25 | void cleanupBuffers(); 26 | 27 | }}}} 28 | -------------------------------------------------------------------------------- /src/fft/SpatialConvolutionCuFFTTuner.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #pragma once 4 | 5 | #include "src/fft/CuFFTStrategy.h" 6 | #include 7 | 8 | struct THCState; 9 | 10 | namespace facebook { namespace deeplearning { namespace torch { 11 | 12 | struct SpatialConvolutionCuFFTTuner { 13 | static folly::Optional getBestPerformance(THCState* state, 14 | ProblemSizes pbs); 15 | }; 16 | 17 | }}} 18 | -------------------------------------------------------------------------------- /src/fft/Utils.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2014-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | namespace facebook { namespace deeplearning { namespace torch { 5 | 6 | // Depending on whether cuFFT is expected, use the Hermitian symmetry 7 | // properties that cufft exploits on the rows. 8 | template 9 | __device__ __host__ T numHermitian(T commonCols) { 10 | return commonCols / 2 + 1; 11 | 12 | } 13 | 14 | }}} // namespace 15 | -------------------------------------------------------------------------------- /src/fft/Utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include "thpp/Tensor.h" 6 | #include "THCTensor.h" 7 | #include "src/CudaTensorUtils.h" 8 | 9 | #include 10 | #include 11 | 12 | namespace facebook { namespace deeplearning { namespace torch { 13 | enum class FFTOutputSpecification : bool { InPlace = true, OutOfPlace = false }; 14 | 15 | // Given a 4-D input tensor in (?, ?, row, col) storage mode and a common 16 | // padding specification for Rows and Cols, creates a real and complex cuda 17 | // tensor suitable for cuFFT. 18 | // If the FFTOutputSpecification is InPlace then complex and real alias the same 19 | // storage buffer. 20 | // The real 'time' tensor has: 21 | // - same dimensionality as the input tensor (4) 22 | // - same sizes as the input tensor 23 | // - modified strides to accommodate padding to (commonRows, commonCols) 24 | // The complex 'frequency' tensor has: 25 | // - dimensionality 5 to support AoS with S == cufftComplex == float[2] 26 | // - size == stride == (?, ?, NumRows, NumCols / 2 + 1) to accommodate the 27 | // output of cufft R2C which has only 1/2 the data due to Hermitian 28 | // symmetry (X[k] == X*[-k mod NumCols]) 29 | // 30 | // Warning going to multi-GPUs: In Version 6.0 only a subset of single GPU 31 | // functionality is supported for two GPU execution. 32 | // http://docs.nvidia.com/cuda/cufft/index.html#ixzz39Wu2cUWp 33 | // TODO:#4846735 extend for 1-D and 3-D FFTs 34 | // Always dim 4 (3b+1fft, 2b+2fft, 1b+3fft) atm, extend later 35 | // 36 | // This method always copies the real data. 
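// For example (sizes hypothetical): a 128x64x27x27 input padded to a common
// FFT size of 32x32 gives a real tensor of sizes (128, 64, 27, 27) whose
// strides span 32x32 planes, and a complex tensor of sizes
// (128, 64, 32, 32 / 2 + 1, 2) = (128, 64, 32, 17, 2).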
37 | // TODO(4948477) Remove the copy when it is not needed 38 | template 39 | std::unique_ptr 40 | makeCuFFTTensorReal( 41 | THCState* state, 42 | THCudaTensor* in, 43 | const std::vector& commonDims, 44 | THCudaTensor* candidateCudaStorageReal = nullptr, 45 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 46 | 47 | // Given a real tensor that is properly padded for interpolation, construct a 48 | // complex tensor that will hold the output of the CuFFT_R2C operation. 49 | // If in place, reuse the real THCudaTensor storage 50 | // Otherwise, if candidateCudaStorageComplex is large enough, use it. 51 | // Otherwise allocate a new cuda buffer. 52 | // 53 | // This method never copies data but will fill with 0 if allocation occurs. 54 | template 55 | std::unique_ptr 56 | makeCuFFTTensorComplex( 57 | THCState* state, 58 | THCudaTensor* real, 59 | const std::vector& commonDims, 60 | THCudaTensor* candidateCudaStorageComplex = nullptr, 61 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 62 | 63 | // Given a 4-D vector containing the sizes, this allocates a full tensor of 64 | // the specified sizes with strides matching exactly. 65 | // If candidate storage is specified it will try to reuse the storage. 66 | // This version does not need a model tensor but requires all dims to be 67 | // specified a-priori. 68 | template 69 | std::unique_ptr 70 | makeCuFFTTensorComplex( 71 | THCState* state, 72 | const std::vector& allDims, 73 | THCudaTensor* candidateCudaStorageComplex = nullptr); 74 | 75 | // Make properly sized and padded real and complex tensors on the Cuda device 76 | // This version is wasteful and always creates new storage; used in tests 77 | template 78 | std::pair, 79 | std::unique_ptr> 80 | makeCuFFTTensors( 81 | THCState* state, 82 | THCudaTensor* in, 83 | const std::vector& commonDims, 84 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 85 | 86 | // Make properly sized and padded real and complex tensors on the Cuda device 87 | // This version is wasteful and always creates new storage; used in tests 88 | template 89 | std::pair, 90 | std::unique_ptr> 91 | makeCuFFTTensors( 92 | THCState* state, 93 | thpp::Tensor& in, 94 | const std::vector& commonDims, 95 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 96 | 97 | } } } // namespace 98 | 99 | #include "Utils-inl.h" 100 | -------------------------------------------------------------------------------- /src/init.cu: -------------------------------------------------------------------------------- 1 | #include "luaT.h" 2 | #include "THC.h" 3 | 4 | #include "TemporalMaxPooling.cu" 5 | 6 | LUA_EXTERNC DLL_EXPORT int luaopen_libfbcunn(lua_State *L); 7 | 8 | int luaopen_libfbcunn(lua_State *L) 9 | { 10 | lua_newtable(L); 11 | 12 | fbcunn_TemporalMaxPooling_init(L); 13 | 14 | return 1; 15 | } 16 | -------------------------------------------------------------------------------- /src/util/AsyncCopier.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #ifndef DEEPLEARNING_TORCH_CUDA_UTIL_ASYNCCOPIER_H_ 7 | #define DEEPLEARNING_TORCH_CUDA_UTIL_ASYNCCOPIER_H_ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace facebook { namespace cuda { 19 | 20 | class AsyncCopier { 21 | public: 22 | explicit AsyncCopier(size_t bufferSize); 23 | 24 | void 
copyHtoD(void* dest, const void* src, size_t size); 25 | 26 | private: 27 | class Deallocator { 28 | public: 29 | void operator()(uint8_t* ptr) const; 30 | }; 31 | 32 | struct Event { 33 | explicit Event(int device); 34 | 35 | int device; 36 | folly::Optional<cudaEvent_t> event; 37 | ssize_t refCount; 38 | }; 39 | 40 | struct AllocatedBlock { 41 | AllocatedBlock(size_t s, size_t l) : start(s), length(l) { } 42 | size_t start; 43 | size_t length; 44 | Event* event = nullptr; 45 | }; 46 | 47 | static bool pollEvent(Event* event); // returns true if completed 48 | static void waitEvent(Event* event); 49 | 50 | typedef folly::small_vector RangeVec; 51 | RangeVec getRangesLocked() const; 52 | Event* getEventLocked(); 53 | void releaseEventLocked(Event* event); 54 | 55 | const size_t bufferSize_; 56 | std::unique_ptr<uint8_t, Deallocator> buffer_; 57 | 58 | std::mutex mutex_; 59 | std::vector<std::unique_ptr<Event>> events_; 60 | std::vector<std::unique_ptr<Event>> freeEvents_; 61 | std::deque<AllocatedBlock> allocated_; 62 | }; 63 | 64 | }} // namespaces 65 | 66 | #endif /* DEEPLEARNING_TORCH_CUDA_UTIL_ASYNCCOPIER_H_ */ 67 | -------------------------------------------------------------------------------- /src/util/GlobalAsyncCopier.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #include "src/util/GlobalAsyncCopier.h" 7 | 8 | #include <folly/Conv.h> 9 | #include <memory> 10 | #include <stdlib.h> 11 | 12 | #include "src/util/AsyncCopier.h" 13 | 14 | using namespace facebook::cuda; 15 | 16 | constexpr size_t kDefaultBufferSizeMB = 16; 17 | const char* const kBufferSizeEnvVar = "FB_CUDA_ASYNC_COPIER_BUFFER_SIZE_MB"; 18 | 19 | std::unique_ptr<AsyncCopier> makeGlobalCopier() { 20 | size_t bufferSize = kDefaultBufferSizeMB; 21 | auto ptr = getenv(kBufferSizeEnvVar); 22 | if (ptr) { 23 | bufferSize = folly::to<size_t>(ptr); 24 | } 25 | 26 | return std::make_unique<AsyncCopier>(bufferSize << 20); 27 | } 28 | 29 | extern "C" void fbCudaAsyncMemcpyHtoD(void* dest, 30 | const void* src, 31 | size_t size) { 32 | static auto copier = makeGlobalCopier(); 33 | copier->copyHtoD(dest, src, size); 34 | } 35 | -------------------------------------------------------------------------------- /src/util/GlobalAsyncCopier.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #ifndef DEEPLEARNING_TORCH_CUDA_UTIL_GLOBALASYNCCOPIER_H_ 7 | #define DEEPLEARNING_TORCH_CUDA_UTIL_GLOBALASYNCCOPIER_H_ 8 | 9 | #include <stddef.h> 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | void fbCudaAsyncMemcpyHtoD(void* dest, const void* src, size_t size); 16 | 17 | #ifdef __cplusplus 18 | } 19 | #endif 20 | 21 | #endif /* DEEPLEARNING_TORCH_CUDA_UTIL_GLOBALASYNCCOPIER_H_ */ 22 | -------------------------------------------------------------------------------- /src/util/Misc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #include "src/util/Misc.h" 4 | 5 | #include <folly/Format.h> 6 | #include <stdexcept> 7 | #include 8 | 9 | using namespace std; 10 | 11 | namespace facebook { namespace cuda { 12 | 13 | cudaStream_t getComputeStream() { 14 | // It would be nice to compute on non-default streams from time to time, 15 | // but there's a *lot* of code to change. 16 | return 0; 17 | } 18 | 19 | [[noreturn]] void throwCudaError(cudaError_t error, const char* msg) { 20 | auto string = msg ?
21 | folly::sformat("{}: CUDA error {} ({})", msg, int(error), 22 | cudaGetErrorString(error)) : 23 | folly::sformat("CUDA error {} ({})", int(error), 24 | cudaGetErrorString(error)); 25 | throw std::runtime_error(string); 26 | } 27 | 28 | } } 29 | -------------------------------------------------------------------------------- /src/util/Misc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include "cuda/util/CachedDeviceProperties.h" 6 | 7 | #include <cuda_runtime.h> 8 | 9 | namespace facebook { namespace cuda { 10 | 11 | [[noreturn]] void throwCudaError(cudaError_t, const char* msg); 12 | 13 | inline void 14 | checkCudaError(cudaError_t error, const char* msg = 0) { 15 | if (error != cudaSuccess) { 16 | throwCudaError(error, msg); 17 | } 18 | } 19 | 20 | class OnDevice { 21 | int m_home; 22 | public: 23 | explicit OnDevice(int newDev) : m_home(getDevice()) { 24 | checkCudaError(cudaSetDevice(newDev)); 25 | } 26 | 27 | ~OnDevice() { 28 | checkCudaError(cudaSetDevice(m_home)); 29 | } 30 | }; 31 | 32 | cudaStream_t getComputeStream(); 33 | 34 | } } 35 | -------------------------------------------------------------------------------- /src/util/Transform.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #include <assert.h> 4 | #include <math.h> 5 | 6 | #include "src/util/Transform.cuh" 7 | 8 | namespace facebook { namespace cuda { 9 | 10 | template <typename Operator> 11 | __global__ static void 12 | transformKernel(const typename Operator::Input* input, 13 | typename Operator::Output* out, 14 | size_t n) { 15 | 16 | Operator op; 17 | size_t start = threadIdx.x + blockIdx.x * blockDim.x; 18 | if (start >= n) return; 19 | out[start] = op(input[start]); 20 | } 21 | 22 | size_t roundUp(double d) { 23 | return size_t(ceil(d)); 24 | } 25 | 26 | template <typename Op> 27 | void transform(cudaStream_t stream, 28 | const typename Op::Input* input, 29 | typename Op::Output* out, 30 | size_t n) { 31 | static const int kThreadsPerBlock = 128; 32 | assert(n > 0); 33 | int totalNumBlocks = int(ceil(1.0 * n / kThreadsPerBlock)); 34 | dim3 blockDim(kThreadsPerBlock); 35 | dim3 gridDim(totalNumBlocks); 36 | transformKernel<Op><<<gridDim, blockDim, 0, stream>>>(input, out, n); 37 | } 38 | 39 | template void transform<ToHalf>(cudaStream_t stream, 40 | const ToHalf::Input* in, 41 | ToHalf::Output* out, 42 | size_t n); 43 | template void transform<ToFloat>(cudaStream_t stream, 44 | const ToFloat::Input* in, 45 | ToFloat::Output* out, 46 | size_t n); 47 | 48 | } } 49 | -------------------------------------------------------------------------------- /src/util/Transform.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include <cuda_runtime.h> 6 | #include <stddef.h> 7 | #include <stdint.h> 8 | 9 | namespace facebook { namespace cuda { 10 | 11 | /* 12 | * A generic interface for dense point-to-point operations. 13 | */ 14 | template <typename Operator> 15 | void transform(cudaStream_t stream, 16 | const typename Operator::Input* input, 17 | typename Operator::Output* out, size_t n); 18 | 19 | typedef uint16_t half_t; 20 | 21 | // Some pointwise operations. They must publicly define Input and 22 | // Output types, and provide an operator() mapping one input to one 23 | // output.
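// For illustration (a sketch, not part of the original file), a pointwise
// negation op would look like this:
//
//   struct Negate {
//     typedef float Input;
//     typedef float Output;
//     Output __device__ operator()(const Input f) { return -f; }
//   };
//
// transform<Negate>(stream, in, out, n) would then negate n floats, given a
// matching explicit instantiation in Transform.cu.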
24 | struct ToHalf { 25 | typedef float Input; 26 | typedef half_t Output; 27 | Output __device__ operator()(const Input f) { 28 | return __float2half_rn(f); 29 | } 30 | }; 31 | 32 | struct ToFloat { 33 | typedef half_t Input; 34 | typedef float Output; 35 | Output __device__ operator()(const Input h) { 36 | return __half2float(h); 37 | } 38 | }; 39 | 40 | } } 41 | -------------------------------------------------------------------------------- /test/CudaTensorTestKernels.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #include "cuda/DeviceTensor.cuh" 3 | #include "cuda/util/CachedDeviceProperties.h" 4 | #include "src/DeviceTensorUtils.h" 5 | 6 | #include 7 | 8 | using namespace facebook::cuda; 9 | using namespace facebook::cuda; 10 | 11 | namespace facebook { namespace deeplearning { namespace torch { 12 | 13 | __global__ void testAssignment1dKernel(DeviceTensor tensor) { 14 | // Thread grid is already sized exactly for our tensor 15 | tensor[threadIdx.x] = threadIdx.x; 16 | } 17 | 18 | bool testAssignment1d(THCState* state, THCudaTensor* t) { 19 | DeviceTensor tensor = 20 | torchToDeviceTensor(state, t); 21 | 22 | const cudaDeviceProp& deviceProp = getDeviceProperties(0); 23 | 24 | if (deviceProp.maxThreadsDim[0] < tensor.getSize(0)) { 25 | // tensor too large to be covered exactly by threads in one block alone 26 | return false; 27 | } 28 | 29 | testAssignment1dKernel<<<1, tensor.getSize(0)>>>(tensor); 30 | 31 | return (cudaGetLastError() == cudaSuccess); 32 | } 33 | 34 | __global__ void testAssignment3dKernel(DeviceTensor tensor) { 35 | // Thread grid is already sized exactly for our tensor 36 | tensor[threadIdx.z][threadIdx.y][threadIdx.x] = 37 | tensor.getSize(0) * threadIdx.z + 38 | tensor.getSize(1) * threadIdx.y + 39 | tensor.getSize(2) * threadIdx.x; 40 | } 41 | 42 | bool testAssignment3d(THCState* state, THCudaTensor* t) { 43 | DeviceTensor tensor = torchToDeviceTensor(state, t); 44 | 45 | const cudaDeviceProp& deviceProp = getDeviceProperties(0); 46 | 47 | for (int i = 0; i < 3; ++i) { 48 | if (deviceProp.maxThreadsDim[i] < tensor.getSize(i)) { 49 | // tensor too large to be covered exactly by threads in one block alone 50 | return false; 51 | } 52 | } 53 | 54 | dim3 threadsPerBlock(tensor.getSize(2), 55 | tensor.getSize(1), 56 | tensor.getSize(0)); 57 | testAssignment3dKernel<<<1, threadsPerBlock>>>(tensor); 58 | 59 | return (cudaGetLastError() == cudaSuccess); 60 | } 61 | 62 | template 63 | bool verifyUpcast(DeviceTensor up, 64 | DeviceTensor orig) { 65 | int shift = NewDim - Dim; 66 | 67 | // Check extended dimensions size and stride 68 | for (int i = 0; i < shift; ++i) { 69 | if (up.getSize(i) != 1) { 70 | return false; 71 | } else if (up.getStride(i) != 72 | orig.getStride(0) * orig.getSize(0)) { 73 | return false; 74 | } 75 | } 76 | 77 | // Check original dimensions size and stride 78 | for (int i = shift; i < NewDim; ++i) { 79 | if (up.getSize(i) != orig.getSize(i - shift)) { 80 | return false; 81 | } else if (up.getStride(i) != orig.getStride(i - shift)) { 82 | return false; 83 | } 84 | } 85 | 86 | return true; 87 | } 88 | 89 | bool testUpcast(THCState* state, THCudaTensor* t) { 90 | DeviceTensor tensor = torchToDeviceTensor(state, t); 91 | 92 | if (!verifyUpcast(tensor.upcastOuter<4>(), tensor)) { 93 | return false; 94 | } else if (!verifyUpcast(tensor.upcastOuter<5>(), tensor)) { 95 | return false; 96 | } 97 | 98 | return true; 99 | } 100 | 101 | bool 
testDowncastTo2d(THCState* state, THCudaTensor* t) { 102 | DeviceTensor tensor = torchToDeviceTensor(state, t); 103 | DeviceTensor downTensor = tensor.downcastOuter<2>(); 104 | 105 | if (downTensor.getSize(0) != 106 | tensor.getSize(0) * tensor.getSize(1)) { 107 | return false; 108 | } else if (downTensor.getStride(0) != 109 | tensor.getSize(2) * tensor.getStride(2)) { 110 | return false; 111 | } else if (downTensor.getSize(1) != 112 | tensor.getSize(2)) { 113 | return false; 114 | } else if (downTensor.getStride(1) != 115 | tensor.getStride(2)) { 116 | return false; 117 | } 118 | 119 | return true; 120 | } 121 | 122 | bool testDowncastTo1d(THCState* state, THCudaTensor* t) { 123 | DeviceTensor tensor = torchToDeviceTensor(state, t); 124 | DeviceTensor downTensor = tensor.downcastOuter<1>(); 125 | 126 | if (downTensor.getSize(0) != 127 | tensor.getSize(0) * tensor.getSize(1) * tensor.getSize(2)) { 128 | return false; 129 | } else if (downTensor.getStride(0) != 130 | tensor.getStride(2)) { 131 | return false; 132 | } 133 | 134 | return true; 135 | } 136 | 137 | __global__ void testDowncastWritesKernel(DeviceTensor tensor) { 138 | // Thread grid is already sized exactly for our tensor 139 | tensor[threadIdx.x] = 1.0f; 140 | } 141 | 142 | bool testDowncastWrites(THCState* state, THCudaTensor* t) { 143 | DeviceTensor tensor = torchToDeviceTensor(state, t); 144 | DeviceTensor downTensor = tensor.downcastOuter<1>(); 145 | 146 | testDowncastWritesKernel<<<1, downTensor.getSize(0)>>>(downTensor); 147 | return (cudaGetLastError() == cudaSuccess); 148 | } 149 | 150 | } } } // namespace 151 | -------------------------------------------------------------------------------- /test/CudaTensorTestKernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | struct THCudaTensor; 5 | 6 | /// 7 | /// Collection of kernels for testing DeviceTensor<> 8 | /// 9 | 10 | namespace facebook { namespace deeplearning { namespace torch { 11 | 12 | /// Assign values to the tensor via CudaTensor based on position 13 | bool testAssignment1d(THCState* state, THCudaTensor* tensor); 14 | bool testAssignment3d(THCState* state, THCudaTensor* tensor); 15 | 16 | /// Test upcasting to a higher-dimensional tensor 17 | bool testUpcast(THCState* state, THCudaTensor* tensor); 18 | 19 | /// Downcast tests 20 | bool testDowncastTo2d(THCState* state, THCudaTensor* tensor); 21 | bool testDowncastTo1d(THCState* state, THCudaTensor* tensor); 22 | bool testDowncastWrites(THCState* state, THCudaTensor* tensor); 23 | 24 | } } } // namespace 25 | -------------------------------------------------------------------------------- /test/InputCentricConvolution_UpdateOutput.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
2 | #pragma once 3 | 4 | struct THCudaTensor; 5 | struct THCState; 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { namespace test { 8 | 9 | bool InputCentricRelayoutConvolution_UpdateOutput(THCState* state, 10 | THCudaTensor* inputTH, 11 | THCudaTensor* kernelsTH, 12 | long filterRowStride, 13 | long filterColStride, 14 | THCudaTensor* outputTH); 15 | 16 | } } } } 17 | -------------------------------------------------------------------------------- /test/ReferenceConvolutions.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "src/Tensor.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { namespace test { 10 | 11 | /// 12 | /// Reference convolution/cross-correlation implementations 13 | /// 14 | 15 | /// Returns the output size based on the input and filter size and 16 | /// stride for a valid-only convolution or cross-correlation 17 | constexpr long 18 | getValidConvSize(long inputSize, long filterSize, long filterStride) { 19 | return ((inputSize - filterSize) / filterStride) + 1; 20 | } 21 | 22 | /// Returns the output size based on the input and filter size and 23 | /// stride for a reverse valid-only convolution or cross-correlation 24 | constexpr long 25 | getValidRevConvSize(long inputSize, long filterSize, long filterStride) { 26 | return inputSize - (filterSize - 1) * filterStride; 27 | } 28 | 29 | /// Returns the output size based on the input and filter size and 30 | /// stride for a full convolution or cross-correlation 31 | constexpr long 32 | getFullConvSize(long inputSize, long filterSize, long filterStride) { 33 | return (inputSize - 1) * filterStride + filterSize; 34 | } 35 | 36 | /// Input to output: 37 | /// 38 | /// input (batch x img planes x img row x img col) 39 | /// star (valid only) 40 | /// filters (filter planes x img planes x filter row x filter col) 41 | /// = 42 | /// output (batch x filter planes x 43 | /// getValidConvSize(img row, filter row, stride), 44 | /// getValidConvSize(img col, filter col, stride)) 45 | /// Optional input padding is expressed as 46 | /// on each innermost 2d plane. 47 | Tensor 48 | crossCorrelationValidOnly( 49 | const Tensor& input, 50 | const Tensor& filters, 51 | long filterRowStride, 52 | long filterColStride, 53 | const folly::Optional>& padding = 54 | folly::none); 55 | 56 | Tensor 57 | crossCorrelationValidOnlyInputCentric( 58 | const Tensor& input, 59 | const Tensor& filters, 60 | long filterRowStride, 61 | long filterColStride, 62 | const folly::Optional>& padding = 63 | folly::none); 64 | 65 | /// Output gradient to input gradient: 66 | /// 67 | /// output (batch x filter planes x 68 | /// getValidConvSize(img row, filter row, stride), 69 | /// getValidConvSize(img col, filter col, stride)) 70 | /// * (full) 71 | /// filters (filter planes x img planes x filter row x filter col) 72 | /// = 73 | /// input (batch x img planes x img row x img col) 74 | /// Optional input padding is expressed as 75 | /// on each innermost 2d plane. 
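/// Worked example (hypothetical shapes): a 16x3x32x32 input with 8x3x5x5
/// filters at stride 1 gives a valid output of 16x8x28x28, since
/// getValidConvSize(32, 5, 1) = (32 - 5) / 1 + 1 = 28; the full pass below
/// maps the output gradient back, getFullConvSize(28, 5, 1) = 27 * 1 + 5 = 32.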
76 | Tensor 77 | convolutionFull( 78 | const Tensor& output, 79 | const Tensor& filters, 80 | long filterRowStride, 81 | long filterColStride, 82 | const folly::Optional>& padding = 83 | folly::none); 84 | 85 | /// Output gradient to weights: 86 | /// 87 | /// input (batch x img planes x img row x img col) 88 | /// star (valid only) 89 | /// output (batch x filter planes x 90 | /// getValidRevConvSize(img row, filter row, stride), 91 | /// getValidRevConvSize(img col, filter col, stride)) 92 | /// = 93 | /// weight gradient (filter planes x img planes x filter row x filter col) 94 | /// Optional input padding is expressed as 95 | /// on each innermost 2d plane. Scale is a multiplicative factor 96 | /// applied pointwise to every output point 97 | Tensor 98 | crossCorrelationReverseValidOnly( 99 | const Tensor& input, 100 | const Tensor& output, 101 | long filterRowStride, 102 | long filterColStride, 103 | float scale, 104 | const folly::Optional>& padding = 105 | folly::none); 106 | 107 | } } } } // namespace 108 | -------------------------------------------------------------------------------- /test/TestUtils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "src/CudaTensorUtils.h" 6 | #include "src/DeviceTensorUtils.h" 7 | #include "THCTensor.h" 8 | #include "src/fft/CuFFTConvolution_UpdateOutput.cuh" 9 | #include "src/fft/Utils.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace facebook { namespace deeplearning { namespace torch { namespace test { 15 | 16 | // Constructs a full CUDA tensor of the same size as the input 17 | std::unique_ptr 18 | makeTHCudaTensorSameSize(THCState* state, const thpp::Tensor& t); 19 | 20 | // Constructs a full CUDA tensor with constant values 21 | thpp::Tensor 22 | makeRandomTestTensor(std::initializer_list sizeList); 23 | 24 | thpp::Tensor makeTestTensor(std::initializer_list sizeList, 25 | float constant); 26 | 27 | // Constructs a CUDA tensor by scaling the factor list 28 | thpp::Tensor makeTestTensor( 29 | std::initializer_list sizeList, 30 | std::initializer_list factorList, 31 | const folly::Optional>& padding = 32 | folly::none); 33 | 34 | // Constructs a full CUDA tensor by scaling {0.1f, 0.2f, 0.3f, 0.4f} 35 | thpp::Tensor makeTestTensor(std::initializer_list sizeList); 36 | 37 | 38 | bool isWithin(float a, float b, float relativeError = 1e-5f); 39 | 40 | // Returns true or false if the two tensors match within some relative 41 | // error; also returns the 2d slice where they first differ as a 42 | // string if they do. 43 | // PrecisionDebug controls how many digits are printed on error in the 44 | // returned string. 45 | // If compareInter is set to true, comparison will only be performed on the 46 | // intersection subtensors: 47 | // [0, min(reference.size(0), test.size(0))] x ... 
x 48 | // [0, min(reference.size(dim-1), test.size(dim-1))] 49 | // This is useful for kernels that write tail garbage 50 | std::pair 51 | compareTensors(const thpp::Tensor& reference, 52 | const thpp::Tensor& test, 53 | float relativeError = 1e-5f, 54 | int precisionDebug = 4, 55 | bool compareInter = false); 56 | 57 | // Constructs a full CUDA tensor of the same size as the input 58 | template 59 | std::unique_ptr 60 | makeTHCudaTensorSameSize(THCState* state, 61 | const cuda::DeviceTensor& t) { 62 | std::vector sizes; 63 | std::vector strides; 64 | for (int i = 0; i < Dim; ++i) { 65 | sizes.push_back(t.getSize(i)); 66 | strides.push_back(t.getStride(i)); 67 | } 68 | 69 | return makeTHCudaTensorFull(state, sizes, strides); 70 | } 71 | 72 | }}}} // namespace 73 | -------------------------------------------------------------------------------- /test/benchmark_fft.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | require('fb.luaunit') 3 | 4 | require 'cunn' 5 | 6 | require 'fbcunn' 7 | 8 | torch.setdefaulttensortype('torch.FloatTensor') 9 | 10 | local test = {} 11 | 12 | -- These are used for fast, exhaustive search over the parameters space 13 | -- Can be overridden by setting problemSizes to non-{} 14 | local batchList = { 15 | 128, 64, 32, 16 | } 17 | local filterList = { 18 | 128, 96, 64, 32, 24, 19 | } 20 | local planeList = { 21 | 128, 96, 64, 32, 24, 3 22 | } 23 | local inputRowList = { 24 | 128, 96, 64, 32, 16, 13 25 | } 26 | local inputColList = { 27 | 128, 96, 64, 32, 16, 13 28 | } 29 | local kernelRowList = { 30 | 11, 9, 7, 5, 3 31 | } 32 | local kernelColList = {} 33 | 34 | -- batch, filters, plane, row, col, kernelRow, kernelCol overrides 35 | -- the List arguments 36 | -- This is particularly useful to explore tradeoffs between cufft 37 | -- efficiency at various interpolation sizes and amount of work in 38 | -- transpose + mxm 39 | 40 | -- Soumith's benchmark sizes 41 | local fixedSizes = { 42 | -- {128, 96, 3, 128, 128, 11, 11}, 43 | -- {128, 64, 64, 64, 64, 9, 9}, 44 | -- {128, 128, 128, 32, 32, 9, 9}, 45 | -- {128, 128, 128, 16, 16, 7, 7}, 46 | -- {128, 384, 384, 13, 13, 3, 3}, 47 | {128, 96, 256, 31, 31, 5, 5}, -- 1 GPU 48 | {128, 96, 128, 31, 31, 5, 5}, -- 2 GPU 49 | {64, 96, 256, 31, 31, 5, 5}, -- 2 GPU 50 | {128, 96, 256, 21, 31, 5, 5}, -- 2 GPU, 27 / 2 = 14 + 4 + 3 51 | {128, 96, 64, 31, 31, 5, 5}, -- 4 GPU 52 | {32, 96, 256, 31, 31, 5, 5}, -- 4 GPU 53 | {128, 96, 256, 14, 31, 5, 5}, -- 4 GPU, 27 / 4 = 7 + 4 + 3 54 | {64, 96, 256, 21, 31, 5, 5}, -- 4 GPU, 27 / 2 = 14 + 4 + 3 55 | {128, 96, 128, 21, 31, 5, 5}, -- 4 GPU, 27 / 2 = 14 + 4 + 3 56 | {64, 96, 128, 31, 31, 5, 5}, -- 2 GPU 57 | } 58 | 59 | -- Running 76 81 84 8 9 92 88 60 | -- Running 176 3 9 8 1 13 54 61 | 62 | -- Set this to {} to run a small search around the fixedSizes 63 | local problemSizes = fixedSizes -- {} 64 | 65 | local problemSize = {} 66 | 67 | local function testLoop() 68 | -- Just allocate some dummy placeholder to get to the proper 69 | -- function in the lua module 70 | local net = nn.SpatialConvolutionCuFFT(1, 1, 1, 1) 71 | local input = torch.Tensor(1, 1, 1, 1):normal():cuda() 72 | 73 | if table.getn(problemSize) > 0 then 74 | batchList = {problemSize[1]} 75 | filterList = {problemSize[2]} 76 | planeList = {problemSize[3]} 77 | inputRowList = {problemSize[4]} 78 | inputColList = {problemSize[5]} 79 | kernelRowList = {problemSize[6]} 80 | kernelColList = {} 81 | end 82 | 83 | local batches = 
torch.Tensor(batchList):cuda() 84 | local filters = torch.Tensor(filterList):cuda() 85 | local planes = torch.Tensor(planeList):cuda() 86 | local inputRows = torch.Tensor(inputRowList):cuda() 87 | local inputCols = torch.Tensor(inputColList):cuda() 88 | local kernelRows = torch.Tensor(kernelRowList):cuda() 89 | local kernelCols = torch.Tensor(kernelColList):cuda() 90 | 91 | print('-------------------------------------------------------') 92 | net:explorePerformance(input, batches, filters, 93 | planes, inputRows, inputCols, kernelRows, kernelCols) 94 | 95 | net:cleanupBuffers(input) 96 | collectgarbage() 97 | end 98 | 99 | if table.getn(problemSizes) >= 1 then 100 | for i = 1, table.getn(problemSizes) do 101 | problemSize = problemSizes[i] 102 | testLoop() 103 | end 104 | else 105 | testLoop() 106 | end 107 | -------------------------------------------------------------------------------- /test/fb_test.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | 4 | include('test.lua') 5 | 6 | nn.testfbcunn() 7 | -------------------------------------------------------------------------------- /test/run_test_HSM_seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for tt in 1 2 4 8 16 32 3 | do 4 | export OMP_NUM_THREADS=$tt 5 | echo "" 6 | echo "" 7 | echo "number of threads $tt" 8 | _build/opt/deeplearning/torch/th.llar deeplearning/torch/layers/test/test_HSM_speed.lua 9 | done 10 | -------------------------------------------------------------------------------- /test/test_CuBLAS.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | require 'fb.luaunit' 3 | require 'fbtorch' 4 | require 'cunn' 5 | require 'fbcunn' 6 | 7 | torch.setdefaulttensortype('torch.FloatTensor') 8 | 9 | local fb_test = {} 10 | 11 | -- Let C = m-by-n and A = m-by-k 12 | -- Format is m, n, k, seqIter, batch, numHandles, numStreams 13 | local problemSize = { 14 | -- Sanity tests 15 | -- Trivial mxm, no batch, no iter 16 | {1, 1, 2, {}, {}, 0, 0}, 17 | {1, 1, 2, {}, {}, 0, 1}, 18 | {1, 1, 2, {}, {}, 1, 0}, 19 | {1, 1, 2, {}, {}, 1, 1}, 20 | {1, 1, 2, {}, {}, 16, 16}, 21 | -- 2x4 <- 2x8 * 8x4 as 1 iter, 1 batch 22 | {2, 4, 8, {1}, {1}, 1, 1}, 23 | -- 2x4 <- 2x8 * 8x4 as 1 iter, no batch 24 | {2, 4, 8, {1}, {}, 1, 1}, 25 | -- 2x4 <- 2x8 * 8x4 as no iter, 1 batch 26 | {2, 4, 8, {}, {1}, 1, 1}, 27 | -- 2x4 <- 2x8 * 8x4 as no iter, no batch 28 | {2, 4, 8, {}, {}, 1, 1}, 29 | -- 128x128 <- 128x128 * 128x128 as 4x4 iter, 4x4 batch 30 | {128, 128, 128, {4, 4}, {4, 4}, 1, 1}, 31 | {1024, 1024, 1024, {1, 1}, {1, 1}, 1, 1}, 32 | {1024, 1024, 1024, {}, {}, 1, 1}, 33 | -- Various way of performing temporal convolution of 512: 32 -> 16 34 | {16, 1024, 512, {}, {1}, 1, 1}, 35 | {16, 1024, 512, {}, {}, 1, 1}, 36 | {1, 1024, 512, {16}, {1}, 1, 1}, 37 | {1, 1024, 512, {1}, {16}, 1, 1}, 38 | {32 * 16, 1024, 512, {1}, {1}, 1, 1}, 39 | {1, 1024, 512, {16 * 32}, {1}, 1, 1}, 40 | {16, 1024, 512, {32}, {1}, 16, 1}, 41 | {16, 1024, 512, {1}, {32}, 0, 0}, 42 | {1, 1024, 512, {1}, {16 * 32}, 1, 1}, 43 | } 44 | 45 | -- This test exercises the performance of multi-handle + multi-stream on many 46 | -- small gemms. 
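-- For instance, {513, 513, 513, {53}, {}, 4, 4} below runs 53 iterated
-- 513x513x513 gemms over 4 cuBLAS handles and 4 streams, while the
-- {..., 0, 0} variant runs the same work on the default handle and stream.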
47 | local _testMultiHandlePerf = { 48 | {513, 513, 513, {53}, {}, 0, 0}, 49 | {513, 513, 513, {53}, {}, 1, 1}, 50 | {513, 513, 513, {53}, {}, 1, 4}, 51 | {513, 513, 513, {53}, {}, 4, 1}, 52 | {513, 513, 513, {53}, {}, 4, 4}, 53 | } 54 | 55 | local function concat(t1,t2) 56 | local res = {} 57 | for i=1,#t1 do 58 | res[#res + 1] = t1[i] 59 | end 60 | for i=1,#t2 do 61 | res[#res + 1] = t2[i] 62 | end 63 | return res 64 | end 65 | 66 | local function testLoop(problemSize) 67 | -- Just allocate some dummy placeholder to get to the proper 68 | -- function in the lua module 69 | local net = nn.CuBLASWrapper(true) 70 | 71 | local m = problemSize[1] 72 | local n = problemSize[2] 73 | local k = problemSize[3] 74 | local seqIter = problemSize[4] 75 | local batch = problemSize[5] 76 | local handles = problemSize[6] 77 | local streams = problemSize[7] 78 | local seqBatch = concat(seqIter, batch) 79 | local sA = torch.LongStorage(concat(seqBatch, {m, k})) 80 | local sB = torch.LongStorage(concat(seqBatch, {k, n})) 81 | local sC = torch.LongStorage(concat(seqBatch, {m, n})) 82 | local A = torch.Tensor(sA):cuda() 83 | local B = torch.Tensor(sB):cuda() 84 | local C = torch.Tensor(sC):cuda() 85 | 86 | cutorch.reserveBlasHandles(handles) 87 | cutorch.reserveStreams(streams) 88 | cutorch.synchronize() 89 | net:matmult(A, B, C, seqIter, batch) 90 | mytester:assert(true) 91 | 92 | cutorch.synchronize() 93 | collectgarbage() 94 | end 95 | 96 | function fb_test.testGEMMs() 97 | for i = 1, table.getn(_testMultiHandlePerf) do 98 | testLoop(_testMultiHandlePerf[i]) 99 | end 100 | for i = 1, table.getn(problemSize) do 101 | testLoop(problemSize[i]) 102 | end 103 | end 104 | 105 | mytester = torch.Tester() 106 | mytester:add(fb_test) 107 | mytester:run() 108 | -------------------------------------------------------------------------------- /test/test_DataParallel.lua: -------------------------------------------------------------------------------- 1 | local fboptim = require('fboptim') 2 | -- Copyright 2004-present Facebook. All Rights Reserved. 3 | 4 | require 'fb.luaunit' 5 | require 'optim' 6 | require 'fbcunn' 7 | require 'cunn' 8 | 9 | -- Hyper-params. We're targeting a toy problem that computes 10 | -- some function of its inputs. 11 | local inputWidth = 32 12 | local hiddenWidth = 512 13 | local nHidden = 2 14 | local outputWidth = 1 15 | local numGPUs = cutorch.getDeviceCount() 16 | 17 | local function targetFunction(x) 18 | -- admittedly tough for us to learn, but hey. 
19 | local retval = torch.Tensor(outputWidth) 20 | local sum = x:sum() 21 | retval[1] = math.sin(sum) 22 | return retval 23 | end 24 | 25 | local function genInput() 26 | return torch.randn(inputWidth) 27 | end 28 | 29 | local function genWideInput() 30 | return torch.randn(inputWidth * numGPUs) 31 | end 32 | 33 | local function getNarrowedInputRange(i) 34 | assert(type(i) == 'number') 35 | local rangeStart = 1 + ((i - 1) * inputWidth) 36 | local rangeEnd = rangeStart + (inputWidth) - 1 37 | return rangeStart, rangeEnd 38 | end 39 | 40 | local function getNarrowedInput(input, i) 41 | assert(torch.typename(input)) 42 | assert(type(i) == 'number') 43 | return input[{ {getNarrowedInputRange(i)} }] 44 | end 45 | 46 | local function genWideExample() 47 | local samp = genWideInput() 48 | local retval = torch.Tensor(outputWidth * numGPUs) 49 | for i = 1,numGPUs do 50 | retval[i] = targetFunction(getNarrowedInput(samp, i)) 51 | end 52 | return samp:cuda(), retval:cuda() 53 | end 54 | 55 | local function simpleModel() 56 | local seq = nn.Sequential() 57 | local pred = inputWidth 58 | for i = 1,nHidden do 59 | seq:add(nn.Linear(pred, hiddenWidth)) 60 | seq:add(nn.Tanh()) 61 | pred = hiddenWidth 62 | end 63 | seq:add(nn.Linear(hiddenWidth, outputWidth)) 64 | seq:add(nn.Tanh()) 65 | return seq 66 | end 67 | 68 | local function tensorsAreProbablySimilar(l, r, epsilon) 69 | epsilon = epsilon or 0.00001 70 | return math.abs(l:norm() - r:norm()) < epsilon 71 | end 72 | 73 | function testDataParallel() 74 | -- Set up models on each GPU. 75 | local dp = nn.DataParallel(1) 76 | local simpleModels = {} 77 | for i = 1,numGPUs do 78 | if i == 1 then 79 | simpleModels[i] = simpleModel() 80 | else 81 | simpleModels[i] = simpleModels[1]:clone() 82 | end 83 | dp:add(simpleModels[i]) 84 | end 85 | 86 | -- CPU models to cross-validate 87 | local cpuModels = {} 88 | local function syncCPUModels() 89 | for i = 1,numGPUs do 90 | cpuModels[i] = simpleModels[i]:clone() 91 | cpuModels[i] = cpuModels[i]:double() 92 | end 93 | end 94 | syncCPUModels() 95 | 96 | -- Check an input/output pair against the CPU models 97 | local function checkWideResult(inputs, outputs) 98 | local function checkOneResult(input, modIdx, expectedOutput) 99 | input = input:double() -- de-cudify 100 | assert(tensorsAreProbablySimilar(cpuModels[modIdx]:forward(input), 101 | expectedOutput)) 102 | end 103 | for j = 1, numGPUs do 104 | checkOneResult(getNarrowedInput(inputs, j), j, outputs[{ {j} }]) 105 | end 106 | end 107 | 108 | local function checkCPUModelsAreEquivalent() 109 | syncCPUModels() 110 | local input = genInput() 111 | local out = cpuModels[1]:forward(input) 112 | for j = 2, numGPUs do 113 | assert(tensorsAreProbablySimilar(out, cpuModels[j]:forward(input))) 114 | end 115 | end 116 | checkCPUModelsAreEquivalent() 117 | 118 | dp:cuda() 119 | 120 | -- Make sure forward produces same results as an individual copy 121 | for i=1, 10 do 122 | local inputs, targets = genWideExample() 123 | local outputs = dp:forward(inputs) 124 | syncCPUModels() 125 | checkWideResult(inputs, outputs) 126 | end 127 | 128 | local optimState = { 129 | learningRate = 1e-1, 130 | weightDecay = 1e-4, 131 | momentum = 0.9, 132 | learningRateDecay = 1e-7 133 | } 134 | 135 | local timer = torch.Timer() 136 | local opt = nn.Optim(dp, optimState) 137 | local criterion = nn.MSECriterion():cuda() 138 | 139 | local num_iteration = 10 140 | timer:reset() 141 | for i=1, num_iteration do 142 | local inputs, targets = genWideExample() 143 | local outputs = dp:forward(inputs) 144 | 
syncCPUModels() 145 | checkWideResult(inputs, outputs) 146 | opt:optimize(fboptim.sgd, inputs, targets, criterion) 147 | local out = dp:forward(inputs) 148 | local err = criterion:forward(out, targets) 149 | end 150 | checkCPUModelsAreEquivalent() 151 | 152 | -- Check only the speed for forward/backward. 153 | timer:reset(); 154 | for i=1, num_iteration do 155 | local inputs, targets = genWideExample() 156 | dp:forward(inputs) 157 | opt:optimize(fboptim.sgd, inputs, targets, criterion) 158 | end 159 | end 160 | 161 | LuaUnit:main() 162 | -------------------------------------------------------------------------------- /test/test_GroupKMaxPooling.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require('fb.luaunit') 4 | 5 | require('math') 6 | 7 | require('fbtorch') 8 | 9 | require('nn') 10 | 11 | require('fbcunn') 12 | require('fbnn') 13 | 14 | function run_GroupKMaxPooling_updateOutput(n, d, k) 15 | -- n = number of words 16 | -- d = dimension of embeddings 17 | -- k = k-max pooling 18 | local input = torch.randn(n, d) 19 | local kmax = nn.GroupKMaxPooling(k) 20 | 21 | local output = kmax:updateOutput(input) 22 | 23 | assert(output == kmax.output) 24 | assert(output:size(1) == k) 25 | assert(output:size(2) == input:size(2)) 26 | 27 | local norms = torch.norm(input, 2, 2) 28 | local _, kmax_indices = torch.sort(norms, 1) 29 | kmax_indices = kmax_indices[{{-k,-1}}] 30 | kmax_indices = torch.sort(kmax_indices, 1) 31 | 32 | local kmax_result = torch.Tensor(k, input:size(2)) 33 | for i = 1, kmax_indices:size(1) do 34 | kmax_result:select(1, i):copy(input:select(1, kmax_indices[i][1])) 35 | end 36 | 37 | assert(torch.sum(torch.eq(kmax_result, output)) == torch.numel(output)) 38 | end 39 | 40 | function test_GroupKMaxPooling_updateOutput() 41 | run_GroupKMaxPooling_updateOutput(10, 11, 4) 42 | end 43 | 44 | function run_GroupKMaxPooling_updateOutput_batch(b, n, d, k) 45 | -- b = batch size 46 | -- n = number of words 47 | -- d = dimension of embeddings 48 | -- k = k-max pooling 49 | local input = torch.randn(b, n, d) 50 | local kmax = nn.GroupKMaxPooling(k) 51 | 52 | local output = kmax:updateOutput(input) 53 | 54 | assert(output == kmax.output) 55 | assert(output:size(1) == b) 56 | assert(output:size(2) == k) 57 | assert(output:size(3) == input:size(3)) 58 | 59 | local norms = torch.norm(input, 2, 3):squeeze() 60 | local _, kmax_indices = torch.sort(norms, 2) 61 | kmax_indices = kmax_indices:sub(1, -1, -k, -1) 62 | kmax_indices = torch.sort(kmax_indices, 2) 63 | 64 | local kmax_result = torch.Tensor(input:size(1), k, input:size(3)) 65 | kmax_result:fill(0.0) 66 | 67 | for i = 1, kmax_indices:size(1) do 68 | for j = 1, kmax_indices:size(2) do 69 | kmax_result:select(1, i):select(1, j):copy( 70 | input:select(1, i):select(1, kmax_indices[i][j])) 71 | end 72 | end 73 | 74 | assert(torch.sum(torch.eq(kmax_result, output)) == torch.numel(output)) 75 | end 76 | 77 | function test_GroupKMaxPooling_updateOutput_batch() 78 | run_GroupKMaxPooling_updateOutput_batch(15, 10, 11, 4) 79 | end 80 | 81 | function run_GroupKMaxPooling_updateGradInput(n, d, k) 82 | -- n = number of words 83 | -- d = dimension of embeddings 84 | -- k = k-max pooling 85 | local input = torch.randn(n, d) 86 | 87 | local kmax = nn.GroupKMaxPooling(k) 88 | 89 | local output = kmax:updateOutput(input) 90 | 91 | local delta = torch.randn(output:size()) 92 | 93 | local gradInput = kmax:updateGradInput(input, delta) 94 | 95 | 
assert(gradInput == kmax.gradInput) 96 | 97 | assert(gradInput:sum() == delta:sum()) 98 | end 99 | 100 | function test_GroupKMaxPooling_updateGradInput() 101 | run_GroupKMaxPooling_updateGradInput(10, 11, 4) 102 | end 103 | 104 | 105 | function run_GroupKMaxPooling_updateGradInput_batch(b, n, d, k) 106 | -- b = batch size, n = number of words 107 | -- d = dimension of embeddings 108 | -- k = k-max pooling 109 | local input = torch.randn(b, n, d) 110 | 111 | local kmax = nn.GroupKMaxPooling(k) 112 | 113 | local output = kmax:updateOutput(input) 114 | 115 | local delta = torch.randn(output:size()) 116 | 117 | local gradInput = kmax:updateGradInput(input, delta) 118 | 119 | assert(gradInput == kmax.gradInput) 120 | 121 | assert(gradInput:sum() == delta:sum()) 122 | end 123 | 124 | function test_GroupKMaxPooling_updateGradInput_batch() 125 | run_GroupKMaxPooling_updateGradInput_batch(12, 10, 11, 4) 126 | end 127 | 128 | LuaUnit:main() 129 | -------------------------------------------------------------------------------- /test/test_HSM.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'fbcunn' 5 | require 'fbnn' 6 | 7 | local function test_finite_diff_gradInput(model, input, target) 8 | local eps = 1e-3 9 | local output = model:updateOutput(input, target) 10 | local gradInput = model:updateGradInput(input, target):clone() 11 | 12 | local gradInput2 = torch.Tensor(input:size()) 13 | if input:dim() == 1 then 14 | for i = 1,input:size(1) do 15 | input[i] = input[i] + eps 16 | local outputP = model:updateOutput(input, target) 17 | input[i] = input[i] - 2*eps 18 | local outputM = model:updateOutput(input, target) 19 | input[i] = input[i] + eps 20 | gradInput2[i] = (outputP - outputM) / (2*eps) 21 | end 22 | else 23 | assert(input:dim() == 2) 24 | for i = 1,input:size(1) do 25 | for j = 1,input:size(2) do 26 | input[i][j] = input[i][j] + eps 27 | local outputP = model:updateOutput(input, target) 28 | input[i][j] = input[i][j] - 2*eps 29 | local outputM = model:updateOutput(input, target) 30 | input[i][j] = input[i][j] + eps 31 | gradInput2[i][j] = (outputP - outputM) / (2*eps) 32 | end 33 | end 34 | end 35 | return (gradInput - gradInput2):abs():max() 36 | end 37 | 38 | local function test_finite_diff_accGrads(model, input, target, scale) 39 | local eps = 1e-3 40 | scale = scale or 1 41 | 42 | local w, dw = model:getParameters() 43 | 44 | dw:zero() 45 | local output = model:updateOutput(input, target) 46 | local gradInput = model:updateGradInput(input, target):clone() 47 | model:accGradParameters(input, target, scale) 48 | local gradParams = dw:clone() 49 | 50 | local gradParams2 = torch.Tensor(w:size(1)) 51 | for i = 1,w:size(1) do 52 | w[i] = w[i] + eps 53 | local outputP = model:updateOutput(input, target) 54 | w[i] = w[i] - 2*eps 55 | local outputM = model:updateOutput(input, target) 56 | w[i] = w[i] + eps 57 | gradParams2[i] = scale * (outputP - outputM) / (2*eps) 58 | end 59 | 60 | return (gradParams - gradParams2):abs():max() 61 | end 62 | 63 | for i = 1,100 do 64 | print("Iteration " .. 
i) 65 | local n_clusters = torch.random(10) 66 | local n_class = torch.random(50) + n_clusters - 1 67 | local mapping = {} 68 | local n_class_in_cluster = {} 69 | for i = 1, n_class do 70 | local cluster = torch.random(n_clusters) 71 | n_class_in_cluster[cluster] = n_class_in_cluster[cluster] or 0 72 | n_class_in_cluster[cluster] = n_class_in_cluster[cluster] + 1 73 | mapping[i] = {cluster, n_class_in_cluster[cluster]} 74 | end 75 | for i = 1,n_clusters do 76 | if n_class_in_cluster[i] == nil then 77 | n_class_in_cluster[i] = 1 78 | mapping[1+#mapping] = {i, 1} 79 | n_class = n_class + 1 80 | end 81 | end 82 | local input_size = torch.random(100) + 1 83 | local model = nn.HSM(mapping, input_size) 84 | 85 | local input = torch.randn(input_size) 86 | local target = torch.LongTensor(1) 87 | target[1] = torch.random(n_class) 88 | local err = test_finite_diff_gradInput(model, input, target) 89 | assert(err < 1e-2) 90 | err = test_finite_diff_accGrads(model, input, target) 91 | assert(err < 1e-2) 92 | local scale = torch.rand(1)[1] 93 | err = test_finite_diff_accGrads(model, input, target, scale) 94 | assert(err < 1e-2) 95 | 96 | local batch_size = torch.random(10) 97 | input = torch.randn(batch_size, input_size) 98 | target = torch.LongTensor(batch_size) 99 | for i = 1, batch_size do 100 | target[i] = torch.random(n_class) 101 | end 102 | err = test_finite_diff_gradInput(model, input, target); 103 | assert(err < 1e-2) 104 | err = test_finite_diff_accGrads(model, input, target) 105 | assert(err < 1e-2) 106 | err = test_finite_diff_accGrads(model, input, target, scale) 107 | assert(err < 1e-2) 108 | 109 | -- test directUpdate 110 | local w, dw = model:getParameters() 111 | dw:normal() 112 | local initdw = dw:clone() 113 | model:updateOutput(input, target) 114 | model:updateGradInput(input, target) 115 | model:accGradParameters(input, target, scale, false) 116 | local w1 = w:clone():add(dw) 117 | model:updateOutput(input, target) 118 | model:updateGradInput(input, target) 119 | model:accGradParameters(input, target, scale, true) 120 | w:add(initdw) 121 | err = w:add(-1, w1):abs():max() 122 | assert(err < 1e-5) 123 | end 124 | -------------------------------------------------------------------------------- /test/test_LinearNB.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'cunn' 3 | require 'fbtorch' 4 | require 'fbcunn' 5 | 6 | local mytester = torch.Tester() 7 | local LinearNBTest = {} 8 | local jac = nn.Jacobian 9 | 10 | local precision = 1e-5 11 | 12 | function testAll(targettype) 13 | targettype = targettype or 'torch.DoubleTensor' 14 | local ini = math.random(3,5) 15 | local inj_vals = {math.random(3,5), 1} -- Also test the inj = 1 spatial case 16 | local input = torch.Tensor(ini):zero():type(targettype) 17 | 18 | for ind, inj in pairs(inj_vals) do 19 | local module = nn.LinearNB(ini, inj) 20 | if targettype == 'torch.CudaTensor' then 21 | module = module:cuda() 22 | end 23 | 24 | -- 1D 25 | local err = jac.testJacobian(module, input) 26 | mytester:assertlt(err, precision, 'error on state ') 27 | 28 | local err = jac.testJacobianParameters(module, input, module.weight, 29 | module.gradWeight) 30 | mytester:assertlt(err, precision, 'error on weight ') 31 | 32 | local err = jac.testJacobianUpdateParameters(module, input, module.weight) 33 | mytester:assertlt(err, precision, 'error on weight [direct update] ') 34 | 35 | for t,err in pairs(jac.testAllUpdate(module, input, 36 | 'weight', 'gradWeight')) do 37 | 
mytester:assertlt(err, precision, string.format( 38 | 'error on weight [%s]', t)) 39 | end 40 | 41 | -- 2D 42 | local nframe = math.random(50,70) 43 | local input = torch.Tensor(nframe, ini):zero():type(targettype) 44 | 45 | local err = jac.testJacobian(module,input) 46 | mytester:assertlt(err, precision, 'error on state ') 47 | 48 | local err = jac.testJacobianParameters(module, input, module.weight, 49 | module.gradWeight) 50 | mytester:assertlt(err,precision, 'error on weight ') 51 | 52 | local err = jac.testJacobianUpdateParameters(module, input, module.weight) 53 | mytester:assertlt(err,precision, 'error on weight [direct update] ') 54 | 55 | for t,err in pairs(jac.testAllUpdate(module, input, 56 | 'weight', 'gradWeight')) do 57 | mytester:assertlt(err, precision, string.format( 58 | 'error on weight [%s]', t)) 59 | end 60 | 61 | -- IO 62 | local ferr,berr = jac.testIO(module, input) 63 | mytester:asserteq(ferr, 0, torch.typename(module) 64 | .. ' - i/o forward err ') 65 | mytester:asserteq(berr, 0, torch.typename(module) 66 | .. ' - i/o backward err ') 67 | end 68 | end 69 | 70 | function LinearNBTest.testDouble() 71 | testAll() 72 | end 73 | 74 | 75 | mytester:add(LinearNBTest) 76 | mytester:run() 77 | -------------------------------------------------------------------------------- /test/test_OneBitDataParallel.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | require('fbcunn') 4 | require('fbnn') 5 | local TU = require('test.test_Util') 6 | local fboptim = require('fboptim') 7 | 8 | local function dp() 9 | return nn.OneBitDataParallel( 10 | 1, 11 | {momentum_rate=1.0, adagrad_learning_rate=1.0, min_elements=20} 12 | ) 13 | end 14 | 15 | 16 | function testDataParallelRunsForwardPass() 17 | local sim = TU.Sim { 18 | num_hidden = 2, 19 | output_width = 1, 20 | hidden_width = 512, 21 | input_width = 32, 22 | num_columns = 4, 23 | } 24 | 25 | local model, columns = sim:build_data_parallel(dp()) 26 | local inputs, _ = sim:gen_wide_example() 27 | local outputs = model:forward(inputs) 28 | 29 | for column_id = 1, sim.opts.num_columns do 30 | local column_input = sim:get_narrowed_input(inputs, column_id):double() 31 | print(column_input:size()) 32 | local gpu_output = outputs[{ {column_id} }] 33 | local cpu_output = columns[column_id]:forward(column_input) 34 | 35 | local norm_delta = TU.tensor_norm_difference(gpu_output, cpu_output) 36 | 37 | print(column_input:size(), gpu_output:size(), cpu_output:size()) 38 | print(gpu_output:norm(), cpu_output:norm()) 39 | assertTrue(norm_delta < 1E-5) 40 | end 41 | end 42 | 43 | function testDataParallelOnForwardPassIsEquivalentToSeparateColumns() 44 | local sim = TU.Sim { 45 | num_hidden = 2, 46 | output_width = 1, 47 | hidden_width = 512, 48 | input_width = 32, 49 | num_columns = 4, 50 | } 51 | 52 | local model, columns = sim:build_data_parallel(dp()) 53 | local inputs, _ = sim:gen_wide_example() 54 | local outputs = model:forward(inputs) 55 | 56 | for column_id = 1, sim.opts.num_columns do 57 | local column_input = sim:get_narrowed_input(inputs, column_id):double() 58 | print(column_input:size()) 59 | local gpu_output = outputs[{ {column_id} }] 60 | local cpu_output = columns[column_id]:forward(column_input) 61 | 62 | local norm_delta = 63 | TU.tensor_norm_difference(gpu_output, cpu_output) 64 | 65 | print(column_input:size(), gpu_output:size(), cpu_output:size()) 66 | print(gpu_output:norm(), cpu_output:norm()) 67 | assertTrue(norm_delta < 1E-5) 68 | end 69 | 
end 70 | 71 | function testDataParallelOnOptimLearns() 72 | local sim = TU.Sim { 73 | num_hidden = 1, 74 | output_width = 1, 75 | hidden_width = 500, 76 | input_width = 5, 77 | num_columns = 4, 78 | num_opt_rounds = 2, 79 | } 80 | 81 | local optim_state = { 82 | learningRate = 1e-1, 83 | weightDecay = 1e-4, 84 | momentum = 0.9, 85 | learningRateDecay = 1e-7 86 | } 87 | 88 | local model, _columns = sim:build_data_parallel(dp()) 89 | local opt = nn.Optim(model, optim_state) 90 | local criterion = nn.MSECriterion():cuda() 91 | 92 | for round = 1,sim.opts.num_opt_rounds do 93 | local inputs, targets = sim:gen_wide_example() 94 | local _outputs = model:forward(inputs) 95 | opt:optimize(fboptim.sgd, inputs, targets, criterion) 96 | local out = model:forward(inputs) 97 | print(out) 98 | local err = criterion:forward(out, targets) 99 | print(round, err) 100 | end 101 | end 102 | 103 | LuaUnit:main() 104 | -------------------------------------------------------------------------------- /test/test_OneBitQuantization.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require('fb.luaunit') 4 | require('fbcunn') 5 | 6 | local num_tries = 10 7 | 8 | function testExactQuantization() 9 | for tries = 1, num_tries do 10 | local q = nn.OneBitQuantization() 11 | local t = torch.CudaTensor(torch.random(50), torch.random(50)) 12 | 13 | print('Quantizing ' .. t:size(1) .. ' x ' .. t:size(2)) 14 | 15 | -- We will get exact quantization if there is only one positive 16 | -- and one negative value in each row. 17 | for row = 1, t:size(1) do 18 | local pos_value = torch.uniform(10) 19 | local neg_value = -torch.uniform(10) 20 | 21 | for col = 1, t:size(2) do 22 | local val = pos_value 23 | 24 | if torch.bernoulli(0.5) == 0 then 25 | val = neg_value 26 | end 27 | 28 | t[row][col] = val 29 | end 30 | end 31 | 32 | local quantized = q:quantize(t) 33 | local dequantized = 34 | q:dequantize(quantized, q.avg_pos, q.avg_neg, t:size(2)) 35 | 36 | assertTrue((dequantized:float() - t:float()):abs():max() < 1e-5) 37 | end 38 | end 39 | 40 | function testErrorDecaysToZero() 41 | for tries = 1, num_tries do 42 | -- In order to show that quantization error works, we should be 43 | -- able to send a matrix via quantization, and then successfully 44 | -- send the zero matrix. 45 | -- For each successive pass, the quantization error should diminish 46 | -- and on the receiving side, we should get something that approximates 47 | -- the original matrix. 48 | local q = nn.OneBitQuantization() 49 | 50 | -- Send two matrices 51 | local orig1 = 52 | torch.randn(10 + torch.random(30), 10 + torch.random(30)):cuda() 53 | local orig2 = torch.randn(orig1:size(1), orig1:size(2)):cuda() 54 | 55 | -- This is the signal that we wish to approximate 56 | local orig = orig1:float() + orig2:float() 57 | 58 | print('Quantizing ' .. orig:size(1) .. ' x ' .. orig:size(2)) 59 | 60 | -- pass `orig1` 61 | local quantized = q:quantize(orig1) 62 | local dequantized = 63 | q:dequantize(quantized, q.avg_pos, q.avg_neg, orig1:size(2)) 64 | 65 | -- dequantized will become the approximation to `orig` 66 | local approx = dequantized:float() 67 | 68 | -- pass `orig2` 69 | quantized = q:quantize(orig2) 70 | dequantized = 71 | q:dequantize(quantized, q.avg_pos, q.avg_neg, orig2:size(2)) 72 | approx:add(dequantized:float()) 73 | 74 | -- Now, after sending some signal, we will pass 0 a couple of times, in 75 | -- order to flush the quantization error. 
The number of passes required is 76 | -- related to the size of the original matrix and is also dependent upon 77 | -- floating point precision. 78 | local zeros = torch.CudaTensor(orig:size(1), orig:size(2)) 79 | zeros:zero() 80 | 81 | for passes = 1, 100 do 82 | quantized = q:quantize(zeros) 83 | dequantized = 84 | q:dequantize(quantized, q.avg_pos, q.avg_neg, zeros:size(2)) 85 | 86 | approx:add(dequantized:float()) 87 | end 88 | 89 | assertTrue((orig:float() - approx):abs():max() < 5e-4) 90 | end 91 | end 92 | 93 | LuaUnit:main() 94 | -------------------------------------------------------------------------------- /test/test_OneBitSGD.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | require('cutorch') 4 | require('cunn') 5 | require('fbcunn') 6 | 7 | local TU = require('test.test_Util') 8 | local pl = require('pl.import_into')() 9 | local _fbd = require('fb.debugger') 10 | local OBSGD = require('fbcunn.OneBitSGD') 11 | 12 | torch.setdefaulttensortype('torch.CudaTensor') 13 | 14 | function testQuantizerOnSimpleExample() 15 | local gradient = torch.Tensor({{1}, {-1}}) 16 | local accumulated = torch.Tensor():typeAs(gradient):resizeAs(gradient) 17 | local quantizer = nn.OneBitQuantization() 18 | local quantized, avg_pos, avg_neg = 19 | OBSGD.quantize_gradient( 20 | gradient, quantizer, accumulated) 21 | TU.assertTensorEquals(quantizer.quantization_error, quantizer.quantization_error:clone():zero()) 22 | TU.assertTensorEquals(avg_pos, torch.Tensor({{1}, {0}})) 23 | TU.assertTensorEquals(avg_neg, torch.Tensor({{0}, {-1}})) 24 | end 25 | 26 | function testQuantizationReducesNormOfMatrix() 27 | for _ = 1,50 do 28 | local gradient = torch.Tensor(100, 20):normal() 29 | local accumulated = torch.Tensor():typeAs(gradient):resizeAs(gradient) 30 | local quantizer = nn.OneBitQuantization() 31 | 32 | OBSGD.quantize_gradient( 33 | gradient, quantizer, accumulated) 34 | assertTrue(gradient:norm() > quantizer.quantization_error:norm()) 35 | end 36 | end 37 | 38 | local function build_agg() 39 | return OBSGD.OneBitAggregator( 40 | {momentum_rate=1.0, adagrad_learning_rate=1.0}, 41 | function() return torch.Tensor(5, 5):zero() end, 42 | function(dst, src) dst:copy(src) end, 43 | 1 44 | ) 45 | end 46 | 47 | 48 | function testOBSGDSmoothing() 49 | local agg = build_agg() 50 | 51 | local smoothed = 52 | agg:_smooth_gradient(agg.gradient_tensor_factory():fill(1)) 53 | TU.assertTensorEquals(smoothed, agg.gradient_tensor_factory():fill(1)) 54 | end 55 | 56 | function testOBSGDAveraging() 57 | local agg = build_agg() 58 | local num_columns = 5 59 | local gradients = pl.List.range(num_columns):map( 60 | function(i) return agg.gradient_tensor_factory():fill(i) end) 61 | local averaged_gradients = agg:_accumulate_quantized_gradients(gradients) 62 | TU.assertTensorEquals( 63 | averaged_gradients, 64 | agg.gradient_tensor_factory():fill((num_columns+1) / 2) 65 | ) 66 | end 67 | 68 | function testOBSGDAggregation() 69 | local agg = build_agg() 70 | 71 | local num_columns = 5 72 | local gradients = pl.List.range(num_columns):map( 73 | function(i) return agg.gradient_tensor_factory():fill(i) end) 74 | 75 | local averaged_gradients = agg:_accumulate_quantized_gradients(gradients) 76 | TU.assertTensorEquals( 77 | averaged_gradients, 78 | agg.gradient_tensor_factory():fill((num_columns+1) / 2) 79 | ) 80 | end 81 | 82 | 83 | function testOBSGDEndToEnd() 84 | local agg = build_agg() 85 | 86 | local num_columns = 5 87 | local gradients = 
pl.List.range(num_columns):map( 88 | function(i) return agg.gradient_tensor_factory():fill(i) end) 89 | 90 | local gradients_to_run = gradients:map(function(t) return t:clone() end) 91 | 92 | agg:run(gradients_to_run) 93 | 94 | TU.assertTensorEquals( 95 | agg.home_quantizer.quantization_error, 96 | agg.gradient_tensor_factory():zero() 97 | ) 98 | TU.assertTensorEquals( 99 | agg.adagrad_history, 100 | agg.gradient_tensor_factory():fill(9) 101 | ) 102 | TU.assertTensorEquals( 103 | agg.momentum_history, 104 | agg.gradient_tensor_factory():fill(1) 105 | ) 106 | 107 | -- Gradients get quantized to zero 108 | local after_expected = agg.gradient_tensor_factory():fill(1) 109 | for _, t in ipairs(pl.tablex.zip(gradients, gradients_to_run)) do 110 | local _before, after = table.unpack(t) 111 | TU.assertTensorEquals(after, after_expected) 112 | end 113 | end 114 | 115 | function testMomentumWorks() 116 | local gradient = torch.Tensor({1}) 117 | local momentum_history = torch.Tensor({50}) 118 | local momentum_rate = 5.0 119 | local new_gradient = 120 | OBSGD.momentum(gradient, momentum_rate, momentum_history) 121 | TU.assertTensorAlmostEquals(new_gradient, torch.Tensor({251})) 122 | end 123 | 124 | LuaUnit:main() 125 | -------------------------------------------------------------------------------- /test/test_SequentialCriterion.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'fb.luaunit' 5 | require 'nn' 6 | require 'fbcunn' 7 | require 'fbnn' 8 | 9 | local test_repeats = 100 10 | 11 | local function testSequentialCriterion_run(input_size, n_classes, 12 | module, crit, targettype) 13 | module = module:clone() 14 | crit = crit:clone() 15 | local modcrit = nn.SequentialCriterion(module:clone(), crit:clone()) 16 | targettype = targettype or torch.Tensor():type() 17 | 18 | local batch_size = torch.random(100) 19 | local input = torch.rand(batch_size, input_size) 20 | local target = 21 | torch.rand(batch_size):mul(n_classes):add(1):floor():type(targettype) 22 | 23 | local output1 = modcrit:forward(input, target) 24 | local z2 = module:forward(input) 25 | local output2 = crit:forward(z2, target) 26 | assertTrue(math.abs(output1-output2) < 1e-5) 27 | 28 | local gradInput1 = modcrit:updateGradInput(input, target) 29 | local derr_do2 = crit:updateGradInput(z2, target) 30 | local gradInput2 = module:updateGradInput(input, derr_do2) 31 | assertTrue(gradInput1:clone():add(-1, gradInput2):abs():max() < 1e-5) 32 | 33 | modcrit:zeroGradParameters() 34 | module:zeroGradParameters() 35 | if crit.zeroGradParameters then 36 | crit:zeroGradParameters() 37 | end 38 | modcrit:accGradParameters(input, target) 39 | if crit.accGradParameters then 40 | crit:accGradParameters(z2, target) 41 | end 42 | module:accGradParameters(input, derr_do2) 43 | modcrit:updateParameters(1) 44 | if crit.updateParameters then 45 | crit:updateParameters(1) 46 | end 47 | module:updateParameters(1) 48 | local output1 = modcrit:forward(input, target) 49 | local z2 = module:forward(input) 50 | local output2 = crit:forward(z2, target) 51 | assertTrue(math.abs(output1-output2) < 1e-5) 52 | end 53 | 54 | local function make_HSM(n_clusters, n_class, input_size) 55 | local mapping = {} 56 | local n_class_in_cluster = {} 57 | for i = 1, n_class do 58 | local cluster = torch.random(n_clusters) 59 | n_class_in_cluster[cluster] = n_class_in_cluster[cluster] or 0 60 | n_class_in_cluster[cluster] = 
n_class_in_cluster[cluster] + 1 61 | mapping[i] = {cluster, n_class_in_cluster[cluster]} 62 | end 63 | for i = 1,n_clusters do 64 | if n_class_in_cluster[i] == nil then 65 | n_class_in_cluster[i] = 1 66 | mapping[1+#mapping] = {i, 1} 67 | n_class = n_class + 1 68 | end 69 | end 70 | return nn.HSM(mapping, input_size) 71 | end 72 | 73 | function testSequentialCriterion() 74 | for i = 1, test_repeats do 75 | -- try with NLL 76 | local input_size = torch.random(200) 77 | local n_classes = torch.random(200) 78 | local module = nn.Linear(input_size, n_classes) 79 | local crit = nn.ClassNLLCriterion() 80 | testSequentialCriterion_run(input_size, n_classes, module, 81 | crit, 'torch.LongTensor') 82 | 83 | -- try with HSM 84 | local input1_size = torch.random(200) 85 | local input2_size = torch.random(200) 86 | local n_classes = torch.random(200) 87 | local module = nn.Sequential() 88 | module:add(nn.Linear(input1_size, input2_size)) 89 | module:add(nn.Threshold()) 90 | local crit = make_HSM(20, n_classes, input2_size) 91 | testSequentialCriterion_run(input1_size, n_classes, module, 92 | crit, 'torch.LongTensor') 93 | end 94 | end 95 | 96 | LuaUnit:main() 97 | -------------------------------------------------------------------------------- /test/test_SparseNLLCriterion.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'fb.luaunit' 5 | require 'cutorch' 6 | require 'nn' 7 | require 'fbcunn' 8 | require 'fbnn' 9 | 10 | local test_repeats = 5 11 | 12 | local function test_finite_diff_gradInput(model, input, target) 13 | local eps = 1e-3 14 | local output = model:updateOutput(input, target) 15 | local gradInput = model:updateGradInput(input, target):clone() 16 | 17 | local gradInput2 = torch.Tensor(input:size()) 18 | local outputP = torch.Tensor(output:size()) 19 | local outputM = torch.Tensor(output:size()) 20 | if input:dim() == 1 then 21 | for i = 1,input:size(1) do 22 | input[i] = input[i] + eps 23 | outputP:copy(model:updateOutput(input, target)) 24 | input[i] = input[i] - 2*eps 25 | outputM:copy(model:updateOutput(input, target)) 26 | input[i] = input[i] + eps 27 | gradInput2[i] = outputP:add(-1, outputM):div(2*eps) 28 | end 29 | else 30 | assert(input:dim() == 2) 31 | for i = 1,input:size(1) do 32 | for j = 1,input:size(2) do 33 | input[i][j] = input[i][j] + eps 34 | outputP:copy(model:updateOutput(input, target)) 35 | input[i][j] = input[i][j] - 2*eps 36 | outputM:copy(model:updateOutput(input, target)) 37 | input[i][j] = input[i][j] + eps 38 | gradInput2[i][j] = outputP:add(-1, outputM):div(2*eps) 39 | end 40 | end 41 | end 42 | gradInput2 = gradInput2:type(input:type()) 43 | return gradInput:add(-1, gradInput2):abs():max() 44 | end 45 | 46 | local function test_sparseNLL(K, n_classes, batch_size, cuda) 47 | local crit = nn.SparseNLLCriterion(K) 48 | local input = torch.randn(batch_size, n_classes) 49 | local targetP = torch.randn(batch_size, K):abs() 50 | local targetIdx = torch.LongTensor(batch_size, K) 51 | if cuda then 52 | crit = crit:cuda() 53 | input = input:cuda() 54 | targetP = targetP:cuda() 55 | targetIdx = torch.CudaTensor(targetIdx:size()):copy(targetIdx) 56 | end 57 | for i = 1, batch_size do 58 | targetP[i]:div(targetP[i]:sum()) 59 | local p = torch.randperm(n_classes) 60 | targetIdx[i]:copy(p[{{1,K}}]) 61 | end 62 | -- fprop 63 | local output_test = 0 64 | for i = 1, batch_size do 65 | for j = 1, K do 66 | output_test = output_test - 
input[i][targetIdx[i][j] ] * targetP[i][j] 67 | end 68 | end 69 | output_test = output_test / batch_size 70 | local fprop_err = 71 | math.abs(output_test - crit:forward(input, {targetP, targetIdx})[1]) 72 | 73 | --bprop 74 | local bprop_err = 75 | test_finite_diff_gradInput(crit, input, {targetP, targetIdx}) 76 | 77 | return fprop_err, bprop_err 78 | end 79 | 80 | function testSparseNLLCriterion() 81 | for k = 1, test_repeats do 82 | local n_classes = torch.random(100) 83 | local K = torch.random(n_classes) 84 | local batch_size = torch.random(32) 85 | local err1, err2 = test_sparseNLL(K, n_classes, batch_size, false) 86 | assertTrue(err1 < 1e-3) 87 | assertTrue(err2 < 1e-3) 88 | local err1, err2 = test_sparseNLL(K, n_classes, batch_size, true) 89 | assertTrue(err1 < 1e-3) 90 | assertTrue(err2 < 1e-3) 91 | end 92 | end 93 | 94 | LuaUnit:main() 95 | -------------------------------------------------------------------------------- /test/test_Util.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | require('cunn') 4 | require('optim') 5 | 6 | local M = {} 7 | 8 | local pl = require('pl.import_into')() 9 | 10 | function M.tensor_norm_difference(l, r) 11 | return math.abs(l:norm() - r:norm()) 12 | end 13 | 14 | function M.assertTensorEquals(a, b) 15 | assertEquals(0, (a:clone():add(b:clone():mul(-1))):abs():sum()) 16 | end 17 | 18 | function M.assertTensorAlmostEquals(a, b, eps) 19 | assertTrue((a:clone():add(b:clone():mul(-1))):norm() < (eps or 1E-10)) 20 | end 21 | 22 | local Sim = pl.class() 23 | M.Sim = Sim 24 | 25 | function Sim:_init(opts) 26 | self.opts = opts 27 | end 28 | 29 | function Sim:build_column() 30 | local seq = nn.Sequential() 31 | local pred = self.opts.input_width 32 | for i = 1,self.opts.num_hidden do 33 | seq:add(nn.Linear(pred, self.opts.hidden_width)) 34 | seq:add(nn.Tanh()) 35 | pred = self.opts.hidden_width 36 | end 37 | seq:add(nn.Linear(self.opts.hidden_width, self.opts.output_width)) 38 | seq:add(nn.Tanh()) 39 | return seq 40 | end 41 | 42 | function Sim:build_data_parallel(dp) 43 | local num_gpus = cutorch.getDeviceCount() 44 | local columns = {} 45 | 46 | for column_id = 1,self.opts.num_columns do 47 | local gpu_id = column_id % num_gpus 48 | if gpu_id == 0 then gpu_id = num_gpus end 49 | print(gpu_id) 50 | cutorch.withDevice( 51 | gpu_id, 52 | function() 53 | local column = self:build_column() 54 | table.insert(columns, column:clone()) 55 | dp:add(column:clone(), gpu_id) 56 | end 57 | ) 58 | end 59 | return dp:cuda(), columns 60 | end 61 | 62 | function Sim:target_function(x) 63 | -- admittedly tough for us to learn, but hey. 
64 | local retval = torch.Tensor(self.opts.output_width) 65 | local sum = x:sum() 66 | retval[1] = math.sin(sum) 67 | return retval 68 | end 69 | 70 | function Sim:gen_wide_input() 71 | return torch.randn(self.opts.input_width * self.opts.num_columns) 72 | end 73 | 74 | function Sim:get_narrowed_input_range(i) 75 | assert(type(i) == 'number') 76 | local range_start = 1 + ((i - 1) * self.opts.input_width) 77 | local range_end = range_start + (self.opts.input_width) - 1 78 | return range_start, range_end 79 | end 80 | 81 | function Sim:get_narrowed_input(input, i) 82 | assert(torch.typename(input)) 83 | assert(type(i) == 'number') 84 | return input[{ {self:get_narrowed_input_range(i)} }] 85 | end 86 | 87 | function Sim:gen_wide_example() 88 | local samp = self:gen_wide_input() 89 | local retval = torch.Tensor(self.opts.output_width * self.opts.num_columns) 90 | for i = 1,self.opts.num_columns do 91 | retval[i] = self:target_function(self:get_narrowed_input(samp, i)) 92 | end 93 | return samp:cuda(), retval:cuda() 94 | end 95 | 96 | return M 97 | -------------------------------------------------------------------------------- /test/test_WeightedLookupTable.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require('fbtorch') 4 | require('fb.luaunit') 5 | require('fbcunn') 6 | 7 | require('nn') 8 | 9 | local function all(tensor) 10 | return torch.sum(torch.ne(tensor, 0)) == tensor:numel() 11 | end 12 | 13 | local function almost_equal(t1, t2, tol) 14 | return torch.lt(torch.abs(t1 - t2), tol) 15 | end 16 | 17 | -- w = weighted 18 | -- u = unweighted 19 | -- e.g. 20 | -- wlut = weighted lookup table 21 | -- ulut = unweighted lookup table 22 | 23 | function test_WeightedLookupTable_forward() 24 | local embedding_dim = 4 25 | local table_size = 30 26 | local input_length = 9 27 | local tol = 1e-8 28 | 29 | local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() 30 | local ulut = nn.LookupTable(table_size, embedding_dim):cuda() 31 | ulut.weight:copy(wlut.weight) 32 | assert(all(torch.eq(wlut.weight, ulut.weight))) 33 | 34 | local uinput = torch.rand(input_length):mul(table_size):ceil() 35 | local weights = torch.rand(input_length, 1) 36 | local winput = torch.cat(uinput, weights, 2) 37 | 38 | local woutput = wlut:forward(winput:cuda()) 39 | local uoutput = ulut:forward(uinput:cuda()) 40 | weights = weights:cuda() 41 | local expected_woutput = torch.cmul(uoutput, weights:expandAs(uoutput)) 42 | 43 | assert(all(almost_equal(woutput:float(), expected_woutput:float(), tol))) 44 | end 45 | 46 | function test_WeightedLookupTable_accGradParameters() 47 | local embedding_dim = 4 48 | local table_size = 30 49 | local input_length = 9 50 | local tol = 1e-5 51 | 52 | local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() 53 | local ulut = nn.LookupTable(table_size, embedding_dim):cuda() 54 | ulut.weight:copy(wlut.weight) 55 | assert(all(torch.eq(wlut.weight, ulut.weight))) 56 | 57 | local uinput = torch.rand(input_length):mul(table_size):ceil() 58 | local weights = torch.range(1, input_length):reshape(input_length, 1) 59 | local winput = torch.cat(uinput, weights, 2) 60 | 61 | winput = winput:cuda() 62 | uinput = uinput:cuda() 63 | local woutput = wlut:forward(winput) 64 | local uoutput = ulut:forward(uinput) 65 | 66 | local wgradOutput = torch.randn(woutput:size()) 67 | local ugradOutput = torch.cmul(wgradOutput, weights:expandAs(wgradOutput)) 68 | 69 | 
wlut:accGradParameters(winput, wgradOutput:cuda(), 1) 70 | ulut:accGradParameters(uinput, ugradOutput:cuda(), 1) 71 | 72 | assert(all(almost_equal(wlut.gradWeight:float(), ulut.gradWeight:float(), tol))) 73 | end 74 | 75 | 76 | LuaUnit:main() 77 | --------------------------------------------------------------------------------
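A note on the one-bit scheme the tests above exercise: the error-feedback idea behind test_OneBitQuantization.lua and test_OneBitSGD.lua can be sketched in a few lines of plain torch. The sketch below is illustrative only, assuming per-row one-bit quantization that dequantizes positive entries to the row's average positive value and non-positive entries to the average negative value, as the tests describe; dequantizedRow and the surrounding loop are hypothetical helpers, not fbcunn API.

require 'torch'

-- Dequantize a row as one-bit quantization would: every positive entry
-- becomes the row's average positive value, every non-positive entry the
-- row's average non-positive value.
local function dequantizedRow(row)
   local posMask = row:gt(0)
   local negMask = row:le(0)
   local out = row:clone()
   if posMask:sum() > 0 then out[posMask] = row[posMask]:mean() end
   if negMask:sum() > 0 then out[negMask] = row[negMask]:mean() end
   return out
end

local rows, cols = 4, 8
local orig = torch.randn(rows, cols)      -- the signal we want to transmit
local err = torch.zeros(rows, cols)       -- carried quantization error
local received = torch.zeros(rows, cols)  -- receiver's running sum

-- Send `orig` once, then keep sending zeros; folding the carried error
-- into each payload lets the receiver's running sum approach `orig`.
for pass = 1, 50 do
   local payload = (pass == 1) and orig:clone() or torch.zeros(rows, cols)
   payload:add(err)
   for r = 1, rows do
      local deq = dequantizedRow(payload[r])
      err[r]:copy(payload[r] - deq)       -- remember what quantization lost
      received[r]:add(deq)
   end
end

-- The residual shrinks as the carried error is flushed over the passes,
-- which is the property testErrorDecaysToZero asserts.
print(('max abs residual: %g'):format((received - orig):abs():max()))

Quantizing the carried error together with each new payload is what makes the receiver's accumulated sum converge to the original signal even though each individual message is only one bit per entry plus two per-row averages.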