├── .dokx
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── INSTALL.md
├── LICENSE
├── PATENTS
├── README.md
├── cmake
│   ├── FindFolly.cmake
│   ├── FindGlog.cmake
│   ├── FindTHPP.cmake
│   └── MultiLevelIncludes.cmake
├── docs.sh
├── fbcunn
│   ├── AbstractParallel.lua
│   ├── BatchNormalization.lua
│   ├── CuBLASWrapper.lua
│   ├── DataParallel.lua
│   ├── FFTCDefs.lua
│   ├── FFTWrapper.lua
│   ├── FeatureLPPooling.lua
│   ├── HalfPrecision.lua
│   ├── LookupTableGPU.lua
│   ├── ModelParallel.lua
│   ├── OneBitDataParallel.lua
│   ├── OneBitQuantization.lua
│   ├── OneBitSGD.lua
│   ├── SpatialBatchNormalization.lua
│   ├── SpatialConvolution.lua
│   ├── SpatialConvolutionCuFFT.lua
│   ├── SpatialConvolutionFBFFT.lua
│   ├── SpatialConvolutionFBFFTGemm.lua
│   ├── SpatialConvolutionFFT.lua
│   ├── SpatialConvolutionFFTTiled.lua
│   ├── SpatialConvolutionFFTTiledAsync.lua
│   ├── SpatialConvolutionFFTTiledIterated.lua
│   ├── SpatialConvolutionFFTTiledSync.lua
│   ├── TemporalConvolutionFB.lua
│   ├── TemporalKMaxPooling.lua
│   └── init.lua
├── rocks
│   └── fbcunn-scm-1.rockspec
├── src
│   ├── BLASParameters.cpp
│   ├── BLASParameters.h
│   ├── BatchNormalization.cu
│   ├── ConvolutionBias.cu
│   ├── ConvolutionBias.cuh
│   ├── CrossMapNormalization.cu
│   ├── CrossMapNormalization.cuh
│   ├── CrossMapNormalizationHost.cpp
│   ├── CuBLASWrapper.cpp
│   ├── CuBLASWrapper.h
│   ├── CuBLASWrapperLua.cpp
│   ├── CudaTensorUtils.cpp
│   ├── CudaTensorUtils.h
│   ├── DeviceTensorUtils-inl.h
│   ├── DeviceTensorUtils.h
│   ├── FeatureLPPooling.cu
│   ├── FeatureLPPooling.cuh
│   ├── FeatureLPPoolingHost.cpp
│   ├── HSM.cu
│   ├── HSMHost.cpp
│   ├── HalfPrec.cpp
│   ├── HalfPrec.h
│   ├── HalfPrecKernels.cu
│   ├── HalfPrecTest.cpp
│   ├── Includes.h
│   ├── InitCuda.cpp
│   ├── LocallyConnected.cuh
│   ├── LocallyConnectedAccGradParameters.cu
│   ├── LocallyConnectedHost.cpp
│   ├── LocallyConnectedUpdateGradInput.cu
│   ├── LocallyConnectedUpdateOutput.cu
│   ├── LookupTableGPU.cu
│   ├── LookupTableGPUHost.cpp
│   ├── LuaUtils.h
│   ├── MM.cu
│   ├── MM.h
│   ├── OneBitQuantization.cu
│   ├── OneBitQuantization.cuh
│   ├── OneBitQuantizationHost.cpp
│   ├── SparseNLLCriterion.cu
│   ├── SparseNLLCriterion.cuh
│   ├── SparseNLLCriterionHost.cpp
│   ├── SpatialBatchNormalization.cu
│   ├── Storage.h
│   ├── TemporalConvolutionFBHost.cpp
│   ├── TemporalConvolutionTBC.cu
│   ├── TemporalConvolutionTBC.cuh
│   ├── TemporalConvolutionTBCHost.cpp
│   ├── TemporalKMaxPooling.cu
│   ├── TemporalKMaxPooling.cuh
│   ├── TemporalKMaxPoolingHost.cpp
│   ├── TemporalMaxPooling.cu
│   ├── Tensor.h
│   ├── Utils.cpp
│   ├── Utils.h
│   ├── WeightedLookupTable.cu
│   ├── WeightedLookupTableHost.cpp
│   ├── fft
│   │   ├── CuFFTConvolution.cpp
│   │   ├── CuFFTConvolution.cuh
│   │   ├── CuFFTConvolution_AccGradParameters.cu
│   │   ├── CuFFTConvolution_AccGradParameters.cuh
│   │   ├── CuFFTConvolution_UpdateGradInput.cu
│   │   ├── CuFFTConvolution_UpdateGradInput.cuh
│   │   ├── CuFFTConvolution_UpdateOutput.cu
│   │   ├── CuFFTConvolution_UpdateOutput.cuh
│   │   ├── CuFFTStrategy.h
│   │   ├── CuFFTWrapper.cu
│   │   ├── CuFFTWrapper.cuh
│   │   ├── FBFFTDevice.cu
│   │   ├── FBFFTHost.cpp
│   │   ├── FBFFTHost.h
│   │   ├── FFTIteratedConvolution.cu
│   │   ├── FFTWrapperLua.cpp
│   │   ├── SpatialConvolutionCuFFT.cpp
│   │   ├── SpatialConvolutionCuFFT.h
│   │   ├── SpatialConvolutionCuFFTHost.cpp
│   │   ├── SpatialConvolutionCuFFTTuner.cpp
│   │   ├── SpatialConvolutionCuFFTTuner.h
│   │   ├── Utils-inl.h
│   │   ├── Utils.cuh
│   │   └── Utils.h
│   ├── init.cu
│   └── util
│       ├── AsyncCopier.cpp
│       ├── AsyncCopier.h
│       ├── GlobalAsyncCopier.cpp
│       ├── GlobalAsyncCopier.h
│       ├── Misc.cpp
│       ├── Misc.h
│       ├── Transform.cu
│       └── Transform.cuh
└── test
    ├── BiasTest.cpp
    ├── ConvolutionTest.cpp
    ├── CuBLASTest.cpp
    ├── CudaTensorTest.cpp
    ├── CudaTensorTestKernels.cu
    ├── CudaTensorTestKernels.cuh
    ├── FFTTest.cpp
    ├── InputCentricConvolution_UpdateOutput.cu
    ├── InputCentricConvolution_UpdateOutput.cuh
    ├── ReferenceConvolutions.cpp
    ├── ReferenceConvolutions.h
    ├── TestUtils.cpp
    ├── TestUtils.h
    ├── benchmark_fft.lua
    ├── fb_test.lua
    ├── run_test_HSM_seed.sh
    ├── test.lua
    ├── test_BatchNormalization.lua
    ├── test_ClassHierarchicalNLLCriterion.lua
    ├── test_CuBLAS.lua
    ├── test_CuFFT.lua
    ├── test_DataParallel.lua
    ├── test_FBFFTTiling.lua
    ├── test_FFT.lua
    ├── test_FFTModule.lua
    ├── test_FeatureLPPooling.lua
    ├── test_GroupKMaxPooling.lua
    ├── test_HSM.lua
    ├── test_HSM_cuda.lua
    ├── test_HSM_speed.lua
    ├── test_KMaxPooling.lua
    ├── test_LinearNB.lua
    ├── test_LocallyConnected.lua
    ├── test_LookupTableGPU.lua
    ├── test_ModelParallel.lua
    ├── test_OneBitDataParallel.lua
    ├── test_OneBitQuantization.lua
    ├── test_OneBitSGD.lua
    ├── test_SequentialCriterion.lua
    ├── test_SparseModules.lua
    ├── test_SparseNLLCriterion.lua
    ├── test_SpatialConvolutionTuned.lua
    ├── test_TemporalConvolutionFB.lua
    ├── test_TemporalKMaxPooling.lua
    ├── test_Threshold.lua
    ├── test_Util.lua
    ├── test_VolumetricMaxPooling.lua
    ├── test_WeightedLookupTable.lua
    └── volumetric_average_pooling_test.lua

/.dokx:
--------------------------------------------------------------------------------
1 | return {
2 |   packageName = 'fbcunn',
3 |   tocLevel = 'class',
4 |   githubURL = 'facebook/fbcunn',
5 |   exclude = {'**/init.lua', '**/test/*', '**/nn_layers.lua'}
6 | }
7 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | TARGETS
2 | facebook
3 | build
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "cuda"]
2 | 	path = cuda
3 | 	url = https://github.com/facebook/fbcuda.git
--------------------------------------------------------------------------------
/INSTALL.md:
--------------------------------------------------------------------------------
1 | - Find a machine with [Ubuntu 14.04+](http://www.ubuntu.com/) and an NVIDIA GPU with compute capability 3.5 or above
2 | 
3 | Then, install everything that is needed by using the instructions below:
4 | 
5 | Install CUDA
6 | =============
7 | ```bash
8 | sudo apt-get install build-essential
9 | ```
10 | 
11 | If you are using a Virtual Machine (like Amazon EC2 instances), install:
12 | ```bash
13 | sudo apt-get update
14 | sudo apt-get install linux-generic
15 | ```
16 | 
17 | Download the CUDA .deb file for Linux Ubuntu 14.04 64-bit from this page: https://developer.nvidia.com/cuda-downloads
18 | It will be a file with a name similar to this: cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
19 | Now, install it using:
20 | ```bash
21 | sudo dpkg -i cuda-repo-ubuntu1404-7-5-local_7.5-18_amd64.deb
22 | sudo apt-get update
23 | sudo apt-get install cuda
24 | echo "export PATH=/usr/local/cuda/bin/:\$PATH; export LD_LIBRARY_PATH=/usr/local/cuda/lib64/:\$LD_LIBRARY_PATH; " >>~/.bashrc && source ~/.bashrc
25 | ```
26 | 
27 | Restart your computer.
28 | 
29 | Install cuDNN
30 | - Go to https://developer.nvidia.com/cuDNN and use the Download button (you have to register and log in to download; there is no way around that).
31 | - Download cuDNN R3 for Linux.
You will download a file cudnn-7.0-linux-x64-v3.0-prod.tgz 32 | then use the commands: 33 | ```bash 34 | tar -xvf cudnn-7.0-linux-x64-v3.0-prod.tgz 35 | sudo cp cuda/include/*.h /usr/local/cuda/include 36 | sudo cp cuda/lib64/*.so* /usr/local/cuda/lib64 37 | ``` 38 | 39 | Install Torch Dependencies 40 | ========================== 41 | ```bash 42 | curl -sk https://raw.githubusercontent.com/torch/ezinstall/master/install-deps | bash -e 43 | ``` 44 | 45 | Install Torch in a local folder 46 | ================================ 47 | ```bash 48 | git clone https://github.com/torch/distro.git ~/torch --recursive 49 | cd ~/torch; ./install.sh 50 | ``` 51 | 52 | If you want to uninstall torch, you can use the command: `rm -rf ~/torch` 53 | 54 | Install Folly, fbthrift, thpp and fblualib 55 | ============================================ 56 | ```bash 57 | curl -sk https://raw.githubusercontent.com/soumith/fblualib/master/install_all.sh | bash -e 58 | ``` 59 | 60 | Install fbcunn 61 | ============== 62 | ```bash 63 | git clone https://github.com/torch/nn && ( cd nn && git checkout getParamsByDevice && luarocks make rocks/nn-scm-1.rockspec ) 64 | 65 | git clone https://github.com/facebook/fbtorch.git && ( cd fbtorch && luarocks make rocks/fbtorch-scm-1.rockspec ) 66 | 67 | git clone https://github.com/facebook/fbnn.git && ( cd fbnn && luarocks make rocks/fbnn-scm-1.rockspec ) 68 | 69 | # go get a coffee 70 | git clone https://github.com/facebook/fbcunn.git && ( cd fbcunn && luarocks make rocks/fbcunn-scm-1.rockspec ) 71 | ``` 72 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For fbcunn software 4 | 5 | Copyright (c) 2014, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /PATENTS: -------------------------------------------------------------------------------- 1 | Additional Grant of Patent Rights Version 2 2 | 3 | "Software" means the fbcunn software distributed by Facebook, Inc. 4 | 5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 7 | (subject to the termination provision below) license under any Necessary 8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 9 | transfer the Software. For avoidance of doubt, no license is granted under 10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 11 | to the Software made by you or any third party or (ii) the Software in 12 | combination with any software or other technology. 13 | 14 | The license granted hereunder will terminate, automatically and without notice, 15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 16 | directly or indirectly, or take a direct financial interest in, any Patent 17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 19 | in part from any software, technology, product or service of Facebook or any of 20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 23 | infringement against you in the first instance, and you respond by filing a 24 | patent infringement counterclaim in that lawsuit against that party that is 25 | unrelated to the Software, the license granted hereunder will not terminate 26 | under section (i) of this paragraph due to such counterclaim. 27 | 28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 29 | necessarily infringed by the Software standing alone. 30 | 31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 32 | or contributory infringement or inducement to infringe any patent, including a 33 | cross-claim or counterclaim. 34 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `fbcunn` - Deep Learning CUDA Extensions from Facebook AI Research. 2 | 3 | ## What? 4 | [Deep Learning](http://en.wikipedia.org/wiki/Deep_learning) is a popular kid in machine learning these days. 5 | At [Facebook AI Research](http://research.facebook.com/ai/) we've been doing quite a bit of deep learning research. 6 | This repository contains our highly engineered deep learning modules for GPUs, to accelerate your own deep learning endeavors. 7 | It plugs into the [Torch-7](https://github.com/torch/torch7/wiki/Cheatsheet) framework and installs seamlessly via `luarocks`, 8 | and is fully compatible with torch's [nn](https://github.com/torch/nn) package. 9 | 10 | In summary, we're releasing fast nn modules for Convnets and neural networks in general: 11 | - Fast spatial convolution modules that use FFT to accelerate convolutions. [We wrote a paper about them](http://arxiv.org/abs/1412.7580) if you'd like to read more. 12 | - Fast Temporal convolutions that are 1.5x to 10x faster compared to Torch's cunn implementations. 
13 | - nn.DataParallel and nn.ModelParallel containers. Plug your model in them and see it accelerate over multiple GPUs.
14 | - Wrappers to use FFT/IFFT as nn modules.
15 | - Fast LookupTable that is used for Neural Language Models and word embeddings. Much faster than the one in torch/nn.
16 | - Hierarchical SoftMax module; classifying 1 million classes is now a practically viable strategy.
17 | - LP and Max Pooling over feature maps (usable for MaxOut).
18 | - More goodies. Full documentation and spec are here: https://facebook.github.io/fbcunn/fbcunn/
19 | 
20 | Examples:
21 | - Training an ImageNet-based classifier in Torch-7 using multiple GPUs (showcasing our FFT convolutions as well as our ModelParallel container)
22 | 
23 | ## Why?
24 | We know that science and technology progress faster when researchers exchange ideas and tools. Making significant progress in AI will take the participation of the entire research community, and we want to do what we can to make the field progress faster. That is why we love open science and open source. We publish our research with open access, very often on [arXiv](http://arxiv.org), on [our members' web sites](http://research.facebook.com/ai), and eventually on the [FAIR publications page](https://research.facebook.com/publications/ai/). And we share our code right here!
25 | 
26 | ## Who is this for?
27 | This will help you if you want to train large-scale deep learning systems (particularly convolutional nets) for image recognition, NLP, or other applications. This will help you particularly well if you already are a Torch user.
28 | 
29 | ## How to install them?
30 | You will find step-by-step, detailed installation instructions in **[INSTALL.md](INSTALL.md)**
31 | 
32 | We've worked hard to make the install as pain-free as possible. If you have an issue, use GitHub issues; we'll try our best to help.
33 | 
34 | ## How to use them?
35 | 
36 | - The DataParallel and ModelParallel modules are super-simple to use. The unit test doubles as both an example and a test. There is also a practical example of ModelParallel in examples/imagenet. If you want more examples, please do ask.
37 | ```lua
38 | m = nn.DataParallel():add(nn.SpatialConvolution(...)):add(nn.ReLU()) -- see, so simple
39 | ```
40 | 
41 | - Convolution modules are even simpler to use. They are fully API compatible with their [nn equivalents](https://github.com/torch/nn/blob/master/doc/convolution.md). For an example, look at examples/imagenet and at the short sketch at the end of this README.
42 | ```lua
43 | conv = nn.SpatialConvolutionCuFFT(...) -- fast spatial convolutions!
44 | conv = nn.TemporalConvolutionFB(...) -- fast temporal convolutions!
45 | ```
46 | 
47 | - LookupTable is named `nn.LookupTableGPU` and Hierarchical SoftMax `nn.HSM`; they are super-simple to use as well, so check the docs out:
48 | 
49 | https://facebook.github.io/fbcunn/fbcunn/
50 | 
51 | The unit tests in the test/ folder also double as examples! If you have a question, do ask.
52 | 
53 | 
54 | ## I want exact details of everything...
55 | 
56 | API docs, generated with [torch-dokx](https://github.com/deepmind/torch-dokx), are available at http://facebook.github.io/fbcunn/fbcunn/
57 | 
58 | Some of the unit tests need [fbnn](https://github.com/facebook/fbnn).
59 | 
60 | ## License
61 | 
62 | `fbcunn` is BSD-licensed. We also provide an additional patent
63 | grant.
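
### A tiny usage sketch

A minimal, hedged sketch of the temporal convolution module referenced above; it is a drop-in for `nn.TemporalConvolution` per the API-compatibility note, and all sizes below are made up for illustration:

```lua
require 'fbcunn'

-- 128 input frame features, 256 output features, kernel width 3.
local conv = nn.TemporalConvolutionFB(128, 256, 3):cuda()

-- A batch of 16 sequences, each with 100 frames of 128 features.
local input = torch.CudaTensor(16, 100, 128):normal()

-- Same output-shape rule as nn.TemporalConvolution:
-- (100 - 3) / 1 + 1 = 98 output frames.
local output = conv:forward(input) -- 16 x 98 x 256
```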
64 | 
--------------------------------------------------------------------------------
/cmake/FindFolly.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | #
8 | # - Try to find folly
9 | # This will define
10 | # FOLLY_FOUND
11 | # FOLLY_INCLUDE_DIR
12 | # FOLLY_LIBRARIES
13 | 
14 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR)
15 | 
16 | INCLUDE(FindPackageHandleStandardArgs)
17 | 
18 | FIND_LIBRARY(FOLLY_LIBRARY folly)
19 | FIND_PATH(FOLLY_INCLUDE_DIR "folly/String.h")
20 | 
21 | SET(FOLLY_LIBRARIES ${FOLLY_LIBRARY})
22 | 
23 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(Folly
24 |   REQUIRED_VARS FOLLY_INCLUDE_DIR FOLLY_LIBRARIES)
25 | 
--------------------------------------------------------------------------------
/cmake/FindGlog.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | #
8 | # GLOG_FOUND
9 | # GLOG_INCLUDE_DIR
10 | # GLOG_LIBRARIES
11 | 
12 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR)
13 | 
14 | INCLUDE(FindPackageHandleStandardArgs)
15 | 
16 | FIND_LIBRARY(GLOG_LIBRARY glog)
17 | FIND_PATH(GLOG_INCLUDE_DIR "glog/logging.h")
18 | 
19 | SET(GLOG_LIBRARIES ${GLOG_LIBRARY})
20 | 
21 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(
22 |   Glog
23 |   REQUIRED_VARS GLOG_INCLUDE_DIR GLOG_LIBRARY)
24 | 
--------------------------------------------------------------------------------
/cmake/FindTHPP.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | #
8 | # - Try to find thpp
9 | # This will define
10 | # THPP_FOUND
11 | # THPP_INCLUDE_DIR
12 | # THPP_LIBRARIES
13 | 
14 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR)
15 | 
16 | INCLUDE(FindPackageHandleStandardArgs)
17 | 
18 | FIND_LIBRARY(THPP_LIBRARY thpp)
19 | FIND_PATH(THPP_INCLUDE_DIR "thpp/Tensor.h")
20 | 
21 | SET(THPP_LIBRARIES ${THPP_LIBRARY})
22 | 
23 | FIND_PACKAGE_HANDLE_STANDARD_ARGS(THPP
24 |   REQUIRED_VARS THPP_INCLUDE_DIR THPP_LIBRARIES)
25 | 
--------------------------------------------------------------------------------
/cmake/MultiLevelIncludes.cmake:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2014, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the BSD-style license found in the
5 | # LICENSE file in the root directory of this source tree. An additional grant
6 | # of patent rights can be found in the PATENTS file in the same directory.
7 | 
8 | # Some projects are installed individually as part of a larger tree, but
9 | # include paths always reference the full include path in the tree.
This 10 | # module makes it easier to do so. 11 | # 12 | # Suppose you have a source tree fblualib/thrift/submodule, which is built at 13 | # the submodule level (so you have fblualib/thrift/submodule/CMakeLists.txt) 14 | # Files inside submodule include each other (and files from other sibling 15 | # directories) with the full path: 16 | # 17 | # #include 18 | # #include 19 | # #include 20 | # #include 21 | # 22 | # MLI_SET_DEPTH(2) at the root of your CMakeLists.txt would set "../.." 23 | # as the include path (so fblualib is a subdirectory of that), making 24 | # the includes work. Also, it will set MLI_INCLUDE_OUTPUT_DIR and 25 | # MLI_INCLUDE_RELATIVE_OUTPUT_DIR to directories inside the build tree 26 | # where any generators should output header files so they can be found 27 | # via #include. (we recreate the lowest 2 levels of the hierarchy underneath 28 | # ${CMAKE_BINARY_DIR}) 29 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.7 FATAL_ERROR) 30 | 31 | FUNCTION(MLI_SET_DEPTH level) 32 | SET(dirs) 33 | SET(dir ${CMAKE_SOURCE_DIR}) 34 | SET(relinc) 35 | FOREACH(i RANGE 1 ${level}) 36 | GET_FILENAME_COMPONENT(bn ${dir} NAME) 37 | GET_FILENAME_COMPONENT(dir ${dir} PATH) 38 | LIST(APPEND dirs ${bn}) 39 | SET(relinc "${relinc}/..") 40 | ENDFOREACH() 41 | LIST(REVERSE dirs) 42 | STRING(REPLACE ";" "/" relpath "${dirs}") 43 | SET(MLI_INCLUDE_OUTPUT_DIR 44 | "${CMAKE_BINARY_DIR}/${relpath}" 45 | PARENT_SCOPE) 46 | SET(MLI_INCLUDE_RELATIVE_OUTPUT_DIR 47 | "${relpath}" 48 | PARENT_SCOPE) 49 | INCLUDE_DIRECTORIES( 50 | "${CMAKE_SOURCE_DIR}/${relinc}" 51 | "${CMAKE_BINARY_DIR}") 52 | ENDFUNCTION() 53 | -------------------------------------------------------------------------------- /docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | set -e 4 | DIR=/tmp/fbcunn 5 | rm -rf $DIR && mkdir -p $DIR 6 | CURDIR=$(pwd) 7 | echo $CURDIR 8 | ( 9 | cd $DIR 10 | dokx-build-package-docs -o . $CURDIR 11 | # Fix relative links in HTML to point to the CDN 12 | replace "../_highlight/highlight.pack.js" "//cdnjs.cloudflare.com/ajax/libs/highlight.js/8.4/highlight.min.js" 13 | replace "../_highlight/styles/github.css" "//cdnjs.cloudflare.com/ajax/libs/highlight.js/8.4/styles/github.min.css" 14 | 15 | git init 16 | git checkout -b gh-pages 17 | git add . 18 | git commit -m "Documentation" 19 | git remote add origin git@github.com:facebook/fbcunn.git 20 | git push --set-upstream origin gh-pages -f 21 | ) 22 | -------------------------------------------------------------------------------- /fbcunn/CuBLASWrapper.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 
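-- Added usage sketch (not part of the original file; the shapes and the
-- dimension tables below are illustrative only). `matmult` treats the
-- leading `iterDims` dimensions as sequential outer loops and the next
-- `batchDims` dimensions as a single batched GEMM call:
--
--   local blas = nn.CuBLASWrapper()
--   local A = torch.CudaTensor(16, 128, 64) -- 16 matrices of size 128 x 64
--   local B = torch.CudaTensor(16, 64, 32)
--   local C = torch.CudaTensor(16, 128, 32)
--   blas:matmult(A, B, C, {}, {1}) -- no iterated dims, one batched dim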
2 | 3 | local CuBLASWrapper = torch.class('nn.CuBLASWrapper') 4 | 5 | function CuBLASWrapper:__init(timed) 6 | self.iterDims = 0 7 | self.batchDims = 0 8 | self.handles = 0 9 | self.streams = 0 10 | self.timed = timed or false 11 | end 12 | 13 | function CuBLASWrapper:matmult( 14 | A, B, C, iterDims, batchDims, transA, transB, scale) 15 | self.transA = transA or 'n' 16 | self.transB = transB or 'n' 17 | self.iterDims = table.getn(iterDims) or 0 18 | self.batchDims = table.getn(batchDims) or 0 19 | self.scale = scale or 1.0 20 | A.nn.CuBLASWrapper_matmult(self, A, B, C) 21 | end 22 | 23 | function CuBLASWrapper:matmultComplex( 24 | A, B, C, iterDims, batchDims, transA, transB, scale) 25 | self.transA = transA or 'n' 26 | self.transB = transB or 'n' 27 | self.iterDims = table.getn(iterDims) or 0 28 | self.batchDims = table.getn(batchDims) or 0 29 | self.scale = scale or 1.0 30 | A.nn.CuBLASWrapper_matmultComplex(self, A, B, C) 31 | end 32 | 33 | function CuBLASWrapper:transpose( 34 | A, B, separator, transposeMetaData, handle, stream) 35 | self.separator = separator or 0 36 | self.transposeMetaData = transposeMetaData or false 37 | self.handle = handle or 1 -- always handle 1 by default 38 | self.stream = stream or 0 39 | A.nn.CuBLASWrapper_transpose(self, A, B) 40 | end 41 | 42 | function CuBLASWrapper:transposeComplex( 43 | A, B, separator, transposeMetaData, handle, stream) 44 | self.separator = separator or 0 45 | self.transposeMetaData = transposeMetaData or false 46 | self.handle = handle or 1 -- always handle 1 by default 47 | self.stream = stream or 0 48 | A.nn.CuBLASWrapper_transposeComplex(self, A, B) 49 | end 50 | -------------------------------------------------------------------------------- /fbcunn/FFTCDefs.lua: -------------------------------------------------------------------------------- 1 | local ffi = require 'ffi' 2 | 3 | ffi.cdef[[ 4 | void updateOutputBiasFFI(THCState*, THCudaTensor*, THCudaTensor*); 5 | void accGradParametersBiasFFI( 6 | THCState*, THCudaTensor*, THCudaTensor*, float scale); 7 | void transposeMMFFI(THCState*, 8 | THCudaTensor* tA, 9 | THCudaTensor* tB, 10 | THCudaTensor* tC, 11 | float invNorm, 12 | bool conjugateTransposeA, 13 | bool conjugateTransposeB, 14 | bool accumulate); 15 | typedef struct { 16 | static const int FFT_UpdateOutput = 0; 17 | static const int FFT_UpdateGradInput = 1; 18 | static const int FFT_AccGradParameters = 2; 19 | int pass; 20 | } FFTConvolutionPassFFI; 21 | typedef struct { 22 | THCudaTensor* tensor; 23 | int padL; 24 | int padU; 25 | } TiledDeviceTensorFFI; 26 | void convolveIteratedFFI(THCState* state, 27 | TiledDeviceTensorFFI* input, 28 | THCudaTensor* weight, 29 | TiledDeviceTensorFFI* output, 30 | int numTiles, 31 | int fftSize, 32 | FFTConvolutionPassFFI pass, 33 | float scale); 34 | ]] 35 | -------------------------------------------------------------------------------- /fbcunn/FeatureLPPooling.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 
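-- Added note (not part of the original file): LP pooling over the feature
-- dimension computes, for each window of `width` consecutive features
-- x_1 .. x_w taken every `stride` features,
--
--   out = (x_1^p + x_2^p + ... + x_w^p)^(1/p)
--
-- where p is the `power` constructor argument; as p grows this approaches
-- max pooling over feature maps (the MaxOut-style use from the README).
-- A hedged usage sketch, with illustrative sizes:
--
--   local pool = nn.FeatureLPPooling(4, 2, 2.0, true):cuda()
--   local input = torch.CudaTensor(16, 64, 32) -- batch x features x opt dim
--   local output = pool:forward(input) -- 16 x 31 x 32, since (64-4)/2+1 = 31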
2 | 3 | require 'cutorch' 4 | require 'nn' 5 | 6 | local FeatureLPPooling, parent = 7 | torch.class('nn.FeatureLPPooling', 'nn.Module') 8 | 9 | --[[ 10 | Possible inputs that we handle: 11 | 12 | #### `batch_mode = false` 13 | The dimensionality of the input chooses between the following modes: 14 | 15 | ``` 16 | [feature dim] 17 | [feature dim][opt dim 1] 18 | [feature dim][opt dim 1][opt dim 2] 19 | ``` 20 | 21 | #### `batch_mode = true` 22 | The dimensionality of the input chooses between the following modes: 23 | ``` 24 | [batch dim][feature dim] 25 | [batch dim][feature dim][opt dim 1] 26 | [batch dim][feature dim][opt dim 1][opt dim 2] 27 | ``` 28 | 29 | The output has the same number of dimensions as the input, except the feature 30 | dimension size is reduced to ((`input` - `width`) / `stride`) + 1 31 | ]] 32 | function FeatureLPPooling:__init(width, stride, power, batch_mode) 33 | parent.__init(self) 34 | 35 | if (width < 2 or width > 16) then 36 | error('width must be within 2 to 16 at the moment') 37 | end 38 | 39 | if (stride < 1 or stride > 4) then 40 | error('stride must be within 1 to 4 at the moment') 41 | end 42 | 43 | self.width = width 44 | self.stride = stride 45 | self.power = power 46 | self.batch_mode = batch_mode 47 | 48 | self.output = torch.Tensor() 49 | self.gradInput = torch.Tensor() 50 | end 51 | 52 | function FeatureLPPooling:updateOutput(input) 53 | if torch.type(input) == 'torch.CudaTensor' then 54 | input.nn.FeatureLPPooling_updateOutput(self, input) 55 | else 56 | error('CUDA only supported at the moment') 57 | end 58 | return self.output 59 | end 60 | 61 | function FeatureLPPooling:updateGradInput(input, gradOutput) 62 | if torch.type(input) == 'torch.CudaTensor' then 63 | input.nn.FeatureLPPooling_updateGradInput(self, input, gradOutput) 64 | else 65 | error('CUDA only supported at the moment') 66 | end 67 | return self.gradInput 68 | end 69 | -------------------------------------------------------------------------------- /fbcunn/HalfPrecision.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | local libhalfprec = require('libhalfprec') 4 | 5 | local function truncate(floats) 6 | return libhalfprec.toFloatCUDA(libhalfprec.toHalfCUDA(floats)) 7 | end 8 | 9 | local HalfPrecision, parent = 10 | torch.class('nn.HalfPrecision', 'nn.Module') 11 | 12 | function HalfPrecision:__init() 13 | parent.__init(self) 14 | self.output = torch.CudaTensor() 15 | self.gradInput = torch.CudaTensor() 16 | end 17 | 18 | function HalfPrecision:updateOutput(input) 19 | input = input:contiguous():cuda() 20 | self.output = truncate(input) 21 | self.output:resizeAs(input) 22 | return self.output 23 | end 24 | 25 | function HalfPrecision:updateGradInput(input, gradOutput) 26 | self.gradInput = truncate(gradOutput) 27 | self.gradInput:resizeAs(gradOutput) 28 | return self.gradInput 29 | end 30 | -------------------------------------------------------------------------------- /fbcunn/LookupTableGPU.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'cunn' 5 | 6 | --[[ 7 | Fast lookup table, supporting both CPU and GPU modes. 
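An added sketch (sizes are illustrative, not from the original docs): the
weight matrix is `nInput` x `nOutput`, one embedding row per index, and
`forward` takes a 1-D or 2-D tensor of indices:

    local lt = nn.LookupTableGPU(10000, 128):cuda() -- 10000 rows, dim 128
    local ids = torch.Tensor(32, 20):random(10000):cuda()
    local out = lt:forward(ids) -- 32 x 20 x 128 (featuresInDim2 = false)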
8 | ]]
9 | local LookupTableGPU, parent = torch.class('nn.LookupTableGPU', 'nn.Module')
10 | 
11 | 
12 | --[[
13 | If `featuresInDim2` is `true`, an input of dimension `batchSize` ${\times}$ `N` will produce an output of size `batchSize` ${\times}$ `nOutput`
14 | ${\times}$ `N`. If it is set to `false` (default) it will produce an output
15 | of size `batchSize` ${\times}$ `N` ${\times}$ `nOutput`.
16 | ]]
17 | function LookupTableGPU:__init(nInput, nOutput, featuresInDim2)
18 |    parent.__init(self)
19 |    self.nInput = nInput
20 |    self.nOutput = nOutput
21 |    self.featuresInDim2 = featuresInDim2 or false
22 |    -- Careful: weight is transposed from nn.Linear
23 |    self.weight = torch.Tensor(nInput, nOutput)
24 |    self.gradWeight = torch.Tensor(nInput, nOutput)
25 |    self.output = torch.Tensor()
26 | 
27 |    self:reset()
28 | end
29 | 
30 | function LookupTableGPU:reset(stdv)
31 |    stdv = stdv or 1
32 |    self.weight:normal(0, stdv)
33 | end
34 | 
35 | function LookupTableGPU:parameters()
36 |    return {self.weight}, {self.gradWeight}
37 | end
38 | 
39 | -- input should be a 1d (size N) or 2d (size batchSize x N)
40 | -- tensor of byte or long on CPU, cudaTensor on GPU.
41 | -- It contains the indices of the lookup.
42 | function LookupTableGPU:updateOutput(input)
43 |    if input:dim() == 2 then
44 |       if self.featuresInDim2 then
45 |          self.output:resize(input:size(1), self.nOutput, input:size(2))
46 |       else
47 |          self.output:resize(input:size(1), input:size(2), self.nOutput)
48 |       end
49 |    else
50 |       self.output:resize(input:size(1), self.nOutput)
51 |    end
52 | 
53 |    if input:type() == 'torch.CudaTensor' then
54 |       input.nn.LookupTableGPU_updateOutput(input, self.weight, self.output,
55 |                                            self.featuresInDim2)
56 |    else
57 |       if input:dim() == 2 then
58 |          -- batch mode
59 |          local this_output
60 |          for batch = 1, input:size(1) do
61 |             for i = 1, input:size(2) do
62 |                if self.featuresInDim2 then
63 |                   this_output = self.output[{batch, {}, i}]
64 |                else
65 |                   this_output = self.output[{batch, i}]
66 |                end
67 |                if self.unk_index and (input[batch][i] == self.unk_index) then
68 |                   this_output:zero()
69 |                else
70 |                   this_output:copy(self.weight[input[batch][i]])
71 |                end
72 |             end
73 |          end
74 |       else
75 |          -- non-batch mode
76 |          if input:size(1) == 1 then
77 |             if self.unk_index and (input[1] == self.unk_index) then
78 |                self.zeros = self.zeros or torch.zeros(1, self.nOutput)
79 |                self.output = self.zeros
80 |             else
81 |                self.output = self.weight[input[1]]:reshape(1, self.nOutput)
82 |             end
83 |          else
84 |             self.output:resize(input:size(1), self.nOutput)
85 |             for i = 1,input:size(1) do
86 |                if self.unk_index and (input[i] == self.unk_index) then
87 |                   self.output[i]:zero()
88 |                else
89 |                   self.output[i]:copy(self.weight[input[i]])
90 |                end
91 |             end
92 |          end
93 |       end
94 |    end
95 | 
96 |    return self.output
97 | end
98 | 
99 | function LookupTableGPU:updateGradInput(input, gradOutput)
100 |    --print("Should not be used") --TODO
101 | end
102 | 
103 | function LookupTableGPU:accGradParameters(input, gradOutput, scale)
104 |    scale = scale or 1
105 |    if input:type() == 'torch.CudaTensor' then
106 |       input.nn.LookupTableGPU_accGradParameters(input, gradOutput,
107 |                                                 self.gradWeight, scale,
108 |                                                 self.featuresInDim2)
109 |    else
110 |       if input:dim() == 2 then
111 |          -- batch mode
112 |          for batch = 1, input:size(1) do
113 |             for i = 1, input:size(2) do
114 |                if (self.unk_index == nil) or
115 |                (input[batch][i] ~= self.unk_index) then
116 |                   if self.featuresInDim2 then
117 |                      self.gradWeight[input[batch][i]]
118 |                         :add(scale, gradOutput[{batch, {}, i}])
119 | 
else 120 | self.gradWeight[input[batch][i]] 121 | :add(scale, gradOutput[batch][i]) 122 | end 123 | end 124 | end 125 | end 126 | else 127 | -- non-batch mode 128 | for i = 1,input:size(1) do 129 | if (self.unk_index == nil) or 130 | (input[i] ~= self.unk_index) then 131 | self.gradWeight[input[i] ]:add(scale, gradOutput[i]) 132 | end 133 | end 134 | end 135 | end 136 | end 137 | -------------------------------------------------------------------------------- /fbcunn/OneBitDataParallel.lua: -------------------------------------------------------------------------------- 1 | require('cutorch') 2 | require 'fbnn' 3 | local util = require('fb.util') 4 | local OBSGD = require('fbcunn.OneBitSGD') 5 | local withDevice = cutorch.withDevice 6 | 7 | --[[ OneBitDataParallel implements the "1-Bit Stochastic Gradient 8 | Descent and Application to Data-Parallel Distributed Training of 9 | Speech DNNs" paper of Frank Seide, Hao Fu, Jasha Droppo, Gang Li, and 10 | Dong Yu. 11 | 12 | The implementation is similar to a vanilla DataParallel module, except we replace the averaging gradient step with a quantize-copy-merge-broadcast procedure. 13 | 14 | 15 | ]] 16 | local OneBitDataParallel, parent = 17 | torch.class('nn.OneBitDataParallel', 'nn.DataParallel') 18 | 19 | function OneBitDataParallel:__init(dimension, config) 20 | parent.__init(self, dimension) 21 | self.config = {} 22 | self.config.min_elements = config.min_elements or 20 23 | self.config.adagrad_learning_rate = config.learningRate or 1.0 24 | self.config.momentum_rate = config.momentum or 0 25 | 26 | -- Aggregators for each [row][gradient] 27 | self._aggregators = util.defaultdict(function() return end) 28 | self.home_device = 1 29 | end 30 | 31 | 32 | function OneBitDataParallel:_should_run_one_bit_sgd(gradients) 33 | -- TODO(tulloch) - flesh this test out 34 | assert(gradients) 35 | assert(#gradients >= 1) 36 | return gradients[1]:nDimension() == 2 and 37 | gradients[1]:nElement() > self.config.min_elements 38 | end 39 | 40 | function OneBitDataParallel:_combine_gradients(row_idx, gradients) 41 | assert(#gradients >= 1) 42 | if not self:_should_run_one_bit_sgd(gradients) then 43 | return parent._combine_gradients(self, row_idx, gradients) 44 | end 45 | 46 | if not self._aggregators[row_idx] then 47 | local g = gradients[1] 48 | self._aggregators[row_idx] = OBSGD.OneBitAggregator( 49 | self.config, 50 | function() return torch.Tensor():typeAs(g):resizeAs(g):zero() end, 51 | function(dest, source) return self:gpuSend(dest, source) end, 52 | self.home_device 53 | ) 54 | self.home_device = self.home_device % cutorch.getDeviceCount() + 1 55 | end 56 | self._aggregators[row_idx]:run(gradients) 57 | end 58 | -------------------------------------------------------------------------------- /fbcunn/OneBitQuantization.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require 'cutorch' 4 | require 'nn' 5 | 6 | --[[ 7 | CUDA implementation of the quantize/unquantize methods used by `nn.OneBitDataParallel`. 
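An added sketch of the scheme (following the Seide et al. 1-bit SGD paper
cited in OneBitDataParallel; the variable names below are illustrative):
`quantize` reduces every entry of a 2-D gradient to a single sign bit, and
`dequantize` reconstructs positive entries from the stored positive
averages (`avg_pos`) and negative entries from `avg_neg`. The residual is
kept in `quantization_error` and folded into later calls, so quantization
error is fed back rather than lost:

    local q = nn.OneBitQuantization()
    local quantized, err, avg_pos, avg_neg = q:quantize(grad) -- grad is 2-D
    local approx = q:dequantize(quantized, avg_pos, avg_neg, grad:size(2))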
8 | ]] 9 | local OneBitQuantization = torch.class('nn.OneBitQuantization') 10 | 11 | function OneBitQuantization:__init() 12 | self.quantized = torch.CudaTensor() 13 | self.non_quantized = torch.CudaTensor() 14 | self.quantization_error = nil 15 | self.avg_pos = torch.CudaTensor() 16 | self.avg_neg = torch.CudaTensor() 17 | end 18 | 19 | function OneBitQuantization:reset() 20 | self.quantization_error = nil 21 | end 22 | 23 | function OneBitQuantization:quantize(non_quantized_input) 24 | -- When starting a new quantization chain, we start with zero error 25 | if not self.quantization_error then 26 | self.quantization_error = non_quantized_input:clone() 27 | self.quantization_error:zero() 28 | end 29 | 30 | non_quantized_input.nn.OneBitQuantization_quantize(self, non_quantized_input) 31 | return self.quantized, self.quantization_error, self.avg_pos, self.avg_neg 32 | end 33 | 34 | function OneBitQuantization:dequantize(quantized_input, 35 | avg_pos, avg_neg, num_orig_cols) 36 | quantized_input.nn.OneBitQuantization_dequantize( 37 | self, quantized_input, avg_pos, avg_neg, num_orig_cols) 38 | return self.non_quantized 39 | end 40 | -------------------------------------------------------------------------------- /fbcunn/TemporalConvolutionFB.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require 'nn' 4 | 5 | local TemporalConvolutionFB, parent = 6 | torch.class('nn.TemporalConvolutionFB', 'nn.Module') 7 | 8 | function TemporalConvolutionFB:__init(inputFrameSize, outputFrameSize, kW, dW) 9 | parent.__init(self) 10 | 11 | dW = dW or 1 12 | 13 | self.inputFrameSize = inputFrameSize 14 | self.outputFrameSize = outputFrameSize 15 | self.kW = kW 16 | self.dW = dW 17 | 18 | self.weight = torch.Tensor(outputFrameSize, kW, inputFrameSize) 19 | self.bias = torch.Tensor(outputFrameSize) 20 | self.gradWeight = torch.Tensor(outputFrameSize, kW, inputFrameSize) 21 | self.gradBias = torch.Tensor(outputFrameSize) 22 | 23 | self:reset() 24 | end 25 | 26 | function TemporalConvolutionFB:reset(stdv) 27 | if stdv then 28 | stdv = stdv * math.sqrt(3) 29 | else 30 | stdv = 1/math.sqrt(self.kW*self.inputFrameSize) 31 | end 32 | if nn.oldSeed then 33 | self.weight:apply(function() 34 | return torch.uniform(-stdv, stdv) 35 | end) 36 | self.bias:apply(function() 37 | return torch.uniform(-stdv, stdv) 38 | end) 39 | else 40 | self.weight:uniform(-stdv, stdv) 41 | self.bias:uniform(-stdv, stdv) 42 | end 43 | end 44 | 45 | function TemporalConvolutionFB:updateOutput(input) 46 | input.nn.TemporalConvolutionFB_updateOutput(self, input) 47 | return self.output 48 | end 49 | 50 | function TemporalConvolutionFB:updateGradInput(input, gradOutput) 51 | if self.gradInput then 52 | return input.nn.TemporalConvolutionFB_updateGradInput( 53 | self, input, gradOutput) 54 | end 55 | end 56 | 57 | function TemporalConvolutionFB:accGradParameters(input, gradOutput, scale) 58 | scale = scale or 1 59 | input.nn.TemporalConvolutionFB_accGradParameters( 60 | self, input, gradOutput, scale) 61 | end 62 | 63 | function TemporalConvolutionFB:sharedAccUpdateGradParameters(input, gradOutput, lr) 64 | -- we do not need to accumulate parameters when sharing: 65 | self:defaultAccUpdateGradParameters(input, gradOutput, lr) 66 | end 67 | -------------------------------------------------------------------------------- /fbcunn/TemporalKMaxPooling.lua: -------------------------------------------------------------------------------- 1 | -- 
Copyright 2004-present Facebook. All Rights Reserved.
2 | 
3 | -- TemporalKMaxPooling
4 | -- Input : (bsize x) width x height
5 | -- Output : (bsize x) k_out x height
6 | -- with k_out = max(k_out_prop, inputSeqLen)
7 | -- where k_out_prop = max(k, ceil(k_dynamic*inputSeqLen))
8 | 
9 | require 'cutorch'
10 | require 'nn'
11 | 
12 | local TemporalKMaxPooling, parent =
13 |    torch.class('nn.TemporalKMaxPooling', 'nn.Module')
14 | 
15 | function TemporalKMaxPooling:__init(k, k_dynamic)
16 |    parent.__init(self)
17 | 
18 |    self.k = k
19 |    if k_dynamic then
20 |       assert(k_dynamic <= 1 and k_dynamic >= 0,
21 |              'k_dynamic must be between 0 and 1')
22 |    end
23 |    self.k_dynamic = k_dynamic or -1
24 | 
25 |    -- k_dynamic is an optional scalar parameter between 0 and 1
26 |    -- that makes k a fraction of the input sequence size.
27 | 
28 |    -- To follow Kalchbrenner et al.'s architecture on Dynamic k-Max Pooling:
29 |    -- Use (k = k_top, kDynamic = (L - l)/L), with
30 |    -- L : total number of conv layers,
31 |    -- l : current convolutional layer to which the pooling is applied,
32 |    -- k_top : fixed pooling parameter for the topmost convolutional layer.
33 | 
34 |    self.output = torch.CudaTensor()
35 |    self.gradInput = torch.CudaTensor()
36 |    self.indices = torch.CudaTensor()
37 | end
38 | 
39 | function TemporalKMaxPooling:updateOutput(input)
40 |    input = input:contiguous()
41 |    input.nn.TemporalKMaxPooling_updateOutput(self, input)
42 |    return self.output
43 | end
44 | 
45 | function TemporalKMaxPooling:updateGradInput(input, gradOutput)
46 |    input = input:contiguous()
47 |    gradOutput = gradOutput:contiguous()
48 | 
49 |    input.nn.TemporalKMaxPooling_updateGradInput(self, input, gradOutput)
50 |    return self.gradInput
51 | end
52 | 
--------------------------------------------------------------------------------
/fbcunn/init.lua:
--------------------------------------------------------------------------------
1 | require 'nn'
2 | require 'fbnn'
3 | require 'cunn'
4 | require 'libfbcunn'
5 | require 'fbcunn.cuda_ext'
6 | 
7 | include('AbstractParallel.lua')
8 | include('BatchNormalization.lua')
9 | include('CuBLASWrapper.lua')
10 | include('DataParallel.lua')
11 | include('FeatureLPPooling.lua')
12 | include('FFTWrapper.lua')
13 | include('HalfPrecision.lua')
14 | include('LookupTableGPU.lua')
15 | include('ModelParallel.lua')
16 | include('OneBitDataParallel.lua')
17 | include('OneBitQuantization.lua')
18 | include('OneBitSGD.lua')
19 | include('FFTCDefs.lua')
20 | include('SpatialBatchNormalization.lua')
21 | -- include('SpatialConvolutionFFT.lua')
22 | -- include('SpatialConvolutionCuFFT.lua')
23 | -- include('SpatialConvolutionFBFFT.lua')
24 | -- include('SpatialConvolutionFBFFTGemm.lua')
25 | -- include('SpatialConvolutionFFTTiled.lua')
26 | -- include('SpatialConvolutionFFTTiledSync.lua')
27 | -- include('SpatialConvolutionFFTTiledAsync.lua')
28 | -- include('SpatialConvolutionFFTTiledIterated.lua')
29 | -- include('SpatialConvolution.lua')
30 | include('TemporalConvolutionFB.lua')
31 | include('TemporalKMaxPooling.lua')
32 | 
33 | -- Monkey-patch module to include getParametersByDevice
34 | -- Get the params of the module separated by device.
35 | -- Returns the pair:
36 | -- {0 = flat tensor containing CPU weights,
37 | -- 1 = flat tensor containing weights from device 1,
38 | -- ...
39 | -- N = ... containing weights from device N},
40 | -- {0 = flat tensor containing CPU grads,
41 | -- 1 = ...
containing grads from device 1, ...} 42 | function nn.Module:getParametersByDevice() 43 | local n_dev = cutorch.getDeviceCount() 44 | local d2weights = {} -- Device => { tensor1, tensor2, ..., tensorN } 45 | local d2grads = {} -- Device => { tensor1, tensor2, ..., tensorN } 46 | 47 | local function tensor_to_dev(tensor) 48 | local tnm = torch.typename(tensor) 49 | if tnm == 'torch.CudaTensor' then 50 | return tensor:getDevice() 51 | end 52 | return 0 53 | end 54 | 55 | local params, grads = self:parameters() 56 | assert(#params == #grads) 57 | -- Herd each tensor into appropriate row of weights,grads 58 | for i = 1,#params do 59 | local p = params[i] 60 | local g = grads[i] 61 | local d = tensor_to_dev(p) 62 | if d ~= tensor_to_dev(g) then 63 | error(("Improbable module; params,grads on devices %d,%d"): 64 | format(d, tensor_to_dev(g))) 65 | end 66 | if not d2weights[d] then 67 | d2weights[d] = {} 68 | d2grads[d] = {} 69 | end 70 | table.insert(d2weights[d], p) 71 | table.insert(d2grads[d], g) 72 | end 73 | 74 | local function gather(dev, params, grads) 75 | if not params or #params == 0 then 76 | return nil 77 | end 78 | if dev == 0 then 79 | return nn.Module.flatten(params), nn.Module.flatten(grads) 80 | end 81 | return cutorch.withDevice(dev, 82 | function() return nn.Module.flatten(params), 83 | nn.Module.flatten(grads) 84 | end) 85 | end 86 | 87 | local ret_params = { } 88 | local ret_grads = { } 89 | for d = 0,n_dev do -- sic 90 | ret_params[d], ret_grads[d] = gather(d, d2weights[d], d2grads[d]) 91 | end 92 | 93 | return ret_params, ret_grads 94 | end 95 | -------------------------------------------------------------------------------- /rocks/fbcunn-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "fbcunn" 2 | version = "scm-1" 3 | 4 | source = { 5 | url = "git://github.com/facebook/fbcunn.git", 6 | } 7 | 8 | description = { 9 | summary = "Facebook's extensions to torch/cunn. ", 10 | detailed = [[ 11 | ]], 12 | homepage = "https://github.com/facebook/fbcunn", 13 | license = "BSD" 14 | } 15 | 16 | dependencies = { 17 | "torch >= 7.0", 18 | "nn >= 1.0", 19 | "cutorch >= 1.0", 20 | "multikey", 21 | "fbnn", 22 | "fbtorch" 23 | } 24 | 25 | build = { 26 | type = "command", 27 | build_command = [[ 28 | git submodule init 29 | git submodule update 30 | cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" 31 | ]], 32 | install_command = "cd build && $(MAKE) install" 33 | } 34 | -------------------------------------------------------------------------------- /src/BLASParameters.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "BLASParameters.h" 4 | 5 | using namespace std; 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | 9 | std::ostream& operator<<(ostream& os, const BLASParameters& params) { 10 | os << " ITD = " << params.iterDims; 11 | os << " BTD = " << params.batchDims; 12 | os << " RIX = " << params.resourceIndex; 13 | os << " CPLX = " << params.asComplex; 14 | os << " batchStepA = " << params.batchStepA; 15 | os << " batchStepB = " << params.batchStepB; 16 | os << " batchStepC = " << params.batchStepC; 17 | os << " #handles = " << params.handles.size(); 18 | os << " #streams = " << params.streams.size(); 19 | os << " transposeA = " << ((params.transposeA == CUBLAS_OP_T) ? 
"t " : 20 | (params.transposeA == CUBLAS_OP_C) ? "c " : "n"); 21 | os << " transposeB = " << ((params.transposeB == CUBLAS_OP_T) ? "t " : 22 | (params.transposeB == CUBLAS_OP_C) ? "c " : "n"); 23 | os << " scale = (" << params.scaleRe << ", " << params.scaleIm << ")"; 24 | return os; 25 | } 26 | 27 | }}} 28 | -------------------------------------------------------------------------------- /src/BLASParameters.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | // Column major: columns are contiguous in memory 11 | // Cmxn <- Amxk * Bkxn becomes 12 | // C'nxm <- A'kxm * B'nxk and so C'nxm <- B'nxk * A'kxm 13 | struct BLASParameters { 14 | BLASParameters() : 15 | iterDims(0), 16 | batchDims(0), 17 | resourceIndex(0), 18 | batchStepA(1), 19 | batchStepB(1), 20 | batchStepC(1), 21 | scaleRe(1.0f), 22 | scaleIm(0.0f), 23 | asComplex(false), 24 | accumulate(false), 25 | handles(), 26 | streams(), 27 | transposeA(CUBLAS_OP_N), 28 | transposeB(CUBLAS_OP_N) {} 29 | 30 | // Outermost dimensions to be treated as individual iterations in enclosing 31 | // for loops. 32 | BLASParameters& withIterDims(int i) { 33 | iterDims = i; 34 | return *this; 35 | } 36 | 37 | // After iterDims, remaining outermost dimensions to be treated as batch 38 | // dimensions, for instance, in a gemmbatched call. 39 | BLASParameters& withBatchDims(int i) { 40 | batchDims = i; 41 | return *this; 42 | } 43 | 44 | // Force running on a particular handle / stream index in the handle / 45 | // stream vectors. The actual handle / stream we will end up running on is 46 | // recovered by modulo indexing into the vector, default handle / stream if 47 | // the vectors are empty. 48 | BLASParameters& withResourceIndex(int i) { 49 | resourceIndex = i; 50 | return *this; 51 | } 52 | 53 | // Distance between two batches of A, used in batched mode, in case we want 54 | // to compute one entry every k. Step of zerom means the same matrix A will 55 | // be read over and over again. 56 | BLASParameters& withBatchStepA(int i) { 57 | batchStepA = i; 58 | return *this; 59 | } 60 | 61 | // Distance between two batches of B, used in batched mode, in case we want 62 | // to compute one entry every k. Step of zerom means the same matrix B will 63 | // be read over and over again. 64 | BLASParameters& withBatchStepB(int i) { 65 | batchStepB = i; 66 | return *this; 67 | } 68 | 69 | // Distance between two batches of C, used in batched mode, in case we want 70 | // to compute one entry every k. Step of zerom means the same matrix C will 71 | // be written over and over again. 72 | BLASParameters& withBatchStepC(int i) { 73 | batchStepC = i; 74 | return *this; 75 | } 76 | 77 | // Sets real scale in C += alpha * C + scale * A * B 78 | BLASParameters& withScaleReal(float f) { 79 | scaleRe = f; 80 | return *this; 81 | } 82 | 83 | // Sets imaginary scale in C += alpha * C + scale * A * B 84 | BLASParameters& withScaleImaginary(float f) { 85 | scaleIm = f; 86 | return *this; 87 | } 88 | 89 | // Use cgemm instead of sgemm 90 | BLASParameters& withComplex(bool b) { 91 | asComplex = b; 92 | return *this; 93 | } 94 | 95 | // If true, computes C += scale * A * B. Default is C = scale * A * B. 
96 |   BLASParameters& withAccumulate(bool b) {
97 |     accumulate = b;
98 |     return *this;
99 |   }
100 | 
101 |   // Set vector of handle resources
102 |   BLASParameters& withHandles(const std::vector<cublasHandle_t>& h) {
103 |     handles = h;
104 |     return *this;
105 |   }
106 | 
107 |   // Set vector of stream resources
108 |   BLASParameters& withStreams(const std::vector<cudaStream_t>& s) {
109 |     streams = s;
110 |     return *this;
111 |   }
112 | 
113 |   // Transpose A
114 |   BLASParameters& withTransposeA(cublasOperation_t t) {
115 |     transposeA = t;
116 |     return *this;
117 |   }
118 | 
119 |   // Transpose B
120 |   BLASParameters& withTransposeB(cublasOperation_t t) {
121 |     transposeB = t;
122 |     return *this;
123 |   }
124 | 
125 |   // Transpose A
126 |   BLASParameters& withTransposeA(char c) {
127 |     transposeA = (c == 't') ? CUBLAS_OP_T :
128 |       ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N);
129 |     return *this;
130 |   }
131 | 
132 |   // Transpose B
133 |   BLASParameters& withTransposeB(char c) {
134 |     transposeB = (c == 't') ? CUBLAS_OP_T :
135 |       ((c == 'c') ? CUBLAS_OP_C : CUBLAS_OP_N);
136 |     return *this;
137 |   }
138 | 
139 |   unsigned int iterDims;
140 |   unsigned int batchDims;
141 |   unsigned int resourceIndex;
142 |   unsigned int batchStepA;
143 |   unsigned int batchStepB;
144 |   unsigned int batchStepC;
145 |   float scaleRe;
146 |   float scaleIm;
147 |   bool asComplex;
148 |   bool accumulate;
149 |   std::vector<cublasHandle_t> handles;
150 |   std::vector<cudaStream_t> streams;
151 |   cublasOperation_t transposeA;
152 |   cublasOperation_t transposeB;
153 | };
154 | 
155 | std::ostream& operator<<(std::ostream& os, const BLASParameters& params);
156 | 
157 | }}}
158 | 
--------------------------------------------------------------------------------
/src/ConvolutionBias.cuh:
--------------------------------------------------------------------------------
1 | // Copyright 2004-present Facebook. All Rights Reserved.
2 | #pragma once
3 | 
4 | struct THCudaTensor;
5 | struct THCState;
6 | 
7 | namespace facebook { namespace deeplearning { namespace torch {
8 | namespace bias {
9 | 
10 | /// Applies an additive bias to all output elements, pointwise, one
11 | /// bias per output plane
12 | /// Performs the operation output[b][o][y][x] += bias[o]
13 | void updateOutputBias(THCState* state,
14 |                       THCudaTensor* outputTH,
15 |                       THCudaTensor* biasTH);
16 | 
17 | /// Applies an additive bias to all output elements, pointwise, one
18 | /// bias per kernel column.
19 | /// Performs the operation output[b][o][x] += bias[x]
20 | void updateOutputTemporalBias(THCState* state,
21 |                               THCudaTensor* outputTH,
22 |                               THCudaTensor* biasTH);
23 | 
24 | /// Updates the gradient bias with the scaled sum of the output per
25 | /// output plane
26 | /// Performs the operation gradBias[o] += biasScale * output[b][o][x][y]
27 | void accGradParametersBias(THCState* state,
28 |                            THCudaTensor* outputTH,
29 |                            THCudaTensor* gradBiasTH,
30 |                            float biasScale);
31 | 
32 | /// Updates the gradient bias with the scaled sum of the output per
33 | /// kernel column
34 | /// Performs the operation gradBias[x] += biasScale * output[b][o][x]
35 | void accGradParametersTemporalBias(THCState* state,
36 |                                    THCudaTensor* outputTH,
37 |                                    THCudaTensor* gradBiasTH,
38 |                                    float biasScale);
39 | 
40 | } } } }
41 | 
--------------------------------------------------------------------------------
/src/CrossMapNormalization.cuh:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2014 Facebook
3 |  * @author Tudor Bosman (tudorb@fb.com)
4 |  */
5 | #pragma once
6 | 
7 | namespace facebook { namespace deeplearning { namespace torch {
8 | namespace detail {
9 | 
10 | struct CrossMapNormalizationParam {
11 |   int batchSize;
12 |   int numFeatures;
13 |   int featureSize;
14 |   int kernelSize;
15 |   int kernelRadius;
16 |   float scale;
17 |   float power;
18 | };
19 | 
20 | void launchCrossMapNormalizationUpdateOutputKernel(
21 |     cudaStream_t stream,
22 |     const float* input,
23 |     float* output,
24 |     float* squaredSum,
25 |     CrossMapNormalizationParam params);
26 | 
27 | void launchCrossMapNormalizationUpdateGradInputKernel(
28 |     cudaStream_t stream,
29 |     const float* input,
30 |     const float* gradOutput,
31 |     const float* squaredSum,
32 |     float* gradInput,
33 |     CrossMapNormalizationParam params);
34 | 
35 | }}}} // namespaces
36 | 
--------------------------------------------------------------------------------
/src/CrossMapNormalizationHost.cpp:
--------------------------------------------------------------------------------
1 | /**
2 |  * Copyright 2014 Facebook
3 |  * @author Tudor Bosman (tudorb@fb.com)
4 |  */
5 | 
6 | #include "THC.h"
7 | #include "src/CrossMapNormalization.cuh"
8 | #include "src/Utils.h"
9 | #include <lua.hpp>
10 | #include <luaT.h>
11 | 
12 | namespace facebook { namespace deeplearning { namespace torch {
13 | 
14 | namespace {
15 | 
16 | // Forward pass
17 | int updateOutput(lua_State* L) {
18 |   THCState* state = getCutorchState(L);
19 |   auto input = static_cast<THCudaTensor*>(
20 |       luaT_checkudata(L, 2, "torch.CudaTensor"));
21 |   auto output = static_cast<THCudaTensor*>(
22 |       luaT_getfieldcheckudata(L, 1, "output", "torch.CudaTensor"));
23 | 
24 | 
25 |   int outputIdx = lua_gettop(L);
26 |   auto squaredSum = static_cast<THCudaTensor*>(
27 |       luaT_getfieldcheckudata(L, 1, "squaredSum", "torch.CudaTensor"));
28 | 
29 |   THAssert(THCudaTensor_checkGPU(state, 3, input, output, squaredSum));
30 | 
31 |   detail::CrossMapNormalizationParam param;
32 |   param.kernelSize = luaT_getfieldcheckint(L, 1, "size");
33 |   param.kernelRadius = param.kernelSize / 2;
34 |   param.scale = luaT_getfieldchecknumber(L, 1, "scale");
35 |   param.power = luaT_getfieldchecknumber(L, 1, "power");
36 | 
37 |   int ndims = THCudaTensor_nDimension(state, input);
38 |   if (ndims != 3 && ndims != 4) {
39 |     luaL_error(L, "Invalid input tensor dimension");
40 |   }
41 | 
42 |   if (param.kernelSize % 2 == 0) {
43 |     luaL_error(L, "Kernel size must be odd");
44 |   }
45 | 
46 |   // Make tensors contiguous
47 |   input = THCudaTensor_newContiguous(state, input);
48 |   output = THCudaTensor_newContiguous(state, output);
49 | 
50 |   // Resize derived tensors based on input
51 |   THCudaTensor_resizeAs(state, output, input);
52 |   THCudaTensor_resizeAs(state, squaredSum, input);
53 | 
54 |   param.batchSize = 1;
55 |   int firstDim = 0;
56 |   if (ndims == 4) {
57 |     param.batchSize = THCudaTensor_size(state, input, 0);
58 |     firstDim = 1;
59 |   }
60 | 
61 |   param.numFeatures = THCudaTensor_size(state, input, firstDim);
62 |   param.featureSize = THCudaTensor_stride(state, input, firstDim);
63 | 
64 |   detail::launchCrossMapNormalizationUpdateOutputKernel(
65 |       THCState_getCurrentStream(state),
66 |       THCudaTensor_data(state, input),
67 |       THCudaTensor_data(state, output),
68 |       THCudaTensor_data(state, squaredSum),
69 |       param);
70 |   lua_pushvalue(L, outputIdx);
71 |   THCudaTensor_free(state, input);
72 |   THCudaTensor_free(state, output);
73 | 
74 |   return 1;
75 | }
76 | 
77 | // Backprop
78 | int updateGradInput(lua_State* L) {
79 |   THCState* state = getCutorchState(L);
80 |   auto input = static_cast<THCudaTensor*>(
81 |       luaT_checkudata(L, 2, "torch.CudaTensor"));
82 |   auto gradOutput = static_cast<THCudaTensor*>(
83 |       luaT_checkudata(L, 3, "torch.CudaTensor"));
84 |   auto gradInput = static_cast<THCudaTensor*>(
85 |       luaT_getfieldcheckudata(L, 1, "gradInput", "torch.CudaTensor"));
86 |   int gradInputIdx = lua_gettop(L);
87 |   auto squaredSum = static_cast<THCudaTensor*>(
88 |       luaT_getfieldcheckudata(L, 1, "squaredSum", "torch.CudaTensor"));
89 | 
90 |   THAssert(THCudaTensor_checkGPU(state, 4, input,
91 |                                  gradInput, gradOutput, squaredSum));
92 | 
93 |   detail::CrossMapNormalizationParam param;
94 |   param.kernelSize = luaT_getfieldcheckint(L, 1, "size");
95 |   param.kernelRadius = param.kernelSize / 2;
96 |   param.scale = luaT_getfieldchecknumber(L, 1, "scale");
97 |   param.power = luaT_getfieldchecknumber(L, 1, "power");
98 | 
99 |   int ndims = THCudaTensor_nDimension(state, input);
100 |   if (ndims != 3 && ndims != 4) {
101 |     luaL_error(L, "Invalid input tensor dimension");
102 |   }
103 | 
104 |   if (param.kernelSize % 2 == 0) {
105 |     luaL_error(L, "Kernel size must be odd");
106 |   }
107 | 
108 |   // Make tensors contiguous
109 |   input = THCudaTensor_newContiguous(state, input);
110 |   gradOutput = THCudaTensor_newContiguous(state, gradOutput);
111 |   gradInput = THCudaTensor_newContiguous(state, gradInput);
112 | 
113 |   // Resize derived tensors based on input
114 |   THCudaTensor_resizeAs(state, gradInput, input);
115 | 
116 |   param.batchSize = 1;
117 |   int firstDim = 0;
118 |   if (ndims == 4) {
119 |     param.batchSize = THCudaTensor_size(state, input, 0);
120 |     firstDim = 1;
121 |   }
122 | 
123 | 
124 |   param.numFeatures = THCudaTensor_size(state, input, firstDim);
125 |   param.featureSize = THCudaTensor_stride(state, input, firstDim);
126 | 
127 |   detail::launchCrossMapNormalizationUpdateGradInputKernel(
128 |       THCState_getCurrentStream(state),
129 |       THCudaTensor_data(state, input),
130 |       THCudaTensor_data(state, gradOutput),
131 |       THCudaTensor_data(state, squaredSum),
132 |       THCudaTensor_data(state, gradInput),
133 |       param);
134 | 
135 |   lua_pushvalue(L, gradInputIdx);
136 |   THCudaTensor_free(state, gradOutput);
137 |   THCudaTensor_free(state, gradInput);
138 |   THCudaTensor_free(state, input);
139 |   return 1;
140 | }
141 | 
142 | const luaL_Reg functions[] = {
143 |   {"CrossMapNormalization_updateOutput", updateOutput},
144 |   {"CrossMapNormalization_updateGradInput", updateGradInput},
145 |   {nullptr, nullptr},
146 | };
147 | 
148 | } // namespace
149 | 
150 | void initCrossMapNormalizationCuda(lua_State* L) {
151 |   luaT_pushmetatable(L, "torch.CudaTensor");
152 | 
luaT_registeratname(L, functions, "nn"); 153 | lua_pop(L, 1); 154 | } 155 | 156 | }}} // namespaces 157 | -------------------------------------------------------------------------------- /src/CuBLASWrapper.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | 6 | #include "src/BLASParameters.h" 7 | 8 | #include "cublas_v2.h" 9 | #include 10 | #include 11 | #include 12 | 13 | namespace facebook { namespace deeplearning { namespace torch { 14 | 15 | // 16 | // This transposition wrapper implements quick device-side transpositions. 17 | // Consider tensor dimensions are collapsed into a 2-D 'y'-by-'x'. 18 | // The wrapper takes a separator integer and considers dimensions 19 | // (0 .. separator - 1) as being collapsed to form the 'y' 20 | // dimension. Dimensions (separator .. Dim - 1) are collapsed to form the 'x' 21 | // dimension. 22 | // 23 | // The complex case is a bit trickier since Torch does not natively support 24 | // complex numbers, we emulate them with float[2]. In that case, 'x' is 25 | // special in that it has to be exactly [x/2][2] and the inner 2 floats can 26 | // never be transposed. 27 | // 28 | // The invariant is that in and out are sized identically on entry and that 29 | // out is permuted to account for the transposition on exit. 30 | // 31 | // This wrapper requires non-padded tensors since it calls CUBLAS 32 | // under the hood. It could support padding along 1 dimension if needed. 33 | // 34 | template 35 | void transpose(const cuda::DeviceTensor& in, 36 | cuda::DeviceTensor& out, 37 | int separator, 38 | bool asComplex = false, 39 | bool transposeMetaData = true, 40 | cublasHandle_t handle = NULL, 41 | cudaStream_t stream = NULL); 42 | 43 | template 44 | void transposeAsComplex(const cuda::DeviceTensor& in, 45 | cuda::DeviceTensor& out, 46 | int separator, 47 | bool transposeMetaData = true, 48 | cublasHandle_t handle = NULL, 49 | cudaStream_t stream = NULL); 50 | 51 | // Single matmult, not batched, not iterated, complex or real 52 | template 53 | void matmult(cuda::DeviceTensor& C, 54 | const cuda::DeviceTensor& A, 55 | const cuda::DeviceTensor& B, 56 | const BLASParameters& params); 57 | 58 | 59 | // Batched matmult from device pointers and model tensors serve to derive 60 | // problem sizes. This is exposed for convenience to perform fancier batched 61 | // sgemm calls. 62 | void matmultBatched(thrust::host_vector& CPtrVec, 63 | thrust::host_vector& APtrVec, 64 | thrust::host_vector& BPtrVec, 65 | const cuda::DeviceTensor& modelC, 66 | const cuda::DeviceTensor& modelA, 67 | const cuda::DeviceTensor& modelB, 68 | const BLASParameters& params); 69 | 70 | // Batched matmult from device pointers and model tensors serve to derive 71 | // problem sizes. This is exposed for convenience to perform fancier batched 72 | // sgemm calls. 
73 | void matmultBatched(thrust::host_vector& CPtrVec, 74 | thrust::host_vector& APtrVec, 75 | thrust::host_vector& BPtrVec, 76 | const cuda::DeviceTensor& modelC, 77 | const cuda::DeviceTensor& modelA, 78 | const cuda::DeviceTensor& modelB, 79 | const BLASParameters& params); 80 | 81 | // Batched matmult, not iterated, complex or real 82 | // batchDims are outermost dimensions of the tensor iterated in parallel 83 | template 84 | void matmultBatched(cuda::DeviceTensor& C, 85 | cuda::DeviceTensor& A, 86 | cuda::DeviceTensor& B, 87 | const BLASParameters& params); 88 | 89 | // Iterated matmult, batch or not, complex or real 90 | // iterDims are outermost dimensions of the tensor iterated sequentially 91 | // batchDims are outermost dimensions of the tensor iterated in parallel 92 | template 93 | void matmultIter(cuda::DeviceTensor& C, 94 | cuda::DeviceTensor& A, 95 | cuda::DeviceTensor& B, 96 | const BLASParameters& params); 97 | 98 | } } } // namespace 99 | -------------------------------------------------------------------------------- /src/CudaTensorUtils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "THCTensor.h" 5 | #include 6 | #include "thpp/Tensor.h" 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | struct THCState; 13 | 14 | // unique_ptr destructor for THCudaTensor 15 | struct THCudaTensor; 16 | struct CudaTensorDeleter { 17 | explicit CudaTensorDeleter(THCState* s) : state(s) {} 18 | CudaTensorDeleter() : state(nullptr) {} 19 | 20 | void operator()(THCudaTensor*); 21 | THCState* state; 22 | }; 23 | 24 | // unique_ptr destructor for device-malloc'd memory 25 | struct CudaDeleter { 26 | void operator()(void* p) { 27 | if (p) { 28 | cudaFree(p); 29 | } 30 | } 31 | }; 32 | 33 | namespace facebook { namespace deeplearning { namespace torch { 34 | 35 | /// Constructs a new THCudaTensor initialized to 0 with the given 36 | /// sizes and strides. 37 | /// See D1581014, this method allocates a full tensor whose storage capacity is 38 | /// greater than strictly requested by torch. 39 | std::unique_ptr 40 | makeTHCudaTensorFull(THCState* state, 41 | const std::vector& sizes, 42 | const folly::Optional>& strides = 43 | folly::none); 44 | 45 | /// Constructs a new THCudaTensor which is a view of the aliased 46 | /// THCudaTensor with the given sizes and strides. 47 | /// The requested size (strides(0) * sizes(0)) must fit within the input 48 | /// Tensor otherwise overflows would occur. 49 | std::unique_ptr 50 | makeAliasedTHCudaTensorFull(THCState* state, 51 | THCudaTensor* in, 52 | const std::vector& sizes, 53 | const folly::Optional>& strides = 54 | folly::none); 55 | 56 | /// See D1581014, this method allocates a full tensor whose storage capacity is 57 | /// greater than strictly requested by torch. 58 | std::unique_ptr 59 | makeTHCudaTensorFull(THCState* state, 60 | std::initializer_list sizes, 61 | std::initializer_list strides = 62 | std::initializer_list()); 63 | 64 | /// Copy a THCudaTensor to a new host-resident Tensor. Does not modify 'tensor'. 65 | thpp::Tensor copyFromCuda(THCState* state, 66 | const THCudaTensor* tensor); 67 | 68 | /// Copy a Tensor to a new THCudaTensor. Does not modify 'tensor'. 
69 | std::unique_ptr 70 | copyToCuda(THCState* state, thpp::Tensor& tensor); 71 | 72 | template 73 | std::unique_ptr cudaAlloc(size_t size) { 74 | T* ptr = nullptr; 75 | const auto err = cudaMalloc(&ptr, size); 76 | 77 | if (!ptr || err == cudaErrorMemoryAllocation) { 78 | throw std::bad_alloc(); 79 | } 80 | 81 | return std::unique_ptr(ptr, CudaDeleter()); 82 | } 83 | 84 | } } } // namespace 85 | -------------------------------------------------------------------------------- /src/DeviceTensorUtils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "THCTensor.h" 6 | 7 | #include 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { 10 | 11 | /// Constructs a DeviceTensor initialized from a THCudaTensor. Will 12 | /// throw if the dimensionality does not match. 13 | template class PtrTraits> 15 | cuda::DeviceTensor 16 | torchToDeviceTensor(THCState* state, THCudaTensor* t); 17 | 18 | template 19 | cuda::DeviceTensor 20 | torchToDeviceTensor(THCState* state, THCudaTensor* t) { 21 | return torchToDeviceTensor(state, t); 22 | } 23 | 24 | template 25 | cuda::DeviceTensor 26 | torchToDeviceTensor(THCState* state, THCudaTensor* t) { 27 | return torchToDeviceTensor(state, t); 28 | } 29 | 30 | /// Constructs a DeviceTensor initialized from a THCudaTensor by 31 | /// upcasting or downcasting the tensor to that of a different 32 | /// dimension. 33 | template class PtrTraits> 35 | cuda::DeviceTensor 36 | torchToDeviceTensorCast(THCState* state, THCudaTensor* t); 37 | 38 | template 39 | cuda::DeviceTensor 40 | torchToDeviceTensorCast(THCState* state, THCudaTensor* t) { 41 | return 42 | torchToDeviceTensorCast(state, t); 43 | } 44 | 45 | template 46 | cuda::DeviceTensor 47 | torchToDeviceTensorCast(THCState* state, THCudaTensor* t) { 48 | return 49 | torchToDeviceTensorCast(state, t); 50 | } 51 | 52 | } } } // namespace 53 | 54 | #include "src/DeviceTensorUtils-inl.h" 55 | -------------------------------------------------------------------------------- /src/FeatureLPPooling.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include 5 | 6 | namespace facebook { namespace deeplearning { namespace torch { 7 | 8 | bool 9 | runFeatureLPPoolingUpdateOutput( 10 | cudaStream_t stream, 11 | const cuda::DeviceTensor& input, 12 | cuda::DeviceTensor& output, 13 | float power, int width, int stride); 14 | 15 | bool 16 | runFeatureLPPoolingUpdateGradInput( 17 | cudaStream_t stream, 18 | const cuda::DeviceTensor& gradOutput, 19 | const cuda::DeviceTensor& input, 20 | const cuda::DeviceTensor& output, 21 | cuda::DeviceTensor& gradInput, 22 | float power, int width, int stride); 23 | 24 | } } } 25 | -------------------------------------------------------------------------------- /src/HalfPrec.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 
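A usage sketch for the RAII device-allocation helpers declared in CudaTensorUtils.h above; the function and the element count n are illustrative, not part of the library:

```cpp
#include <cstddef>
#include "src/CudaTensorUtils.h"

void deviceScratchExample(size_t n) {
  // cudaAlloc<T> takes a size in bytes, throws std::bad_alloc on failure,
  // and returns a unique_ptr whose CudaDeleter calls cudaFree() at scope exit.
  auto scratch =
      facebook::deeplearning::torch::cudaAlloc<float>(n * sizeof(float));
  float* raw = scratch.get();  // raw device pointer, usable in kernel launches
  (void)raw;                   // no explicit cudaFree() needed anywhere
}
```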
2 | 3 | #include "src/HalfPrec.h" 4 | 5 | #include 6 | #include 7 | #include 8 | #include "src/Utils.h" 9 | #include "src/Tensor.h" 10 | #include "src/LuaUtils.h" 11 | #include "THC.h" 12 | 13 | using namespace std; 14 | using namespace facebook::deeplearning::torch; 15 | 16 | namespace { 17 | 18 | // Would be nice to use thrust, but its header files are full of things 19 | // that trip our Werr settings for unused typedefs. 20 | struct HalfTensor { 21 | explicit HalfTensor(THCState* state, THCudaTensor* floats) { 22 | auto sz = THCudaTensor_nElement(state, floats); 23 | auto err = cudaMalloc(&devPtr_, sz * sizeof(half_t)); 24 | if (err != cudaSuccess) { 25 | throw std::runtime_error("failed to cudamalloc HalfTensor"); 26 | } 27 | size_ = sz; 28 | halfprec_ToHalf(THCState_getCurrentStream(state), 29 | THCudaTensor_data(state, floats), devPtr_, size_); 30 | } 31 | 32 | HalfTensor() 33 | : devPtr_(nullptr) 34 | , size_(0) { 35 | } 36 | 37 | ~HalfTensor() { 38 | cudaFree(devPtr_); 39 | } 40 | 41 | void toFloat(THCState* state, THCudaTensor* dest) { 42 | THCudaTensor_resize1d(state, dest, size_); 43 | assert(size_ > 0); 44 | halfprec_ToFloat(THCState_getCurrentStream(state), 45 | devPtr_, THCudaTensor_data(state, dest), size_); 46 | } 47 | 48 | private: 49 | half_t *devPtr_; 50 | size_t size_; 51 | }; 52 | 53 | const char* kLibName = "HalfPrec"; 54 | 55 | int HalfPrec_new(lua_State* l) { 56 | auto dv = new HalfTensor(); 57 | luaT_pushudata(l, dv, kLibName); 58 | return 1; 59 | } 60 | 61 | int HalfPrec_destroy(lua_State* l) { 62 | delete static_cast(luaT_checkudata(l, 1, kLibName)); 63 | return 0; 64 | }; 65 | 66 | int HalfPrec_toHalfCUDA(lua_State* l) { 67 | THCState* state = getCutorchState(l); 68 | auto input = (THCudaTensor*)luaT_checkudata(l, 1, "torch.CudaTensor"); 69 | THAssert(THCudaTensor_checkGPU(state, 1, input)); 70 | auto cinput = THCudaTensor_newContiguous(state, input); 71 | auto dest = new HalfTensor(state, cinput); 72 | 73 | luaT_pushudata(l, dest, kLibName); 74 | THCudaTensor_free(state, cinput); 75 | return 1; 76 | } 77 | 78 | int HalfPrec_toFloatCUDA(lua_State* l) { 79 | THCState* state = getCutorchState(l); 80 | auto input = (HalfTensor*)luaT_checkudata(l, 1, kLibName); 81 | auto dest = THCudaTensor_new(state); 82 | input->toFloat(state, dest); 83 | luaT_pushudata(l, dest, "torch.CudaTensor"); 84 | return 1; 85 | } 86 | 87 | const struct luaL_reg manifest[] = { 88 | {"new", HalfPrec_new}, 89 | {"toHalfCUDA", HalfPrec_toHalfCUDA}, 90 | {"toFloatCUDA", HalfPrec_toFloatCUDA}, 91 | {"free", HalfPrec_destroy}, 92 | {nullptr, nullptr}, 93 | }; 94 | 95 | } 96 | 97 | extern "C" int luaopen_libhalfprec(lua_State* L) { 98 | luaT_newmetatable(L, kLibName, nullptr, 99 | HalfPrec_new, // ctor 100 | HalfPrec_destroy, // dtor 101 | nullptr); 102 | lua_newtable(L); 103 | luaL_register(L, nullptr, manifest); 104 | return 1; 105 | } 106 | -------------------------------------------------------------------------------- /src/HalfPrec.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 
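HalfPrec.cpp above funnels all conversions through the two entry points declared below in HalfPrec.h. A round-trip sketch, assuming devIn, devHalf, and devOut are device pointers with room for n elements:

```cpp
#include "src/HalfPrec.h"

void halfRoundTrip(cudaStream_t stream, const float* devIn, half_t* devHalf,
                   float* devOut, size_t n) {
  halfprec_ToHalf(stream, devIn, devHalf, n);    // fp32 -> fp16 bit patterns
  halfprec_ToFloat(stream, devHalf, devOut, n);  // fp16 bit patterns -> fp32
  // The conversion is lossy: HalfPrecTest.cpp further down accepts a relative
  // error of about 1/1000 on round-tripped values.
}
```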
2 | 3 | #pragma once 4 | #include 5 | #include 6 | #include 7 | 8 | typedef uint16_t half_t; 9 | 10 | void halfprec_ToHalf(cudaStream_t stream, 11 | const float* input, 12 | half_t* output, 13 | size_t n); 14 | 15 | void halfprec_ToFloat(cudaStream_t stream, 16 | const half_t* input, 17 | float* output, 18 | size_t n); 19 | -------------------------------------------------------------------------------- /src/HalfPrecKernels.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include "src/HalfPrec.h" 8 | #include "src/util/Transform.cuh" 9 | 10 | using namespace facebook::cuda; 11 | void halfprec_ToHalf(cudaStream_t stream, 12 | const float* input, 13 | half_t* output, 14 | size_t n) { 15 | transform(stream, input, output, n); 16 | } 17 | 18 | void halfprec_ToFloat(cudaStream_t stream, 19 | const half_t* input, 20 | float* output, 21 | size_t n) { 22 | transform(stream, input, output, n); 23 | } 24 | -------------------------------------------------------------------------------- /src/HalfPrecTest.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void cudaCheck(cudaError_t e) { 10 | auto toStr = [&] { 11 | return std::string(cudaGetErrorString(e)); 12 | }; 13 | if (e != cudaSuccess) { 14 | throw std::runtime_error("cuda failure: " + toStr()); 15 | } 16 | e = cudaDeviceSynchronize(); 17 | if (e != cudaSuccess) { 18 | throw std::runtime_error("cuda failure @ synchronize: " + toStr()); 19 | } 20 | } 21 | 22 | template 23 | class CUDA { 24 | public: 25 | explicit CUDA(size_t n) 26 | : n_(n) { 27 | cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T))); 28 | cudaCheck(cudaMemset(vals_, 0, n_ * sizeof(T))); 29 | } 30 | 31 | CUDA(const T* base, size_t n) : 32 | n_(n) { 33 | cudaCheck(cudaMalloc(&vals_, n_ * sizeof(T))); 34 | cudaCheck(cudaMemcpy(vals_, base, n_ * sizeof(T), cudaMemcpyHostToDevice)); 35 | } 36 | 37 | void toHost(T* base) const { 38 | cudaCheck(cudaMemcpy(base, vals_, n_ * sizeof(T), cudaMemcpyDeviceToHost)); 39 | } 40 | 41 | size_t size() const { 42 | return n_; 43 | } 44 | 45 | ~CUDA() { 46 | cudaCheck(cudaFree(vals_)); 47 | } 48 | 49 | T* data() { 50 | return vals_; 51 | } 52 | 53 | private: 54 | T* vals_; 55 | size_t n_; 56 | }; 57 | 58 | TEST(HalfPrec, cuda) { 59 | float hostFloats[] = { 60 | -1, 61 | -100, 62 | 2.3, 63 | 0.0, 64 | 1.0, 65 | 3867.2, 66 | }; 67 | const auto N = sizeof(hostFloats) / sizeof(float); 68 | CUDA devFloats(hostFloats, N); 69 | CUDA devHalfs(N); 70 | 71 | halfprec_ToHalf(nullptr, devFloats.data(), devHalfs.data(), devFloats.size()); 72 | cudaCheck(cudaDeviceSynchronize()); 73 | 74 | { 75 | uint16_t cpuHalfs[N] = { 666 }; 76 | facebook::math::Float16::encode(cpuHalfs, hostFloats, N); 77 | 78 | half_t convertedHalfs[N]; 79 | devHalfs.toHost(convertedHalfs); 80 | for (int i = 0; i < N; i++) { 81 | // The CPU and GPU disagree by a digit sometimes because the GPU 82 | // is using a different rounding mode. 
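// (Hence the +/-1 tolerance on the raw encoded bit patterns in the EXPECT_NEAR below.)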
83 | EXPECT_NEAR(cpuHalfs[i], convertedHalfs[i], 1); 84 | } 85 | } 86 | 87 | CUDA exploded(N); 88 | halfprec_ToFloat(nullptr, devHalfs.data(), exploded.data(), N); 89 | float postExpl[N]; 90 | exploded.toHost(postExpl); 91 | for (int i = 0; i < N; i++) { 92 | auto thousandth = fabs(hostFloats[i] / 1000.0); 93 | EXPECT_NEAR(postExpl[i], hostFloats[i], thousandth); 94 | } 95 | } 96 | 97 | int halfSign(half_t h) { 98 | return (h & (1 << 15)) >> 15; 99 | } 100 | 101 | int halfExp(half_t h) { 102 | return (h >> 10) & 31; 103 | } 104 | 105 | int halfMant(half_t h) { 106 | return h & 1023; 107 | } 108 | 109 | TEST(HalfPrec, exhaustive) { 110 | const auto N = 1 << 16; 111 | 112 | half_t hostHalfs[N]; 113 | float hostFloats[N]; 114 | for (int i = 0; i < N; i++) { 115 | hostHalfs[i] = i; 116 | } 117 | facebook::math::Float16::decode(hostFloats, hostHalfs, N); 118 | 119 | CUDA devHalfs(hostHalfs, N); 120 | CUDA devFloats(N); 121 | float devOut[N]; 122 | halfprec_ToFloat(nullptr, devHalfs.data(), devFloats.data(), N); 123 | devFloats.toHost(devOut); 124 | for (int i = 0; i < N; i++) { 125 | if (halfExp(i) == 0) continue; // subnormals 126 | if (halfExp(i) == 31) continue; // inf/nan 127 | if (hostFloats[i] != devOut[i]) { 128 | printf("failure: %d %x s/e/m %01x %03x %04x\n", 129 | i, i, halfSign(i), halfExp(i), halfMant(i)); 130 | EXPECT_EQ(hostFloats[i], devOut[i]); 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/Includes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "thpp/Tensor.h" 4 | #include "thpp/Storage.h" 5 | #include "fblualib/LuaUtils.h" 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | 9 | using namespace thpp; 10 | using namespace fblualib; 11 | 12 | }}} // namespaces 13 | -------------------------------------------------------------------------------- /src/InitCuda.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #include 7 | 8 | #ifdef FB_INTERNAL 9 | #else 10 | #define LUAOPEN(x) luaopen_fbcunn_cuda_ext(x) 11 | #endif 12 | 13 | namespace facebook { namespace deeplearning { namespace torch { 14 | 15 | void initCrossMapNormalizationCuda(lua_State* L); 16 | void initLocallyConnectedCuda(lua_State* L); 17 | void initLookupTableGPUCuda(lua_State* L); 18 | void initHSMCuda(lua_State* L); 19 | void initTemporalConvolutionFB(lua_State *L); 20 | void initTemporalKMaxPoolingCuda(lua_State* L); 21 | void initOneBitQuantizationCuda(lua_State* L); 22 | void initSparseNLLCriterionCuda(lua_State* L); 23 | void initTemporalConvolutionTBCCuda(lua_State* L); 24 | void initFeatureLPPoolingCuda(lua_State* L); 25 | void initCuBLASWrapper(lua_State *L); 26 | // void initFFTWrapper(lua_State *L); 27 | // void initSpatialConvolutionCuFFT(lua_State *L); 28 | void initWeightedLookupTableCuda(lua_State *L); 29 | 30 | }}} // namespace 31 | 32 | using namespace facebook::deeplearning::torch; 33 | 34 | extern "C" int LUAOPEN(lua_State* L) { 35 | initCrossMapNormalizationCuda(L); 36 | initLocallyConnectedCuda(L); 37 | initLookupTableGPUCuda(L); 38 | initTemporalConvolutionFB(L); 39 | initTemporalKMaxPoolingCuda(L); 40 | initHSMCuda(L); 41 | initOneBitQuantizationCuda(L); 42 | initSparseNLLCriterionCuda(L); 43 | initTemporalConvolutionTBCCuda(L); 44 | initFeatureLPPoolingCuda(L); 45 | initCuBLASWrapper(L); 46 | // 
initFFTWrapper(L); 47 | // initSpatialConvolutionCuFFT(L); 48 | initWeightedLookupTableCuda(L); 49 | 50 | return 0; 51 | } 52 | -------------------------------------------------------------------------------- /src/LookupTableGPU.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/CudaUtils.cuh" 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "cuda/WarpReductions.cuh" 6 | 7 | using namespace facebook::cuda; 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { 10 | namespace detail { 11 | 12 | namespace { 13 | 14 | // for updateOutput 15 | 16 | __device__ __forceinline__ int getBatch() { 17 | return blockIdx.x; 18 | } 19 | 20 | __device__ __forceinline__ int getLookupElement() { 21 | return blockIdx.y; 22 | } 23 | 24 | // // for accGradParameters 25 | 26 | __device__ __forceinline__ int getFeatureDim() { 27 | // Each warp runs effectively independently, but there is slightly 28 | // better utilization if each block has at least 4 warps. 29 | int warpId = threadIdx.x / 32; 30 | return blockIdx.x * 4 + warpId; 31 | } 32 | 33 | // Feature dimension is always innermost. Depending on tensor layout, 34 | // it may or may not be contiguous. 35 | __global__ void updateOutputKernel(DeviceTensor input, 36 | DeviceTensor weight, 37 | DeviceTensor output) { 38 | int weightIndex = (int)(input[getBatch()][getLookupElement()] - 0.5f); 39 | 40 | for (int i = threadIdx.x; i < weight.getSize(1); i += blockDim.x) { 41 | output[getBatch()][getLookupElement()][i] = weight[weightIndex][i]; 42 | } 43 | } 44 | 45 | __global__ void accGradParametersKernel(DeviceTensor input, 46 | DeviceTensor gradOutput, 47 | DeviceTensor gradWeight, 48 | float scale) { 49 | const int featureDim = getFeatureDim(); 50 | if (featureDim >= gradWeight.getSize(1)) { 51 | return; 52 | } 53 | 54 | // The strategy here is that each warp handles a single feature 55 | // dimension. 56 | // Within that feature dimension, points in the [batch][element] 57 | // dimension can overlap, and we need to determine if threads want 58 | // to add to the gradient in a colliding manner. 59 | // Typically one would use floating-point atomicAdd() to resolve 60 | // these collisions, but that is non-deterministic if there are 61 | // collisions. Non-determinism for this code is really bad, 62 | // especially in RNNs, and is prone to snowballing error. 63 | // In order to get a deterministic order of execution, we handle 64 | // non-colliding updates separately from colliding ones. Colliding 65 | // updates are serialized in their order of execution by using the 66 | // warp-wide collision detector `warpHasCollision`. 
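// (Each lane starts at its lane id and strides by WARP_SIZE over the flattened
// [batch][element] grid, so the traversal order, and with it the summation
// order, is identical on every run.)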
67 | unsigned int maxLinearIndex = input.getSize(0) * input.getSize(1); 68 | for (unsigned int i = getLaneId(); i < maxLinearIndex; i += WARP_SIZE) { 69 | unsigned int batch = i / input.getSize(1); 70 | unsigned int lookupElement = i % input.getSize(1); 71 | 72 | int weightIndex = (int) (input[batch][lookupElement].ldg() - 0.5f); 73 | float update = gradOutput[batch][lookupElement][featureDim] * scale; 74 | 75 | // Check for collision 76 | if (warpHasCollision(weightIndex)) { 77 | // Run all lanes sequentially; warp divergence 78 | for (int i = 0; i < WARP_SIZE; ++i) { 79 | if (getLaneId() == i) { 80 | gradWeight[weightIndex][featureDim] += update; 81 | } 82 | } 83 | } else { 84 | // No collision; warp coherence 85 | gradWeight[weightIndex][featureDim] += update; 86 | } 87 | } 88 | } 89 | 90 | } // namespace 91 | 92 | typedef DeviceTensor DeviceTensor2; 93 | typedef DeviceTensor DeviceTensor3; 94 | 95 | void launchLookupTableGPUUpdateOutputKernel(cudaStream_t stream, 96 | DeviceTensor2& input, 97 | DeviceTensor2& weight, 98 | DeviceTensor3& output) { 99 | const dim3 grid(input.getSize(0), input.getSize(1)); 100 | const dim3 block(min(weight.getSize(1), 1024)); 101 | 102 | updateOutputKernel<<>>(input, weight, output); 103 | } 104 | 105 | void launchLookupTableGPUAccGradParametersKernel(cudaStream_t stream, 106 | DeviceTensor2& input, 107 | DeviceTensor3& gradOutput, 108 | DeviceTensor2& gradWeight, 109 | float scale) { 110 | // Target 4 warps/block for better utilization. Even if the input 111 | // doesn't have that many dimensions, the blocks/warps not 112 | // participating will just exit immediately. 113 | const dim3 grid(ceil(gradOutput.getSize(2), 4)); 114 | const dim3 block(32 * 4); 115 | 116 | accGradParametersKernel<<>>( 117 | input, gradOutput, gradWeight, scale); 118 | } 119 | 120 | }}}} // namespaces 121 | -------------------------------------------------------------------------------- /src/LuaUtils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "fblualib/LuaUtils.h" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | using namespace fblualib; 8 | 9 | }}} // namespaces 10 | -------------------------------------------------------------------------------- /src/MM.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
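The collision-handling idiom from accGradParametersKernel in LookupTableGPU.cu above, isolated as a sketch; warpHasCollision() and getLaneId() are the fbcuda helpers that kernel already uses:

```cpp
#include "cuda/CudaUtils.cuh"       // fbcuda helpers, as included by
#include "cuda/WarpReductions.cuh"  // LookupTableGPU.cu above

using namespace facebook::cuda;

// Each lane adds 'update' to its own slot 'addr', where 'key' identifies the
// slot. Colliding lanes are serialized in lane order, so the floating-point
// summation order, and therefore the result, is deterministic.
__device__ void deterministicWarpAdd(float* addr, int key, float update) {
  if (warpHasCollision(key)) {
    for (int lane = 0; lane < 32; ++lane) {
      if (getLaneId() == lane) {
        *addr += update;  // lanes take turns; fixed order, reproducible sum
      }
    }
  } else {
    *addr += update;  // no two lanes share 'key'; direct update is safe
  }
}
```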
2 | 3 | #include "DeviceTensorUtils.h" 4 | #include "THCTensor.h" 5 | 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "cuda/MM.cuh" 8 | 9 | 10 | using namespace facebook::cuda; 11 | 12 | namespace facebook { namespace deeplearning { namespace torch { 13 | 14 | template 15 | 16 | void transposeMM(DeviceTensor& A, 17 | DeviceTensor& B, 18 | DeviceTensor& C, 19 | float invNorm, 20 | cudaStream_t s = 0) { 21 | facebook::cuda::transposeMM 22 | ( 23 | A, B, C, invNorm, s); 24 | } 25 | 26 | #define INSTANTIATE_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ 27 | template void transposeMM( \ 28 | DeviceTensor& A, \ 29 | DeviceTensor& B, \ 30 | DeviceTensor& C, \ 31 | float invNorm, \ 32 | cudaStream_t s); 33 | 34 | INSTANTIATE_TRANSPOSE_MM(5, true, false, true); 35 | INSTANTIATE_TRANSPOSE_MM(5, false, true, true); 36 | INSTANTIATE_TRANSPOSE_MM(5, false, false, true); 37 | INSTANTIATE_TRANSPOSE_MM(5, true, false, false); 38 | INSTANTIATE_TRANSPOSE_MM(5, false, true, false); 39 | INSTANTIATE_TRANSPOSE_MM(5, false, false, false); 40 | 41 | #define CALL_TRANSPOSE_MM(DIM, CONJA, CONJB, ACC) \ 42 | if (THCudaTensor_nDimension(state, tA) == DIM && \ 43 | conjugateTransposeA == CONJA && \ 44 | conjugateTransposeB == CONJB && \ 45 | accumulate == ACC) { \ 46 | DeviceTensor A = torchToDeviceTensor(state, tA); \ 47 | DeviceTensor B = torchToDeviceTensor(state, tB); \ 48 | DeviceTensor C = torchToDeviceTensor(state, tC); \ 49 | facebook::deeplearning::torch::transposeMM( \ 50 | A, B, C, invNorm, THCState_getCurrentStream(state)); \ 51 | return; \ 52 | } 53 | 54 | extern "C" void transposeMMFFI(THCState* state, 55 | THCudaTensor* tA, 56 | THCudaTensor* tB, 57 | THCudaTensor* tC, 58 | float invNorm, 59 | bool conjugateTransposeA, 60 | bool conjugateTransposeB, 61 | bool accumulate) { 62 | CHECK_EQ(THCudaTensor_nDimension(state, tA), 63 | THCudaTensor_nDimension(state, tB)); 64 | CHECK_EQ(THCudaTensor_nDimension(state, tA), 65 | THCudaTensor_nDimension(state, tC)); 66 | 67 | CALL_TRANSPOSE_MM(5, true, false, true); 68 | CALL_TRANSPOSE_MM(5, false, true, true); 69 | CALL_TRANSPOSE_MM(5, false, false, true); 70 | CALL_TRANSPOSE_MM(5, true, false, false); 71 | CALL_TRANSPOSE_MM(5, false, true, false); 72 | CALL_TRANSPOSE_MM(5, false, false, false); 73 | } 74 | 75 | #undef INSTANTIATE_TRANSPOSE_MM 76 | 77 | }}} 78 | -------------------------------------------------------------------------------- /src/MM.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include "cuda/DeviceTensor.cuh" 6 | 7 | #include 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { 10 | 11 | template 12 | 13 | void transposeMM(facebook::cuda::DeviceTensor& A, 14 | facebook::cuda::DeviceTensor& B, 15 | facebook::cuda::DeviceTensor& C, 16 | float invNorm, 17 | cudaStream_t s = 0); 18 | 19 | }}} 20 | -------------------------------------------------------------------------------- /src/OneBitQuantization.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
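For reference, one CALL_TRANSPOSE_MM expansion in MM.cu above behaves like the explicit branch sketched here. The template arguments are an assumption reconstructed from the INSTANTIATE_TRANSPOSE_MM list, since the dump dropped every <...> parameter list:

```cpp
#include "THC.h"
#include "src/DeviceTensorUtils.h"
#include "src/MM.h"

void transposeMMBranchExample(THCState* state, THCudaTensor* tA,
                              THCudaTensor* tB, THCudaTensor* tC,
                              float invNorm) {
  // Guard on the runtime flags, then dispatch to one static instantiation;
  // this branch corresponds to CALL_TRANSPOSE_MM(5, true, false, true).
  if (THCudaTensor_nDimension(state, tA) == 5) {
    auto A = facebook::deeplearning::torch::torchToDeviceTensor<float, 5>(
        state, tA);
    auto B = facebook::deeplearning::torch::torchToDeviceTensor<float, 5>(
        state, tB);
    auto C = facebook::deeplearning::torch::torchToDeviceTensor<float, 5>(
        state, tC);
    // conjugateTransposeA = true, conjugateTransposeB = false, accumulate = true
    facebook::deeplearning::torch::transposeMM<5, true, false, true>(
        A, B, C, invNorm, THCState_getCurrentStream(state));
  }
}
```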
2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | void 8 | runQuantize1Bit(cudaStream_t stream, 9 | const cuda::DeviceTensor& in, 10 | cuda::DeviceTensor& out, 11 | cuda::DeviceTensor& quantizationError, 12 | cuda::DeviceTensor& avgPos, 13 | cuda::DeviceTensor& avgNeg); 14 | 15 | void 16 | runDequantize1Bit(cudaStream_t stream, 17 | const cuda::DeviceTensor& in, 18 | const cuda::DeviceTensor& avgPos, 19 | const cuda::DeviceTensor& avgNeg, 20 | cuda::DeviceTensor& out); 21 | 22 | } } } 23 | -------------------------------------------------------------------------------- /src/OneBitQuantizationHost.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include "src/DeviceTensorUtils.h" 5 | #include "src/Utils.h" 6 | #include "THC.h" 7 | #include "THCTensor.h" 8 | #include "src/OneBitQuantization.cuh" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace facebook::cuda; 17 | 18 | namespace facebook { namespace deeplearning { namespace torch { 19 | 20 | namespace { 21 | 22 | constexpr int kNumBits = sizeof(unsigned) * 8; 23 | 24 | constexpr int toQuantizedSize(int size) { 25 | return (size + kNumBits - 1) / kNumBits; 26 | } 27 | 28 | int quantize(lua_State *L) { 29 | THCState* state = getCutorchState(L); 30 | auto nonQuantizedTH = (THCudaTensor*)luaT_checkudata( 31 | L, 2, "torch.CudaTensor"); 32 | auto quantizedTH = (THCudaTensor*)luaT_getfieldcheckudata( 33 | L, 1, "quantized", "torch.CudaTensor"); 34 | auto quantizationErrorTH = (THCudaTensor*)luaT_getfieldcheckudata( 35 | L, 1, "quantization_error", "torch.CudaTensor"); 36 | auto avgPosTH = (THCudaTensor*)luaT_getfieldcheckudata( 37 | L, 1, "avg_pos", "torch.CudaTensor"); 38 | auto avgNegTH = (THCudaTensor*)luaT_getfieldcheckudata( 39 | L, 1, "avg_neg", "torch.CudaTensor"); 40 | 41 | THAssert(THCudaTensor_checkGPU(state, 5, nonQuantizedTH, quantizedTH, 42 | quantizationErrorTH, avgPosTH, avgNegTH)); 43 | // The input should be two-dimensional 44 | luaL_argcheck(L, THCudaTensor_nDimension(state, nonQuantizedTH) == 2, 2, 45 | "non_quantized_input should be 2d"); 46 | 47 | const auto rows = THCudaTensor_size(state, nonQuantizedTH, 0); 48 | const auto cols = THCudaTensor_size(state, nonQuantizedTH, 1); 49 | 50 | // Make sure that the outputs are properly sized 51 | THCudaTensor_resize2d(state, quantizedTH, rows, toQuantizedSize(cols)); 52 | THCudaTensor_resize2d(state, quantizationErrorTH, rows, cols); 53 | THCudaTensor_resize1d(state, avgPosTH, rows); 54 | THCudaTensor_resize1d(state, avgNegTH, rows); 55 | 56 | DeviceTensor nonQuantized = 57 | torchToDeviceTensor(state, nonQuantizedTH); 58 | DeviceTensor quantized = 59 | torchToDeviceTensor(state, quantizedTH); 60 | DeviceTensor quantizationError = 61 | torchToDeviceTensor(state, quantizationErrorTH); 62 | DeviceTensor avgPos = 63 | torchToDeviceTensor(state, avgPosTH); 64 | DeviceTensor avgNeg = 65 | torchToDeviceTensor(state, avgNegTH); 66 | 67 | runQuantize1Bit(THCState_getCurrentStream(state), 68 | nonQuantized, quantized, quantizationError, avgPos, avgNeg); 69 | 70 | return 0; 71 | } 72 | 73 | int dequantize(lua_State *L) { 74 | THCState* state = getCutorchState(L); 75 | auto quantizedTH = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 76 | auto avgPosTH = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 77 | auto avgNegTH = 
(THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); 78 | auto nonQuantizedCols = luaL_checkint(L, 5); 79 | auto nonQuantizedTH = (THCudaTensor*)luaT_getfieldcheckudata( 80 | L, 1, "non_quantized", "torch.CudaTensor"); 81 | 82 | THAssert(THCudaTensor_checkGPU(state, 4, nonQuantizedTH, quantizedTH, 83 | avgPosTH, avgNegTH)); 84 | // The input should be two-dimensional 85 | luaL_argcheck(L, THCudaTensor_nDimension(state, quantizedTH) == 2, 2, 86 | "input should be 2d"); 87 | 88 | const auto rows = THCudaTensor_size(state, quantizedTH, 0); 89 | const auto quantizedCols = THCudaTensor_size(state, quantizedTH, 1); 90 | 91 | // The input should be within appropriate quantization sizes 92 | luaL_argcheck(L, quantizedCols == toQuantizedSize(nonQuantizedCols), 5, 93 | "num_orig_cols does not match quantized_input cols"); 94 | luaL_argcheck(L, THCudaTensor_size(state, avgPosTH, 0) == rows, 3, 95 | "avg_pos size doesn't match quantized_input rows"); 96 | luaL_argcheck(L, THCudaTensor_size(state, avgNegTH, 0) == rows, 4, 97 | "avg_neg size doesn't match quantized_input rows"); 98 | 99 | // Make sure that the outputs are properly sized 100 | THCudaTensor_resize2d(state, nonQuantizedTH, rows, nonQuantizedCols); 101 | 102 | DeviceTensor quantized = 103 | torchToDeviceTensor(state, quantizedTH); 104 | DeviceTensor avgPos = 105 | torchToDeviceTensor(state, avgPosTH); 106 | DeviceTensor avgNeg = 107 | torchToDeviceTensor(state, avgNegTH); 108 | DeviceTensor nonQuantized = 109 | torchToDeviceTensor(state, nonQuantizedTH); 110 | 111 | runDequantize1Bit(THCState_getCurrentStream(state), 112 | quantized, avgPos, avgNeg, nonQuantized); 113 | 114 | return 0; 115 | } 116 | 117 | const luaL_Reg functions [] = { 118 | {"OneBitQuantization_quantize", quantize}, 119 | {"OneBitQuantization_dequantize", dequantize}, 120 | {nullptr, nullptr} 121 | }; 122 | 123 | } // namespace 124 | 125 | void initOneBitQuantizationCuda(lua_State *L) { 126 | luaT_pushmetatable(L, "torch.CudaTensor"); 127 | luaT_registeratname(L, functions, "nn"); 128 | lua_pop(L,1); 129 | } 130 | 131 | }}} // namespaces 132 | -------------------------------------------------------------------------------- /src/SparseNLLCriterion.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Michael Mathieu (myrhev@fb.com) 4 | */ 5 | 6 | #include "cuda/CudaUtils.cuh" 7 | #include "cuda/WarpReductions.cuh" 8 | #include "cuda/util/CachedDeviceProperties.h" 9 | 10 | #include "SparseNLLCriterion.cuh" 11 | 12 | using namespace facebook::cuda; 13 | 14 | namespace facebook { namespace deeplearning { namespace torch { 15 | namespace detail { 16 | 17 | namespace { 18 | 19 | // only one block. 
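// (the criterion reduces to a single scalar through shared memory, so one
// block suffices)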
20 | // threadIdx.x splits K (ideally, is equal to K) 21 | // threadIdx.y splits batchSize 22 | __global__ void 23 | updateOutput(const DeviceTensor targetIdx, 24 | const DeviceTensor targetP, 25 | const DeviceTensor input, 26 | DeviceTensor output, 27 | const int batchSize, 28 | const int K) { 29 | extern __shared__ float buffer[]; 30 | 31 | // map (sum the correct input multiplied by the probabilities) 32 | float local_sum = 0.f; 33 | for (int i = threadIdx.y; i < batchSize; i += blockDim.y) { 34 | for (int j = threadIdx.x; j < K; j += blockDim.x) { 35 | local_sum += input[i][(int)(targetIdx[i][j] - 1)] * targetP[i][j]; 36 | } 37 | } 38 | 39 | // reduce (sum all) 40 | local_sum = cuda::warpReduceSum(local_sum); 41 | if (cuda::getLaneId() == 0) 42 | buffer[cuda::getWarpId()] = local_sum; 43 | __syncthreads(); 44 | if ((threadIdx.x == 0) && (threadIdx.y == 0)) { 45 | local_sum = 0.f; 46 | for (int i = 0; i < cuda::ceil(blockDim.x * blockDim.y, 32u); ++i) { 47 | local_sum += buffer[i]; 48 | } 49 | output[0] = -local_sum; 50 | } 51 | } 52 | 53 | // blockIdx.x * threadIdx.y splits batchSize 54 | // threadIdx.x splits K (ideally is equal to K) 55 | __global__ void 56 | updateGradInput(const DeviceTensor targetIdx, 57 | const DeviceTensor targetP, 58 | DeviceTensor gradInput, 59 | int batchSize, int K) { 60 | const int batch_idx = blockIdx.x * blockDim.y + threadIdx.y; 61 | const int batch_dim = gridDim.x * blockDim.y; 62 | for (int i = batch_idx; i < batchSize; i += batch_dim) { 63 | for (int j = threadIdx.x; j < K; j += blockDim.x) { 64 | gradInput[i][(int)(targetIdx[i][j] - 0.5)] = - targetP[i][j]; 65 | } 66 | } 67 | } 68 | 69 | } // namespace 70 | 71 | void runSparseNLLCriterion_updateOutput( 72 | cudaStream_t stream, 73 | const DeviceTensor& targetIdx, 74 | const DeviceTensor& targetP, 75 | const DeviceTensor& input, 76 | DeviceTensor& output) { 77 | 78 | const cudaDeviceProp& deviceProperties = 79 | facebook::cuda::getCurrentDeviceProperties(); 80 | const int maxThreads = deviceProperties.maxThreadsPerBlock; 81 | 82 | const int batchSize = targetP.getSize(0); 83 | const int K = targetP.getSize(1); 84 | dim3 blocks(1, 1, 1); 85 | int threadsx = min(K, maxThreads); 86 | dim3 threads(threadsx, max(1, maxThreads/threadsx), 1); 87 | size_t sharedSize = cuda::ceil(threads.x * threads.y * sizeof(float), 88 | (size_t)deviceProperties.warpSize); 89 | updateOutput<<>>( 90 | targetIdx, targetP, input, output, batchSize, K); 91 | } 92 | 93 | void runSparseNLLCriterion_updateGradInput( 94 | cudaStream_t stream, 95 | const DeviceTensor& targetIdx, 96 | const DeviceTensor& targetP, 97 | DeviceTensor& gradInput) { 98 | 99 | const cudaDeviceProp& deviceProperties = 100 | facebook::cuda::getCurrentDeviceProperties(); 101 | 102 | const int batchSize = targetP.getSize(0); 103 | const int K = targetP.getSize(1); 104 | const int nClasses = gradInput.getSize(1); 105 | cudaMemsetAsync(gradInput.data(), 0, nClasses * batchSize * sizeof(float), stream); 106 | int threadsx = min(K, deviceProperties.maxThreadsPerBlock); 107 | int threadsy = (threadsx > 128) ? 
1 : (256 / threadsx); 108 | dim3 threads(threadsx, threadsy, 1); 109 | dim3 blocks(max(1, batchSize / threadsy), 1, 1); 110 | updateGradInput<<<blocks, threads, 0, stream>>>( 111 | targetIdx, targetP, gradInput, batchSize, K); 112 | } 113 | 114 | }}}} // namespaces 115 | -------------------------------------------------------------------------------- /src/SparseNLLCriterion.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | namespace detail { 7 | 8 | void runSparseNLLCriterion_updateOutput( 9 | cudaStream_t stream, 10 | const cuda::DeviceTensor<float, 2>& targetIdx, 11 | const cuda::DeviceTensor<float, 2>& targetP, 12 | const cuda::DeviceTensor<float, 2>& input, 13 | cuda::DeviceTensor<float, 1>& output); 14 | 15 | void runSparseNLLCriterion_updateGradInput( 16 | cudaStream_t stream, 17 | const cuda::DeviceTensor<float, 2>& targetIdx, 18 | const cuda::DeviceTensor<float, 2>& targetP, 19 | cuda::DeviceTensor<float, 2>& gradInput); 20 | 21 | }}}} 22 | -------------------------------------------------------------------------------- /src/SparseNLLCriterionHost.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include "src/Utils.h" 5 | #include "src/DeviceTensorUtils.h" 6 | #include "THC.h" 7 | #include "THCTensor.h" 8 | #include "src/SparseNLLCriterion.cuh" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace facebook::cuda; 17 | 18 | namespace facebook { namespace deeplearning { namespace torch { 19 | 20 | namespace { 21 | 22 | inline THCudaTensor* getFieldCudaTensor(lua_State* L, int arg, 23 | const char* name) { 24 | return static_cast<THCudaTensor*>(luaT_getfieldcheckudata( 25 | L, arg, name, "torch.CudaTensor")); 26 | } 27 | inline THCudaTensor* getCudaTensor(lua_State* L, int arg) { 28 | return static_cast<THCudaTensor*>(luaT_checkudata(L, arg, 29 | "torch.CudaTensor")); 30 | } 31 | 32 | int updateOutput(lua_State *L) { 33 | THCState* state = getCutorchState(L); 34 | auto output = (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "output", 35 | "torch.CudaTensor"); 36 | auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 37 | auto targetP = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 38 | auto targetIdx = (THCudaTensor*)luaT_checkudata(L, 4, "torch.CudaTensor"); 39 | auto batchSize = targetP->size[0]; 40 | auto K = targetP->size[1]; 41 | 42 | THAssert(THCudaTensor_checkGPU(state, 4, input, output, targetP, targetIdx)); 43 | luaL_argcheck(L, (output->nDimension == 1) && (output->size[0] == 1), 44 | 1, "output has wrong dimension"); 45 | luaL_argcheck(L, (input->nDimension == 2) && (input->size[0] == batchSize) 46 | && (THCudaTensor_isContiguous(state, input)), 47 | 2, "input has wrong dimension"); 48 | luaL_argcheck(L, (targetP->nDimension == 2) 49 | && (THCudaTensor_isContiguous(state, targetP)), 50 | 3, "targetP has wrong dimension"); 51 | luaL_argcheck(L, (targetIdx->nDimension == 2) 52 | && (targetIdx->size[0] == batchSize) 53 | && (targetIdx->size[1] == K) 54 | && (THCudaTensor_isContiguous(state, targetIdx)), 55 | 4, "targetIdx has wrong dimension"); 56 | 57 | auto targetIdxDev = torchToDeviceTensor<float, 2>(state, targetIdx); 58 | auto targetPDev = torchToDeviceTensor<float, 2>(state, targetP); 59 | auto inputDev = torchToDeviceTensor<float, 2>(state, input); 60 | auto outputDev = torchToDeviceTensor<float, 1>(state, output); 61 | 62 | 
detail::runSparseNLLCriterion_updateOutput( 63 | THCState_getCurrentStream(state), 64 | targetIdxDev, targetPDev, 65 | inputDev, outputDev); 66 | 67 | return 0; 68 | } 69 | 70 | int updateGradInput(lua_State *L) { 71 | THCState* state = getCutorchState(L); 72 | auto gradInput = 73 | (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "gradInput", 74 | "torch.CudaTensor"); 75 | auto targetP = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 76 | auto targetIdx = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 77 | auto batchSize = targetP->size[0]; 78 | auto K = targetP->size[1]; 79 | 80 | THAssert(THCudaTensor_checkGPU(state, 3, gradInput, targetP, targetIdx)); 81 | luaL_argcheck(L, (gradInput->nDimension == 2) 82 | && (gradInput->size[0] == batchSize) 83 | && (THCudaTensor_isContiguous(state, gradInput)), 84 | 1, "gradInput has wrong dimension"); 85 | luaL_argcheck(L, (targetP->nDimension == 2) 86 | && (THCudaTensor_isContiguous(state, targetP)), 87 | 2, "targetP has wrong dimension"); 88 | luaL_argcheck(L, (targetIdx->nDimension == 2) 89 | && (targetIdx->size[0] == batchSize) 90 | && (targetIdx->size[1] == K) 91 | && (THCudaTensor_isContiguous(state, targetIdx)), 92 | 3, "targetIdx has wrong dimension"); 93 | 94 | auto targetIdxDev = torchToDeviceTensor(state, targetIdx); 95 | auto targetPDev = torchToDeviceTensor(state, targetP); 96 | auto gradInputDev = torchToDeviceTensor(state, gradInput); 97 | 98 | detail::runSparseNLLCriterion_updateGradInput( 99 | THCState_getCurrentStream(state), 100 | targetIdxDev, targetPDev, 101 | gradInputDev); 102 | 103 | return 0; 104 | } 105 | 106 | const luaL_Reg functions [] = { 107 | {"SparseNLLCriterion_updateOutput", updateOutput}, 108 | {"SparseNLLCriterion_updateGradInput", updateGradInput}, 109 | {nullptr, nullptr} 110 | }; 111 | 112 | } // namespace 113 | 114 | void initSparseNLLCriterionCuda(lua_State *L) { 115 | luaT_pushmetatable(L, "torch.CudaTensor"); 116 | luaT_registeratname(L, functions, "nn"); 117 | lua_pop(L, 1); 118 | } 119 | 120 | }}} // namespaces 121 | -------------------------------------------------------------------------------- /src/Storage.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "thpp/Storage.h" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | using namespace thpp; 8 | 9 | }}} // namespaces 10 | -------------------------------------------------------------------------------- /src/TemporalConvolutionTBC.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
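Reference semantics for the SparseNLLCriterion kernels above, written as a plain CPU loop; a sketch for exposition, not library code:

```cpp
// targetIdx holds 1-based class indices stored as floats; targetP holds the
// probability mass placed on each of the K sparse targets.
float sparseNLLForwardReference(const float* input,      // [batchSize][nClasses]
                                const float* targetIdx,  // [batchSize][K]
                                const float* targetP,    // [batchSize][K]
                                int batchSize, int nClasses, int K) {
  float sum = 0.f;
  for (int i = 0; i < batchSize; ++i) {
    for (int j = 0; j < K; ++j) {
      int cls = (int)(targetIdx[i * K + j] - 1);  // 1-based -> 0-based
      sum += input[i * nClasses + cls] * targetP[i * K + j];
    }
  }
  // The backward pass zero-fills gradInput and writes -targetP[i][j] at
  // [i][cls], matching updateGradInput above.
  return -sum;
}
```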
2 | 3 | #include "THCGeneral.h" 4 | #include "cuda/DeviceTensor.cuh" 5 | 6 | namespace facebook { 7 | namespace deeplearning { 8 | namespace torch { 9 | namespace detail { 10 | 11 | void runTemporalConvolutionTBC_updateOutput( 12 | THCState* state, 13 | const cuda::DeviceTensor& input, 14 | const cuda::DeviceTensor& output, 15 | const cuda::DeviceTensor& weight, 16 | const cuda::DeviceTensor& bias); 17 | 18 | void runTemporalConvolutionTBC_updateGradInput( 19 | THCState* state, 20 | const cuda::DeviceTensor& dInput, 21 | const cuda::DeviceTensor& dOutput, 22 | const cuda::DeviceTensor& weight); 23 | 24 | void runTemporalConvolutionTBC_accGradParameters( 25 | THCState* state, 26 | const cuda::DeviceTensor& input, 27 | const cuda::DeviceTensor& dOutput, 28 | const cuda::DeviceTensor& dWeight, 29 | const cuda::DeviceTensor& dBias, 30 | float scale); 31 | } 32 | } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/TemporalConvolutionTBCHost.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2016 Facebook 2 | 3 | #include "THC.h" 4 | #include "THCTensor.h" 5 | #include "cuda/DeviceTensor.cuh" 6 | #include "src/DeviceTensorUtils.h" 7 | #include "src/TemporalConvolutionTBC.cuh" 8 | #include "src/Utils.h" 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | using namespace std; 16 | using namespace facebook::cuda; 17 | 18 | namespace facebook { 19 | namespace deeplearning { 20 | namespace torch { 21 | 22 | namespace { 23 | 24 | inline THCudaTensor* 25 | getFieldCudaTensor(lua_State* L, int arg, const char* name) { 26 | return static_cast<THCudaTensor*>( 27 | luaT_getfieldcheckudata(L, arg, name, "torch.CudaTensor")); 28 | } 29 | inline THCudaTensor* getCudaTensor(lua_State* L, int arg) { 30 | return static_cast<THCudaTensor*>( 31 | luaT_checkudata(L, arg, "torch.CudaTensor")); 32 | } 33 | 34 | int updateOutput(lua_State* L) { 35 | THCState* state = getCutorchState(L); 36 | auto output = (THCudaTensor*)luaT_getfieldcheckudata( 37 | L, 1, "output", "torch.CudaTensor"); 38 | auto weight = (THCudaTensor*)luaT_getfieldcheckudata( 39 | L, 1, "weight", "torch.CudaTensor"); 40 | auto bias = 41 | (THCudaTensor*)luaT_getfieldcheckudata(L, 1, "bias", "torch.CudaTensor"); 42 | auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 43 | 44 | THAssert(THCudaTensor_checkGPU(state, 4, input, output, weight, bias)); 45 | 46 | auto inputDev = torchToDeviceTensor(state, input); 47 | auto outputDev = torchToDeviceTensor(state, output); 48 | auto weightDev = torchToDeviceTensor(state, weight); 49 | auto biasDev = torchToDeviceTensor(state, bias); 50 | 51 | detail::runTemporalConvolutionTBC_updateOutput( 52 | state, inputDev, outputDev, weightDev, biasDev); 53 | 54 | return 0; 55 | } 56 | 57 | int updateGradInput(lua_State* L) { 58 | THCState* state = getCutorchState(L); 59 | auto dInput = (THCudaTensor*)luaT_getfieldcheckudata( 60 | L, 1, "gradInput", "torch.CudaTensor"); 61 | auto weight = (THCudaTensor*)luaT_getfieldcheckudata( 62 | L, 1, "weight", "torch.CudaTensor"); 63 | auto dOutput = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 64 | 65 | THAssert(THCudaTensor_checkGPU(state, 3, dInput, dOutput, weight)); 66 | 67 | auto dInputDev = torchToDeviceTensor(state, dInput); 68 | auto dOutputDev = torchToDeviceTensor(state, dOutput); 69 | auto weightDev = torchToDeviceTensor(state, weight); 70 | 71 | detail::runTemporalConvolutionTBC_updateGradInput( 72 | state, dInputDev, dOutputDev, weightDev); 73 
| 74 | return 0; 75 | } 76 | 77 | int accGradParameters(lua_State* L) { 78 | THCState* state = getCutorchState(L); 79 | auto dWeight = (THCudaTensor*)luaT_getfieldcheckudata( 80 | L, 1, "gradWeight", "torch.CudaTensor"); 81 | auto dBias = (THCudaTensor*)luaT_getfieldcheckudata( 82 | L, 1, "gradBias", "torch.CudaTensor"); 83 | auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 84 | auto dOutput = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 85 | float scale = lua_tonumber(L, 4); 86 | 87 | THAssert(THCudaTensor_checkGPU(state, 4, input, dOutput, dWeight, dBias)); 88 | 89 | auto inputDev = torchToDeviceTensor(state, input); 90 | auto dOutputDev = torchToDeviceTensor(state, dOutput); 91 | auto dWeightDev = torchToDeviceTensor(state, dWeight); 92 | auto dBiasDev = torchToDeviceTensor(state, dBias); 93 | 94 | detail::runTemporalConvolutionTBC_accGradParameters( 95 | state, inputDev, dOutputDev, dWeightDev, dBiasDev, scale); 96 | 97 | return 0; 98 | } 99 | 100 | const luaL_Reg functions[] = { 101 | {"TemporalConvolutionTBC_updateOutput", updateOutput}, 102 | {"TemporalConvolutionTBC_updateGradInput", updateGradInput}, 103 | {"TemporalConvolutionTBC_accGradParameters", accGradParameters}, 104 | {nullptr, nullptr}}; 105 | 106 | } // namespace 107 | 108 | void initTemporalConvolutionTBCCuda(lua_State* L) { 109 | luaT_pushmetatable(L, "torch.CudaTensor"); 110 | luaT_registeratname(L, functions, "nn"); 111 | lua_pop(L, 1); 112 | } 113 | } 114 | } 115 | } // namespaces 116 | -------------------------------------------------------------------------------- /src/TemporalKMaxPooling.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | #include "cuda/TopKElements.cuh" 5 | #include "cuda/DeviceTensor.cuh" 6 | #include "cuda/util/CachedDeviceProperties.h" 7 | #include "THC.h" 8 | 9 | using namespace facebook::cuda; 10 | 11 | namespace facebook { namespace deeplearning { namespace torch { 12 | 13 | namespace { 14 | 15 | __device__ __forceinline__ 16 | int getUpdateOutputBatch(const DeviceTensor& input) { 17 | return blockIdx.y; 18 | } 19 | 20 | __device__ __forceinline__ 21 | int getUpdateOutputFeature(const DeviceTensor& input) { 22 | return blockIdx.x * blockDim.y + threadIdx.y; 23 | } 24 | 25 | // input: [batch][frame][embedding] 26 | // output: [batch][K frames s.t. 
(f)(embedding) is the highest][embedding] 27 | // ordered in original [frame] order 28 | 29 | __global__ void 30 | temporalKMaxPoolingUpdateOutput(DeviceTensor input, 31 | DeviceTensor indices, 32 | DeviceTensor output, 33 | int k) { 34 | const int batch = getUpdateOutputBatch(input); 35 | const int feature = getUpdateOutputFeature(input); 36 | 37 | if (feature >= input.getSize(2)) { 38 | return; 39 | } 40 | 41 | DeviceTensor input1d(&input[batch][0][feature], 42 | (const int[1]){ input.getSize(1) }, 43 | (const int[1]){ input.getSize(2) }); 44 | DeviceTensor output1d(&output[batch][0][feature], 45 | (const int[1]){ k }, 46 | (const int[1]){ output.getSize(2) }); 47 | DeviceTensor indices1d(&indices[batch][0][feature], 48 | (const int[1]){ k }, 49 | (const int[1]){ indices.getSize(2) }); 50 | 51 | warpFindTopKElementsIndexOrder(input1d, output1d, indices1d, k); 52 | } 53 | 54 | __device__ __forceinline__ 55 | int getUpdateGradInputBatch() { 56 | return blockIdx.x; 57 | } 58 | 59 | __device__ __forceinline__ 60 | int getUpdateGradInputOutputFrame() { 61 | return blockIdx.y; 62 | } 63 | 64 | __global__ void 65 | temporalKMaxPoolingUpdateGradInput(DeviceTensor gradOutput, 66 | DeviceTensor indices, 67 | DeviceTensor gradInput, 68 | int k) { 69 | const int batch = getUpdateGradInputBatch(); 70 | const int outputFrame = getUpdateGradInputOutputFrame(); 71 | 72 | for (int feature = threadIdx.x; 73 | feature < gradInput.getSize(2); 74 | feature += blockDim.x) { 75 | int index = (int) indices[batch][outputFrame][feature]; 76 | 77 | atomicAdd(&gradInput[batch][index][feature], 78 | gradOutput[batch][outputFrame][feature]); 79 | } 80 | } 81 | 82 | } 83 | 84 | void 85 | runTemporalKMaxPoolingUpdateOutput(cudaStream_t stream, 86 | const DeviceTensor& input, 87 | const DeviceTensor& indices, 88 | DeviceTensor& output, 89 | int k) { 90 | const cudaDeviceProp& deviceProperties = 91 | facebook::cuda::getCurrentDeviceProperties(); 92 | 93 | // We aim to run with 4 warps. 94 | const int numWarps = std::min(input.getSize(2), 4); 95 | 96 | dim3 block(deviceProperties.warpSize, numWarps); 97 | dim3 grid(cuda::ceil(input.getSize(2), numWarps), input.getSize(0)); 98 | 99 | temporalKMaxPoolingUpdateOutput<<>>( 100 | input, indices, output, k); 101 | } 102 | 103 | void 104 | runTemporalKMaxPoolingUpdateGradInput(cudaStream_t stream, 105 | const DeviceTensor& gradOutput, 106 | const DeviceTensor& indices, 107 | DeviceTensor& gradInput, 108 | int k) { 109 | const cudaDeviceProp& deviceProperties = 110 | facebook::cuda::getCurrentDeviceProperties(); 111 | 112 | // We aim to run with 4 warps. 113 | const int numThreads = 114 | std::min(gradOutput.getSize(2), deviceProperties.warpSize * 4); 115 | 116 | dim3 block(numThreads); 117 | dim3 grid(gradOutput.getSize(0), 118 | gradOutput.getSize(1)); 119 | 120 | temporalKMaxPoolingUpdateGradInput<<>>( 121 | gradOutput, indices, gradInput, k); 122 | } 123 | 124 | } } } 125 | -------------------------------------------------------------------------------- /src/TemporalKMaxPooling.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
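Reference semantics for one (batch, feature) column of the k-max pooling kernels above, as a CPU sketch (assumes k <= frames):

```cpp
#include <algorithm>
#include <vector>

// Keep the k largest frames of one column, emitted in their original temporal
// order, and record the 0-based frame indices for the backward pass.
void kMaxPoolColumnReference(const float* in, int frames,
                             float* out, float* indices, int k) {
  std::vector<int> order(frames);
  for (int f = 0; f < frames; ++f) {
    order[f] = f;
  }
  // Select the k frames holding the largest values...
  std::partial_sort(order.begin(), order.begin() + k, order.end(),
                    [&](int a, int b) { return in[a] > in[b]; });
  // ...then restore temporal order, as warpFindTopKElementsIndexOrder does.
  std::sort(order.begin(), order.begin() + k);
  for (int j = 0; j < k; ++j) {
    out[j] = in[order[j]];
    indices[j] = (float)order[j];
  }
}
```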
2 | 3 | #include "cuda/DeviceTensor.cuh" 4 | 5 | namespace facebook { namespace deeplearning { namespace torch { 6 | 7 | void 8 | runTemporalKMaxPoolingUpdateOutput( 9 | cudaStream_t stream, 10 | const cuda::DeviceTensor& input, 11 | const cuda::DeviceTensor& indices, 12 | cuda::DeviceTensor& output, 13 | int k); 14 | 15 | void 16 | runTemporalKMaxPoolingUpdateGradInput( 17 | cudaStream_t stream, 18 | const cuda::DeviceTensor& gradOutput, 19 | const cuda::DeviceTensor& indices, 20 | cuda::DeviceTensor& gradInput, 21 | int k); 22 | 23 | } } } 24 | -------------------------------------------------------------------------------- /src/Tensor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "thpp/Storage.h" 4 | #include "thpp/Tensor.h" 5 | 6 | namespace facebook { namespace deeplearning { namespace torch { 7 | 8 | using namespace thpp; 9 | 10 | }}} // namespaces 11 | -------------------------------------------------------------------------------- /src/Utils.cpp: -------------------------------------------------------------------------------- 1 | #include "Utils.h" 2 | 3 | namespace facebook { namespace deeplearning { namespace torch { 4 | 5 | THCState* getCutorchState(lua_State* L) { 6 | // Unfortunately cutorch lua headers aren't exported, so we have to 7 | // copy this. This is a copy from cunn. 8 | lua_getglobal(L, "cutorch"); 9 | lua_getfield(L, -1, "getState"); 10 | lua_call(L, 0, 1); 11 | THCState *state = (THCState*) lua_touserdata(L, -1); 12 | lua_pop(L, 2); 13 | return state; 14 | } 15 | 16 | } } } 17 | -------------------------------------------------------------------------------- /src/Utils.h: -------------------------------------------------------------------------------- 1 | #ifndef FBCUNN_UTILS_H 2 | #define FBCUNN_UTILS_H 3 | 4 | #include 5 | #include "THCGeneral.h" 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | 9 | THCState* getCutorchState(lua_State* L); 10 | 11 | } } } 12 | 13 | #endif 14 | -------------------------------------------------------------------------------- /src/WeightedLookupTable.cu: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Facebook 3 | */ 4 | 5 | #include "cuda/CudaUtils.cuh" 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "cuda/WarpReductions.cuh" 8 | 9 | using namespace facebook::cuda; 10 | 11 | namespace facebook { namespace deeplearning { namespace torch { 12 | namespace detail { 13 | 14 | namespace { 15 | 16 | __global__ void scaleByWeight(DeviceTensor output, 17 | DeviceTensor input, 18 | DeviceTensor weights) { 19 | // Values computed per thread 20 | const int VT = 4; 21 | 22 | // Each block computes a 4x128 section of the output, with each 23 | // warp handling a 1x128 section. 
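// (blockDim is (32, 4): 32 lanes times VT = 4 columns per thread cover 128
// columns, and the four threadIdx.y warps supply the four rows)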
24 | 25 | int rowIdx = blockIdx.x * blockDim.y + threadIdx.y; 26 | if (rowIdx < weights.getSize(0)) { 27 | float weight = weights[rowIdx]; 28 | 29 | #pragma unroll 30 | for (int i = 0; i < VT; i++) { 31 | int colIdx = blockDim.x * (VT * blockIdx.y + i) + threadIdx.x; 32 | if (colIdx < input.getSize(1)) { 33 | output[rowIdx][colIdx] = input[rowIdx][colIdx] * weight; 34 | } 35 | } 36 | } 37 | } 38 | 39 | } 40 | 41 | void launchWeightedLookupTableScaleByWeightKernel(cudaStream_t stream, 42 | DeviceTensor& output, 43 | DeviceTensor& input, 44 | DeviceTensor& weight) { 45 | dim3 grid(cuda::ceil(output.getSize(0), 4), cuda::ceil(output.getSize(1), 128)); 46 | dim3 block(32, 4); 47 | 48 | scaleByWeight<<>>(output, input, weight); 49 | } 50 | 51 | }}}} 52 | -------------------------------------------------------------------------------- /src/WeightedLookupTableHost.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2015 Facebook 3 | */ 4 | 5 | #include "cuda/DeviceTensor.cuh" 6 | #include "src/Utils.h" 7 | #include "src/DeviceTensorUtils.h" 8 | #include "THC.h" 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | using namespace facebook::cuda; 15 | 16 | namespace facebook { namespace deeplearning { namespace torch { 17 | 18 | namespace detail { 19 | void launchWeightedLookupTableScaleByWeightKernel( 20 | cudaStream_t stream, 21 | DeviceTensor& output, 22 | DeviceTensor& input, 23 | DeviceTensor& weight); 24 | } 25 | 26 | namespace { 27 | 28 | int scaleByWeight(lua_State* L) { 29 | THCState* state = getCutorchState(L); 30 | auto output = (THCudaTensor*)luaT_checkudata(L, 1, "torch.CudaTensor"); 31 | const auto input = (THCudaTensor*)luaT_checkudata(L, 2, "torch.CudaTensor"); 32 | const auto weight = (THCudaTensor*)luaT_checkudata(L, 3, "torch.CudaTensor"); 33 | 34 | DeviceTensor cudaOutput = torchToDeviceTensor(state, output); 35 | DeviceTensor cudaInput = torchToDeviceTensor(state, input); 36 | DeviceTensor cudaWeight = torchToDeviceTensor(state, weight); 37 | 38 | detail::launchWeightedLookupTableScaleByWeightKernel( 39 | THCState_getCurrentStream(state), 40 | cudaOutput, cudaInput, cudaWeight); 41 | 42 | return 0; 43 | } 44 | 45 | const luaL_Reg functions[] = { 46 | {"WeightedLookupTable_scaleByWeight", scaleByWeight}, 47 | {nullptr, nullptr}, 48 | }; 49 | 50 | } // namespace 51 | 52 | void initWeightedLookupTableCuda(lua_State* L) { 53 | luaT_pushmetatable(L, "torch.CudaTensor"); 54 | luaT_registeratname(L, functions, "nn"); 55 | lua_pop(L, 1); 56 | } 57 | 58 | }}} // namespaces 59 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_AccGradParameters.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
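Reference semantics for the scaleByWeight kernel above, as a CPU sketch: every output row is the input row scaled by that row's weight.

```cpp
void scaleByWeightReference(float* output, const float* input,
                            const float* weights, int rows, int cols) {
  // The kernel tiles this loop nest as one 4-row by 128-column tile per CUDA
  // block; the arithmetic per element is identical.
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) {
      output[r * cols + c] = input[r * cols + c] * weights[r];
    }
  }
}
```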
2 | 3 | #pragma once 4 | 5 | struct THCudaTensor; 6 | struct THCState; 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | void CuFFTConvolution_ReferenceAccGradParameters(THCState* state, 11 | THCudaTensor* inputTH, 12 | THCudaTensor* kernelsTH, 13 | THCudaTensor* outputTH, 14 | THCudaTensor* gradBiasTH, 15 | float scale, 16 | THCudaTensor* inputComplexTH, 17 | THCudaTensor* kernelsComplexTH, 18 | THCudaTensor* outputComplexTH); 19 | 20 | void CuFFTConvolution_AccGradParameters(THCState* state, 21 | THCudaTensor* inputTH, 22 | THCudaTensor* kernelsTH, 23 | THCudaTensor* outputTH, 24 | THCudaTensor* gradBiasTH, 25 | float scale, 26 | THCudaTensor* inputComplexTH, 27 | THCudaTensor* kernelsComplexTH, 28 | THCudaTensor* outputComplexTH, 29 | THCudaTensor* inputComplexTTH, 30 | THCudaTensor* kernelsComplexTTH, 31 | THCudaTensor* outputComplexTTH); 32 | 33 | class CuFFTConvolution; 34 | 35 | 36 | // This version can be preconfigured with cublasHandle, cufftHandle and 37 | // cudaStreams. Use this one for performance and to reuse resources. 38 | void CuFFTConvolution_AccGradParameters(THCState* state, 39 | CuFFTConvolution* conv, 40 | THCudaTensor* gradOutputTH, 41 | THCudaTensor* gradBiasTH, 42 | float scale); 43 | } } } // namespace 44 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateGradInput.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "src/fft/CuFFTConvolution_UpdateGradInput.cuh" 4 | 5 | #include "cuda/CudaUtils.cuh" 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "src/DeviceTensorUtils.h" 8 | #include "THCTensor.h" 9 | #include "src/CuBLASWrapper.h" 10 | #include "src/fft/CuFFTWrapper.cuh" 11 | #include "src/fft/CuFFTConvolution.cuh" 12 | #include "src/fft/Utils.cuh" 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | using namespace facebook::cuda; 20 | 21 | namespace facebook { namespace deeplearning { namespace torch { 22 | 23 | // Assumes complex is float[2] 24 | __global__ void referenceUpdateGradInput(DeviceTensor<float, 5> inputComplex, 25 | DeviceTensor<float, 5> weightComplex, 26 | DeviceTensor<float, 5> outputComplex) 27 | { 28 | // The input was originally real, so we have circular Hermitian symmetry: 29 | // X[k] = X*[-k mod N].
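// In the frequency domain, the gradient w.r.t. the input is a pointwise
// complex multiply-accumulate: for each (batch, inputPlane, row, col),
// gradInput = sum over filters of weight * gradOutput, which is what the
// cuCfmaf loop below computes (reference implementation, single thread).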
30 | const int Batches = inputComplex.getSize(0); 31 | const int Weight = weightComplex.getSize(0); 32 | const int InputRows = inputComplex.getSize(2); 33 | const int InputCols = inputComplex.getSize(3); 34 | for (int batch = 0; batch < Batches; ++batch) { 35 | for (int filter = 0; filter < Weight; ++filter) { 36 | for (int inputRow = 0; inputRow < InputRows; ++inputRow) { 37 | for (int inputCol = 0; inputCol < InputCols; ++inputCol) { 38 | for (int inputPlane = 0; inputPlane < inputComplex.getSize(1); 39 | ++inputPlane) { 40 | cuFloatComplex* inp = inputComplex[batch][inputPlane] 41 | [inputRow][inputCol].dataAs<cuFloatComplex>(); 42 | if (filter == 0) { 43 | inp->x = 0.0f; 44 | inp->y = 0.0f; 45 | } 46 | 47 | cuFloatComplex weight = weightComplex[filter][inputPlane] 48 | [inputRow][inputCol].ldgAs<cuFloatComplex>(); 49 | 50 | cuFloatComplex output = outputComplex[batch][filter][inputRow] 51 | [inputCol].ldgAs<cuFloatComplex>(); 52 | 53 | *inp = cuCfmaf(weight, output, *inp); 54 | } 55 | } 56 | } 57 | } 58 | } 59 | } 60 | 61 | void CuFFTConvolution_ReferenceUpdateGradInput(THCState* state, 62 | THCudaTensor* inputTH, 63 | THCudaTensor* weightTH, 64 | THCudaTensor* outputTH, 65 | THCudaTensor* inputComplexTH, 66 | THCudaTensor* weightComplexTH, 67 | THCudaTensor* outputComplexTH) { 68 | DeviceTensor<float, 4> weight = 69 | torchToDeviceTensor<float, 4>(state, weightTH); 70 | DeviceTensor<float, 4> input = 71 | torchToDeviceTensor<float, 4>(state, inputTH); 72 | DeviceTensor<float, 4> output = 73 | torchToDeviceTensor<float, 4>(state, outputTH); 74 | 75 | DeviceTensor<float, 5> inputComplex = 76 | torchToDeviceTensor<float, 5>(state, inputComplexTH); 77 | DeviceTensor<float, 5> outputComplex = 78 | torchToDeviceTensor<float, 5>(state, outputComplexTH); 79 | DeviceTensor<float, 5> weightComplex = 80 | torchToDeviceTensor<float, 5>(state, weightComplexTH); 81 | 82 | fft2d<2>(weight, weightComplex); 83 | fft2d<2>(output, outputComplex); 84 | 85 | dim3 grid(1); 86 | dim3 block(1); 87 | referenceUpdateGradInput<<<grid, block>>>( 88 | inputComplex, weightComplex, outputComplex); 89 | 90 | fft2d<2>(input, inputComplex, FFTParameters().inverse()); 91 | } 92 | 93 | void CuFFTConvolution_UpdateGradInput(THCState* state, 94 | THCudaTensor* inputTH, 95 | THCudaTensor* weightTH, 96 | THCudaTensor* outputTH, 97 | THCudaTensor* inputComplexTH, 98 | THCudaTensor* weightComplexTH, 99 | THCudaTensor* outputComplexTH, 100 | THCudaTensor* inputComplexTTH, 101 | THCudaTensor* weightComplexTTH, 102 | THCudaTensor* outputComplexTTH) { 103 | CuFFTConvolution conv((ConvolutionPass(ConvolutionPass::kUpdateGradInput))); 104 | conv.withInputAndBuffers(state, inputTH, inputComplexTH, inputComplexTTH) 105 | .withFiltersAndBuffers(state, weightTH, weightComplexTH, weightComplexTTH) 106 | .withOutputAndBuffers(state, outputTH, outputComplexTH, outputComplexTTH) 107 | .run(); 108 | } 109 | 110 | void CuFFTConvolution_UpdateGradInput(CuFFTConvolution* conv) { 111 | conv->run(); 112 | } 113 | 114 | } } } // namespace 115 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateGradInput.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved.
2 | 3 | #pragma once 4 | 5 | struct THCudaTensor; 6 | struct THCState; 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | void CuFFTConvolution_ReferenceUpdateGradInput(THCState* state, 11 | THCudaTensor* inputTH, 12 | THCudaTensor* kernelsTH, 13 | THCudaTensor* outputTH, 14 | THCudaTensor* inputComplexTH, 15 | THCudaTensor* kernelsComplexTH, 16 | THCudaTensor* outputComplexTH); 17 | 18 | // CuFFTConvolution calls require 2 sets of buffers for each 19 | // input / kernels / output tensor. 20 | // - The first set is used to perform FFTs. 21 | // - The second set is used to hold the transpose of the FFTs for the 22 | // subsequent gemm calls. 23 | // The first set must always be supplied; the second will be constructed if 24 | // passed NULL. 25 | void CuFFTConvolution_UpdateGradInput(THCState* state, 26 | THCudaTensor* inputTH, 27 | THCudaTensor* kernelsTH, 28 | THCudaTensor* outputTH, 29 | THCudaTensor* inputComplexTH, 30 | THCudaTensor* kernelsComplexTH, 31 | THCudaTensor* outputComplexTH, 32 | THCudaTensor* inputComplexTTH, 33 | THCudaTensor* kernelsComplexTTH, 34 | THCudaTensor* outputComplexTTH); 35 | 36 | class CuFFTConvolution; 37 | 38 | // This version can be preconfigured with cublasHandle, cufftHandle and 39 | // cudaStreams. Use this one for performance and to reuse resources. 40 | void CuFFTConvolution_UpdateGradInput(CuFFTConvolution* conv); 41 | 42 | } } } // namespace 43 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateOutput.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "src/fft/CuFFTConvolution_UpdateOutput.cuh" 4 | 5 | #include "cuda/CudaUtils.cuh" 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "src/DeviceTensorUtils.h" 8 | #include "THCTensor.h" 9 | #include "src/ConvolutionBias.cuh" 10 | #include "src/CuBLASWrapper.h" 11 | #include "src/fft/CuFFTWrapper.cuh" 12 | #include "src/fft/CuFFTConvolution.cuh" 13 | #include "src/fft/Utils.cuh" 14 | 15 | #include 16 | #include 17 | #include 18 | #include 19 | 20 | using namespace facebook::cuda; 21 | 22 | namespace facebook { namespace deeplearning { namespace torch { 23 | 24 | // Assumes complex is float[2] 25 | __global__ void referenceUpdateOutput(DeviceTensor<float, 5> inputComplex, 26 | DeviceTensor<float, 5> filtersComplex, 27 | DeviceTensor<float, 5> outputComplex) 28 | { 29 | // The input was originally real, so we have circular Hermitian symmetry: 30 | // X[k] = X*[-k mod N].
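// Pointwise in the frequency domain: for each (batch, filter, row, col),
// output = sum over input planes of input * conj(filter). The conjugation
// (cuConjf below) makes this a cross-correlation, which is what the
// convolution layers actually compute.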
31 | const int Batches = inputComplex.getSize(0); 32 | const int Filters = filtersComplex.getSize(0); 33 | const int OutputRows = outputComplex.getSize(2); 34 | const int OutputCols = outputComplex.getSize(3); 35 | for (int batch = 0; batch < Batches; ++batch) { 36 | for (int filter = 0; filter < Filters; ++filter) { 37 | for (int outputRow = 0; outputRow < OutputRows; ++outputRow) { 38 | for (int outputCol = 0; outputCol < OutputCols; ++outputCol) { 39 | cuFloatComplex* out = outputComplex[batch][filter] 40 | [outputRow][outputCol].dataAs<cuFloatComplex>(); 41 | out->x = 0.0f; 42 | out->y = 0.0f; 43 | for (int inputPlane = 0; inputPlane < inputComplex.getSize(1); 44 | ++inputPlane) { 45 | cuFloatComplex input = 46 | inputComplex[batch][inputPlane] 47 | [outputRow][outputCol].ldgAs<cuFloatComplex>(); 48 | 49 | cuFloatComplex filters = 50 | cuConjf(filtersComplex[filter][inputPlane] 51 | [outputRow][outputCol].ldgAs<cuFloatComplex>()); 52 | 53 | *out = cuCfmaf(input, filters, *out); 54 | } 55 | } 56 | } 57 | } 58 | } 59 | } 60 | 61 | void CuFFTConvolution_ReferenceUpdateOutput(THCState* state, 62 | THCudaTensor* inputTH, 63 | THCudaTensor* kernelsTH, 64 | THCudaTensor* outputTH, 65 | THCudaTensor* biasTH, 66 | THCudaTensor* inputComplexTH, 67 | THCudaTensor* kernelsComplexTH, 68 | THCudaTensor* outputComplexTH) { 69 | DeviceTensor<float, 4> filters = 70 | torchToDeviceTensor<float, 4>(state, kernelsTH); 71 | DeviceTensor<float, 4> input = 72 | torchToDeviceTensor<float, 4>(state, inputTH); 73 | DeviceTensor<float, 4> output = 74 | torchToDeviceTensor<float, 4>(state, outputTH); 75 | 76 | DeviceTensor<float, 5> inputComplex = 77 | torchToDeviceTensor<float, 5>(state, inputComplexTH); 78 | DeviceTensor<float, 5> outputComplex = 79 | torchToDeviceTensor<float, 5>(state, outputComplexTH); 80 | DeviceTensor<float, 5> filtersComplex = 81 | torchToDeviceTensor<float, 5>(state, kernelsComplexTH); 82 | 83 | fft2d<2>(input, inputComplex); 84 | fft2d<2>(filters, filtersComplex); 85 | 86 | dim3 grid(1); 87 | dim3 block(1); 88 | referenceUpdateOutput<<<grid, block>>>( 89 | inputComplex, filtersComplex, outputComplex); 90 | 91 | fft2d<2>(output, outputComplex, FFTParameters().inverse()); 92 | 93 | bias::updateOutputBias(state, outputTH, biasTH); 94 | } 95 | 96 | void CuFFTConvolution_UpdateOutput(THCState* state, 97 | THCudaTensor* inputTH, 98 | THCudaTensor* kernelsTH, 99 | THCudaTensor* outputTH, 100 | THCudaTensor* biasTH, 101 | THCudaTensor* inputComplexTH, 102 | THCudaTensor* kernelsComplexTH, 103 | THCudaTensor* outputComplexTH, 104 | THCudaTensor* inputComplexTTH, 105 | THCudaTensor* kernelsComplexTTH, 106 | THCudaTensor* outputComplexTTH) { 107 | CuFFTConvolution conv((ConvolutionPass(ConvolutionPass::kUpdateOutput))); 108 | conv.withInputAndBuffers(state, 109 | inputTH, inputComplexTH, inputComplexTTH) 110 | .withFiltersAndBuffers(state, 111 | kernelsTH, kernelsComplexTH, kernelsComplexTTH) 112 | .withOutputAndBuffers(state, 113 | outputTH, outputComplexTH, outputComplexTTH) 114 | .run(); 115 | 116 | bias::updateOutputBias(state, outputTH, biasTH); 117 | } 118 | 119 | void CuFFTConvolution_UpdateOutput(THCState* state, 120 | CuFFTConvolution* conv, 121 | THCudaTensor* outputTH, 122 | THCudaTensor* biasTH) { 123 | conv->run(); 124 | 125 | bias::updateOutputBias(state, outputTH, biasTH); 126 | } 127 | 128 | } } } // namespace 129 | -------------------------------------------------------------------------------- /src/fft/CuFFTConvolution_UpdateOutput.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved.
2 | 3 | #pragma once 4 | 5 | struct THCudaTensor; 6 | struct THCState; 7 | 8 | namespace facebook { namespace deeplearning { namespace torch { 9 | 10 | void CuFFTConvolution_ReferenceUpdateOutput(THCState* state, 11 | THCudaTensor* inputTH, 12 | THCudaTensor* kernelsTH, 13 | THCudaTensor* outputTH, 14 | THCudaTensor* biasTH, 15 | THCudaTensor* inputComplexTH, 16 | THCudaTensor* kernelsComplexTH, 17 | THCudaTensor* outputComplexTH); 18 | 19 | // CuFFTConvolution calls require 2 sets of buffers for each 20 | // input / kernels / output tensor. 21 | // - The first set is used to perform FFTs 22 | // - The second set is used to hold the transpose of the FFTs for the 23 | // subsequent gemm calls. 24 | // The first set must always be supplied, the second will be constructed if 25 | // passed NULL. 26 | void CuFFTConvolution_UpdateOutput(THCState* state, 27 | THCudaTensor* inputTH, 28 | THCudaTensor* kernelsTH, 29 | THCudaTensor* outputTH, 30 | THCudaTensor* biasTH, 31 | THCudaTensor* inputComplexTH, 32 | THCudaTensor* kernelsComplexTH, 33 | THCudaTensor* outputComplexTH, 34 | THCudaTensor* inputComplexTTH, 35 | THCudaTensor* kernelsComplexTTH, 36 | THCudaTensor* outputComplexTTH); 37 | 38 | class CuFFTConvolution; 39 | 40 | // This version can be preconfigured with cublasHandle, cufftHandle and 41 | // cudaStreams. Use this one for performance and reuse resources. 42 | void CuFFTConvolution_UpdateOutput(THCState* state, 43 | CuFFTConvolution* conv, 44 | THCudaTensor* outputTH, 45 | THCudaTensor* biasTH); 46 | 47 | } } } // namespace 48 | -------------------------------------------------------------------------------- /src/fft/CuFFTWrapper.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "cuda/fbfft/FBFFT.cuh" 6 | #include "src/fft/Utils.cuh" 7 | 8 | #include 9 | 10 | namespace facebook { namespace deeplearning { namespace torch { 11 | 12 | // Can add layout stuff later if needed 13 | class FFTParameters { 14 | public: 15 | // Default is forward, normalized FFT. 16 | // Normalization occurs only in inverse FFT (by 1 / (M.N)) since CuFFT does 17 | // unnormalized FFTs by default 18 | FFTParameters() : 19 | version(cufft), direction_(true), normalize_(true), padLeft_(0), padUp_(0) 20 | {} 21 | 22 | operator facebook::cuda::fbfft::FBFFTParameters() const { 23 | facebook::cuda::fbfft::FBFFTParameters res; 24 | res = res.normalize(normalize_).withPadLeft(padLeft_).withPadUp(padUp_); 25 | return (direction_) ? 
res.forward() : res.inverse(); 26 | } 27 | 28 | FFTParameters& withCufft() { 29 | version = cufft; 30 | return *this; 31 | } 32 | 33 | FFTParameters& withFbfft() { 34 | version = fbfft; 35 | return *this; 36 | } 37 | 38 | FFTParameters& forward() { 39 | direction_ = true; 40 | return *this; 41 | } 42 | 43 | FFTParameters& inverse() { 44 | direction_ = false; 45 | return *this; 46 | } 47 | 48 | FFTParameters& normalize(bool n) { 49 | normalize_ = n; 50 | return *this; 51 | } 52 | 53 | FFTParameters& withPadLeft(int p) { 54 | padLeft_ = p; 55 | return *this; 56 | } 57 | 58 | FFTParameters& withPadUp(int p) { 59 | padUp_ = p; 60 | return *this; 61 | } 62 | 63 | bool forwardFFT() const { return direction_; } 64 | bool inverseFFT() const { return !direction_; } 65 | bool normalizeFFT() const { return normalize_; } 66 | bool cuFFT() const { return version == cufft; } 67 | bool fbFFT() const { return version == fbfft; } 68 | int padLeft() const { return padLeft_; } 69 | int padUp() const { return padUp_; } 70 | 71 | template 72 | std::vector makeComplexTensorSizes( 73 | long batch, long plane, long y, long x) { 74 | // Until fbfft supports rectangular ffts just assert it does not 75 | assert(cuFFT() || y == x); 76 | std::vector result(4); 77 | result[0] = batch; 78 | result[1] = plane; 79 | result[2] = (fbFFT() && Hermitian) ? numHermitian(y) : y; 80 | result[3] = (cuFFT() && Hermitian) ? numHermitian(x) : x; 81 | return result; 82 | } 83 | 84 | // Replaces cufft plans in the case of fbfft, only needed for sizes > 32. 85 | // For <= 32 we do everything in place. 86 | std::vector makeTmpBufferSizes( 87 | long batch, long plane, long y, long x) { 88 | assert(fbFFT()); 89 | // Until fbfft supports rectangular ffts just assert it does not 90 | assert(y == x); 91 | if (y <= 32) { 92 | std::vector result; 93 | return result; 94 | } 95 | std::vector result(4); 96 | result[0] = batch; 97 | result[1] = plane; 98 | if (forwardFFT()) { 99 | result[2] = numHermitian(y); 100 | } else { 101 | result[2] = y; 102 | } 103 | result[3] = x; 104 | return result; 105 | } 106 | 107 | enum FFTVersion { 108 | cufft = 0, 109 | fbfft = 1 110 | } version; 111 | 112 | private: 113 | bool direction_; 114 | bool normalize_; 115 | int padLeft_; 116 | int padUp_; 117 | }; 118 | 119 | template 120 | cufftHandle 121 | makeCuFFTPlan(const cuda::DeviceTensor& real, 122 | const cuda::DeviceTensor& complex, 123 | FFTParameters params = FFTParameters()); 124 | 125 | template 126 | void fft1d(cuda::DeviceTensor& real, 127 | cuda::DeviceTensor& complex, 128 | FFTParameters params = FFTParameters(), 129 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 130 | // encode lack of a plan 131 | cudaStream_t stream = NULL); 132 | 133 | template 134 | void fft2d(cuda::DeviceTensor& real, 135 | cuda::DeviceTensor& complex, 136 | FFTParameters params = FFTParameters(), 137 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 138 | // encode lack of a plan 139 | cudaStream_t stream = NULL); 140 | 141 | template 142 | void fft3d(cuda::DeviceTensor& real, 143 | cuda::DeviceTensor& complex, 144 | FFTParameters params = FFTParameters(), 145 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 146 | // encode lack of a plan 147 | cudaStream_t stream = NULL); 148 | 149 | template 150 | void fft(cuda::DeviceTensor& real, 151 | cuda::DeviceTensor& complex, 152 | FFTParameters params = FFTParameters(), 153 | cufftHandle* plan = NULL, // cufftHandle is unsigned int, need to 154 | // encode lack of a plan 155 | 
cudaStream_t stream = NULL); 156 | } } } // namespace 157 | -------------------------------------------------------------------------------- /src/fft/FBFFTDevice.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #include "cuda/fbfft/FBFFT.cuh" 4 | #include "cuda/fbfft/FBFFTCommon.cuh" 5 | 6 | namespace facebook { namespace cuda { namespace fbfft { 7 | 8 | template 9 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft1D<1>( 10 | DeviceTensor& real, 11 | DeviceTensor& complex, 12 | const int padL, 13 | cudaStream_t s); 14 | 15 | template 16 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2D<1>( 17 | DeviceTensor& real, 18 | DeviceTensor& complex, 19 | const int padL, 20 | const int padU, 21 | cudaStream_t s); 22 | 23 | template 24 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2D<1>( 25 | DeviceTensor& complexSrc, 26 | DeviceTensor& complexDst, 27 | cudaStream_t s); 28 | 29 | template 30 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft1D<1>( 31 | DeviceTensor& real, 32 | DeviceTensor& complex, 33 | const int padL, 34 | cudaStream_t s); 35 | 36 | template 37 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft2D<1>( 38 | DeviceTensor& srcComplexAsFloat, 39 | DeviceTensor& dstComplexAsFloat, 40 | cudaStream_t s); 41 | 42 | template 43 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbifft2D<1>( 44 | DeviceTensor& srcComplex, 45 | DeviceTensor& realDst, 46 | const int padL, 47 | const int padU, 48 | cudaStream_t s); 49 | 50 | }}} 51 | -------------------------------------------------------------------------------- /src/fft/FBFFTHost.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cuda/DeviceTensor.cuh" 5 | 6 | namespace facebook { namespace deeplearning { namespace torch { 7 | 8 | template 9 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft1dHost( 10 | facebook::cuda::DeviceTensor& real, 11 | facebook::cuda::DeviceTensor& complexAsFloat, 12 | facebook::cuda::fbfft::FBFFTParameters params = 13 | facebook::cuda::fbfft::FBFFTParameters(), 14 | cudaStream_t s = 0); 15 | 16 | // If calling a 2D-fft of size > 32 we need a buffer to avoid a race condition 17 | // between reads and writes to device memory on the corner turn. 18 | template 19 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft2dHost( 20 | facebook::cuda::DeviceTensor& real, 21 | facebook::cuda::DeviceTensor& complexAsFloat, 22 | facebook::cuda::DeviceTensor* bufferAsFloat, 23 | facebook::cuda::fbfft::FBFFTParameters params = 24 | facebook::cuda::fbfft::FBFFTParameters(), 25 | cudaStream_t s = 0); 26 | 27 | // If calling a 2D-fft of size > 32 we need a buffer to avoid a race condition 28 | // between reads and writes to device memory on the corner turn. 29 | template 30 | facebook::cuda::fbfft::FBFFTParameters::ErrorCode fbfft( 31 | THCState* state, 32 | THCudaTensor* real, 33 | THCudaTensor* complex, 34 | THCudaTensor* buffer = nullptr, 35 | facebook::cuda::fbfft::FBFFTParameters params = 36 | facebook::cuda::fbfft::FBFFTParameters()); 37 | 38 | } } } // namespace 39 | -------------------------------------------------------------------------------- /src/fft/FFTIteratedConvolution.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
2 | 3 | #include "src/DeviceTensorUtils.h" 4 | #include "THCTensor.h" 5 | 6 | #include "cuda/DeviceTensor.cuh" 7 | #include "cuda/fbfft/FFTIteratedConvolution.cuh" 8 | 9 | #include 10 | #include 11 | 12 | using namespace facebook::cuda; 13 | 14 | namespace facebook { namespace deeplearning { namespace torch { 15 | 16 | typedef struct { 17 | THCudaTensor* tensor; 18 | int padL; 19 | int padU; 20 | } TiledDeviceTensorFFI; 21 | 22 | #define LOG_TARGET LOG(INFO) 23 | 24 | #define INSTANTIATE_ITERATED_CONVOLUTION(DIM, FFT_SIZE) \ 25 | if (THCudaTensor_nDimension(state, weight) == DIM && \ 26 | fftSize == FFT_SIZE) { \ 27 | thrust::host_vector > \ 28 | tiledInputs; \ 29 | thrust::host_vector > \ 30 | tiledOutputs; \ 31 | for (int i = 0; i < numTiles; ++i) { \ 32 | DeviceTensor ti( \ 33 | torchToDeviceTensor(state, input[i].tensor)); \ 34 | fbfft::detail::TiledDeviceTensor inp( \ 35 | ti, \ 36 | input[i].padL, \ 37 | input[i].padU); \ 38 | /* TODO: emplace_back */ \ 39 | tiledInputs.push_back(inp); \ 40 | \ 41 | DeviceTensor to( \ 42 | torchToDeviceTensor(state, output[i].tensor)); \ 43 | fbfft::detail::TiledDeviceTensor out( \ 44 | to, \ 45 | output[i].padL, \ 46 | output[i].padU); \ 47 | /* TODO: emplace_back */ \ 48 | tiledOutputs.push_back(out); \ 49 | } \ 50 | \ 51 | thrust::device_vector > \ 52 | ins = tiledInputs; \ 53 | thrust::device_vector > \ 54 | outs = tiledOutputs; \ 55 | \ 56 | DeviceTensor wei( \ 57 | torchToDeviceTensor(state, weight)); \ 58 | bool res = \ 59 | fbfft::detail::FFTIteratedConvolution( \ 60 | thrust::raw_pointer_cast(&ins[0]), \ 61 | thrust::raw_pointer_cast(&outs[0]), \ 62 | wei, \ 63 | pass, \ 64 | scale, \ 65 | batchSize, \ 66 | ins.size(), \ 67 | THCState_getCurrentStream(state)); \ 68 | if (!res) { THError("Error in iterated convolution"); } \ 69 | } 70 | 71 | extern "C" void convolveIteratedFFI(THCState* state, 72 | TiledDeviceTensorFFI* input, 73 | THCudaTensor* weight, 74 | TiledDeviceTensorFFI* output, 75 | int numTiles, 76 | int fftSize, 77 | fbfft::detail::FFTConvolutionPassFFI pass, 78 | float scale) { 79 | // TODO: accGrad all on same stream, updateOutput / updateGradInput async 80 | int batchSize = THCudaTensor_size(state, input[0].tensor, 0); 81 | 82 | //////////////////////////////////////////////////////// 83 | // FFT of size 32 84 | //////////////////////////////////////////////////////// 85 | INSTANTIATE_ITERATED_CONVOLUTION(4, 32); 86 | 87 | //////////////////////////////////////////////////////// 88 | // FFT of size 16 89 | //////////////////////////////////////////////////////// 90 | INSTANTIATE_ITERATED_CONVOLUTION(4, 16); 91 | 92 | //////////////////////////////////////////////////////// 93 | // FFT of size 8 94 | //////////////////////////////////////////////////////// 95 | INSTANTIATE_ITERATED_CONVOLUTION(4, 8); 96 | } 97 | 98 | }}} 99 | -------------------------------------------------------------------------------- /src/fft/SpatialConvolutionCuFFT.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #pragma once 4 | 5 | #include "src/fft/CuFFTStrategy.h" 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { 8 | namespace detail { 9 | 10 | void updateOutputTH(THCState* state, 11 | const THParams& p, 12 | const ProblemSizes& originalSizes, 13 | const CuFFTStrategy& s); 14 | 15 | void updateGradInputTH(THCState* state, 16 | const THParams& p, 17 | const ProblemSizes& originalSizes, 18 | const CuFFTStrategy& s); 19 | 20 | void accGradParametersTH(THCState* 
state, 21 | const THParams& p, 22 | const ProblemSizes& originalSizes, 23 | const CuFFTStrategy& s); 24 | 25 | void cleanupBuffers(); 26 | 27 | }}}} 28 | -------------------------------------------------------------------------------- /src/fft/SpatialConvolutionCuFFTTuner.h: -------------------------------------------------------------------------------- 1 | // Copyright 2014 Facebook 2 | 3 | #pragma once 4 | 5 | #include "src/fft/CuFFTStrategy.h" 6 | #include 7 | 8 | struct THCState; 9 | 10 | namespace facebook { namespace deeplearning { namespace torch { 11 | 12 | struct SpatialConvolutionCuFFTTuner { 13 | static folly::Optional getBestPerformance(THCState* state, 14 | ProblemSizes pbs); 15 | }; 16 | 17 | }}} 18 | -------------------------------------------------------------------------------- /src/fft/Utils.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2014-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | namespace facebook { namespace deeplearning { namespace torch { 5 | 6 | // Depending on whether cuFFT is expected, use the Hermitian symmetry 7 | // properties that cufft exploits on the rows. 8 | template 9 | __device__ __host__ T numHermitian(T commonCols) { 10 | return commonCols / 2 + 1; 11 | 12 | } 13 | 14 | }}} // namespace 15 | -------------------------------------------------------------------------------- /src/fft/Utils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include "thpp/Tensor.h" 6 | #include "THCTensor.h" 7 | #include "src/CudaTensorUtils.h" 8 | 9 | #include 10 | #include 11 | 12 | namespace facebook { namespace deeplearning { namespace torch { 13 | enum class FFTOutputSpecification : bool { InPlace = true, OutOfPlace = false }; 14 | 15 | // Given a 4-D input tensor in (?, ?, row, col) storage mode and a common 16 | // padding specification for Rows and Cols, creates a real and complex cuda 17 | // tensor suitable for cuFFT. 18 | // If the FFTOutputSpecification is InPlace then complex and real alias the same 19 | // storage buffer. 20 | // The real 'time' tensor has: 21 | // - same dimensionality as the input tensor (4) 22 | // - same sizes as the input tensor 23 | // - modified strides to accommodate padding to (commonRows, commonCols) 24 | // The complex 'frequency' tensor has: 25 | // - dimensionality 5 to support AoS with S == cufftComplex == float[2] 26 | // - size == stride == (?, ?, NumRows, NumCols / 2 + 1) to accommodate the 27 | // output of cufft R2C which has only 1/2 the data due to Hermitian 28 | // symmetry (X[k] == X*[-k mod NumCols]) 29 | // 30 | // Warning going to multi-GPUs: In Version 6.0 only a subset of single GPU 31 | // functionality is supported for two GPU execution. 32 | // http://docs.nvidia.com/cuda/cufft/index.html#ixzz39Wu2cUWp 33 | // TODO:#4846735 extend for 1-D and 3-D FFTs 34 | // Always dim 4 (3b+1fft, 2b+2fft, 1b+3fft) atm, extend later 35 | // 36 | // This method always copies the real data. 
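// For example (sizes hypothetical): a 128x64x27x27 input padded to a common
// FFT size of 32x32 gives a real tensor of sizes (128, 64, 27, 27) whose
// strides span 32x32 planes, and a complex tensor of sizes
// (128, 64, 32, 32 / 2 + 1, 2) = (128, 64, 32, 17, 2).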
37 | // TODO(4948477) Remove the copy when it is not needed 38 | template 39 | std::unique_ptr 40 | makeCuFFTTensorReal( 41 | THCState* state, 42 | THCudaTensor* in, 43 | const std::vector& commonDims, 44 | THCudaTensor* candidateCudaStorageReal = nullptr, 45 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 46 | 47 | // Given a real tensor that is properly padded for interpolation, construct a 48 | // complex tensor that will hold the output of the CuFFT_R2C operation. 49 | // If in place, reuse the real THCudaTensor storage 50 | // Otherwise, if candidateCudaStorageComplex is large enough, use it. 51 | // Otherwise allocate a new cuda buffer. 52 | // 53 | // This method never copies data but will fill with 0 if allocation occurs. 54 | template 55 | std::unique_ptr 56 | makeCuFFTTensorComplex( 57 | THCState* state, 58 | THCudaTensor* real, 59 | const std::vector& commonDims, 60 | THCudaTensor* candidateCudaStorageComplex = nullptr, 61 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 62 | 63 | // Given a 4-D vector containing the sizes, this allocates a full tensor of 64 | // the specified sizes with strides matching exactly. 65 | // If candidate storage is specified it will try to reuse the storage. 66 | // This version does not need a model tensor but requires all dims to be 67 | // specified a-priori. 68 | template 69 | std::unique_ptr 70 | makeCuFFTTensorComplex( 71 | THCState* state, 72 | const std::vector& allDims, 73 | THCudaTensor* candidateCudaStorageComplex = nullptr); 74 | 75 | // Make properly sized and padded real and complex tensors on the Cuda device 76 | // This version is wasteful and always creates new storage; used in tests 77 | template 78 | std::pair, 79 | std::unique_ptr> 80 | makeCuFFTTensors( 81 | THCState* state, 82 | THCudaTensor* in, 83 | const std::vector& commonDims, 84 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 85 | 86 | // Make properly sized and padded real and complex tensors on the Cuda device 87 | // This version is wasteful and always creates new storage; used in tests 88 | template 89 | std::pair, 90 | std::unique_ptr> 91 | makeCuFFTTensors( 92 | THCState* state, 93 | thpp::Tensor& in, 94 | const std::vector& commonDims, 95 | FFTOutputSpecification inPlace = FFTOutputSpecification::OutOfPlace); 96 | 97 | } } } // namespace 98 | 99 | #include "Utils-inl.h" 100 | -------------------------------------------------------------------------------- /src/init.cu: -------------------------------------------------------------------------------- 1 | #include "luaT.h" 2 | #include "THC.h" 3 | 4 | #include "TemporalMaxPooling.cu" 5 | 6 | LUA_EXTERNC DLL_EXPORT int luaopen_libfbcunn(lua_State *L); 7 | 8 | int luaopen_libfbcunn(lua_State *L) 9 | { 10 | lua_newtable(L); 11 | 12 | fbcunn_TemporalMaxPooling_init(L); 13 | 14 | return 1; 15 | } 16 | -------------------------------------------------------------------------------- /src/util/AsyncCopier.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #ifndef DEEPLEARNING_TORCH_CUDA_UTIL_ASYNCCOPIER_H_ 7 | #define DEEPLEARNING_TORCH_CUDA_UTIL_ASYNCCOPIER_H_ 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | namespace facebook { namespace cuda { 19 | 20 | class AsyncCopier { 21 | public: 22 | explicit AsyncCopier(size_t bufferSize); 23 | 24 | void 
copyHtoD(void* dest, const void* src, size_t size); 25 | 26 | private: 27 | class Deallocator { 28 | public: 29 | void operator()(uint8_t* ptr) const; 30 | }; 31 | 32 | struct Event { 33 | explicit Event(int device); 34 | 35 | int device; 36 | folly::Optional<cudaEvent_t> event; 37 | ssize_t refCount; 38 | }; 39 | 40 | struct AllocatedBlock { 41 | AllocatedBlock(size_t s, size_t l) : start(s), length(l) { } 42 | size_t start; 43 | size_t length; 44 | Event* event = nullptr; 45 | }; 46 | 47 | static bool pollEvent(Event* event); // returns true if completed 48 | static void waitEvent(Event* event); 49 | 50 | typedef folly::small_vector RangeVec; 51 | RangeVec getRangesLocked() const; 52 | Event* getEventLocked(); 53 | void releaseEventLocked(Event* event); 54 | 55 | const size_t bufferSize_; 56 | std::unique_ptr<uint8_t, Deallocator> buffer_; 57 | 58 | std::mutex mutex_; 59 | std::vector<std::unique_ptr<Event>> events_; 60 | std::vector<std::unique_ptr<Event>> freeEvents_; 61 | std::deque<AllocatedBlock> allocated_; 62 | }; 63 | 64 | }} // namespaces 65 | 66 | #endif /* DEEPLEARNING_TORCH_CUDA_UTIL_ASYNCCOPIER_H_ */ 67 | -------------------------------------------------------------------------------- /src/util/GlobalAsyncCopier.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #include "src/util/GlobalAsyncCopier.h" 7 | 8 | #include <folly/Conv.h> 9 | #include <memory> 10 | #include <stdlib.h> 11 | 12 | #include "src/util/AsyncCopier.h" 13 | 14 | using namespace facebook::cuda; 15 | 16 | constexpr size_t kDefaultBufferSizeMB = 16; 17 | const char* const kBufferSizeEnvVar = "FB_CUDA_ASYNC_COPIER_BUFFER_SIZE_MB"; 18 | 19 | std::unique_ptr<AsyncCopier> makeGlobalCopier() { 20 | size_t bufferSize = kDefaultBufferSizeMB; 21 | auto ptr = getenv(kBufferSizeEnvVar); 22 | if (ptr) { 23 | bufferSize = folly::to<size_t>(ptr); 24 | } 25 | 26 | return std::make_unique<AsyncCopier>(bufferSize << 20); 27 | } 28 | 29 | extern "C" void fbCudaAsyncMemcpyHtoD(void* dest, 30 | const void* src, 31 | size_t size) { 32 | static auto copier = makeGlobalCopier(); 33 | copier->copyHtoD(dest, src, size); 34 | } 35 | -------------------------------------------------------------------------------- /src/util/GlobalAsyncCopier.h: -------------------------------------------------------------------------------- 1 | /** 2 | * Copyright 2014 Facebook 3 | * @author Tudor Bosman (tudorb@fb.com) 4 | */ 5 | 6 | #ifndef DEEPLEARNING_TORCH_CUDA_UTIL_GLOBALASYNCCOPIER_H_ 7 | #define DEEPLEARNING_TORCH_CUDA_UTIL_GLOBALASYNCCOPIER_H_ 8 | 9 | #include <stddef.h> 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | void fbCudaAsyncMemcpyHtoD(void* dest, const void* src, size_t size); 16 | 17 | #ifdef __cplusplus 18 | } 19 | #endif 20 | 21 | #endif /* DEEPLEARNING_TORCH_CUDA_UTIL_GLOBALASYNCCOPIER_H_ */ 22 | -------------------------------------------------------------------------------- /src/util/Misc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #include "src/util/Misc.h" 4 | 5 | #include <folly/Format.h> 6 | #include <stdexcept> 7 | #include 8 | 9 | using namespace std; 10 | 11 | namespace facebook { namespace cuda { 12 | 13 | cudaStream_t getComputeStream() { 14 | // It would be nice to compute on non-default streams from time to time, 15 | // but there's a *lot* of code to change. 16 | return 0; 17 | } 18 | 19 | [[noreturn]] void throwCudaError(cudaError_t error, const char* msg) { 20 | auto string = msg ?
21 | folly::sformat("{}: CUDA error {} ({})", msg, int(error), 22 | cudaGetErrorString(error)) : 23 | folly::sformat("CUDA error {} ({})", int(error), 24 | cudaGetErrorString(error)); 25 | throw std::runtime_error(string); 26 | } 27 | 28 | } } 29 | -------------------------------------------------------------------------------- /src/util/Misc.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include "cuda/util/CachedDeviceProperties.h" 6 | 7 | #include <cuda_runtime.h> 8 | 9 | namespace facebook { namespace cuda { 10 | 11 | [[noreturn]] void throwCudaError(cudaError_t, const char* msg); 12 | 13 | inline void 14 | checkCudaError(cudaError_t error, const char* msg = 0) { 15 | if (error != cudaSuccess) { 16 | throwCudaError(error, msg); 17 | } 18 | } 19 | 20 | class OnDevice { 21 | int m_home; 22 | public: 23 | explicit OnDevice(int newDev) : m_home(getDevice()) { 24 | checkCudaError(cudaSetDevice(newDev)); 25 | } 26 | 27 | ~OnDevice() { 28 | checkCudaError(cudaSetDevice(m_home)); 29 | } 30 | }; 31 | 32 | cudaStream_t getComputeStream(); 33 | 34 | } } 35 | -------------------------------------------------------------------------------- /src/util/Transform.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #include <assert.h> 4 | #include <math.h> 5 | 6 | #include "src/util/Transform.cuh" 7 | 8 | namespace facebook { namespace cuda { 9 | 10 | template <typename Operator> 11 | __global__ static void 12 | transformKernel(const typename Operator::Input* input, 13 | typename Operator::Output* out, 14 | size_t n) { 15 | 16 | Operator op; 17 | size_t start = threadIdx.x + blockIdx.x * blockDim.x; 18 | if (start >= n) return; 19 | out[start] = op(input[start]); 20 | } 21 | 22 | size_t roundUp(double d) { 23 | return size_t(ceil(d)); 24 | } 25 | 26 | template <typename Op> 27 | void transform(cudaStream_t stream, 28 | const typename Op::Input* input, 29 | typename Op::Output* out, 30 | size_t n) { 31 | static const int kThreadsPerBlock = 128; 32 | assert(n > 0); 33 | int totalNumBlocks = int(ceil(1.0 * n / kThreadsPerBlock)); 34 | dim3 blockDim(kThreadsPerBlock); 35 | dim3 gridDim(totalNumBlocks); 36 | transformKernel<Op><<<gridDim, blockDim, 0, stream>>>(input, out, n); 37 | } 38 | 39 | template void transform<ToHalf>(cudaStream_t stream, 40 | const ToHalf::Input* in, 41 | ToHalf::Output* out, 42 | size_t n); 43 | template void transform<ToFloat>(cudaStream_t stream, 44 | const ToFloat::Input* in, 45 | ToFloat::Output* out, 46 | size_t n); 47 | 48 | } } 49 | -------------------------------------------------------------------------------- /src/util/Transform.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-, Facebook, Inc. All Rights Reserved. 2 | 3 | #pragma once 4 | 5 | #include <cuda_runtime.h> 6 | #include <stddef.h> 7 | #include <stdint.h> 8 | 9 | namespace facebook { namespace cuda { 10 | 11 | /* 12 | * A generic interface for dense point-to-point operations. 13 | */ 14 | template <typename Operator> 15 | void transform(cudaStream_t stream, 16 | const typename Operator::Input* input, 17 | typename Operator::Output* out, size_t n); 18 | 19 | typedef uint16_t half_t; 20 | 21 | // Some pointwise operations. They must publicly define Input and 22 | // Output types, and provide an operator() mapping one input to one 23 | // output.
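// For illustration (a sketch, not part of the original file), a pointwise
// negation op would look like this:
//
//   struct Negate {
//     typedef float Input;
//     typedef float Output;
//     Output __device__ operator()(const Input f) { return -f; }
//   };
//
// transform<Negate>(stream, in, out, n) would then negate n floats, given a
// matching explicit instantiation in Transform.cu.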
24 | struct ToHalf { 25 | typedef float Input; 26 | typedef half_t Output; 27 | Output __device__ operator()(const Input f) { 28 | return __float2half_rn(f); 29 | } 30 | }; 31 | 32 | struct ToFloat { 33 | typedef half_t Input; 34 | typedef float Output; 35 | Output __device__ operator()(const Input h) { 36 | return __half2float(h); 37 | } 38 | }; 39 | 40 | } } 41 | -------------------------------------------------------------------------------- /test/CudaTensorTestKernels.cu: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #include "cuda/DeviceTensor.cuh" 3 | #include "cuda/util/CachedDeviceProperties.h" 4 | #include "src/DeviceTensorUtils.h" 5 | 6 | #include 7 | 8 | using namespace facebook::cuda; 9 | using namespace facebook::cuda; 10 | 11 | namespace facebook { namespace deeplearning { namespace torch { 12 | 13 | __global__ void testAssignment1dKernel(DeviceTensor tensor) { 14 | // Thread grid is already sized exactly for our tensor 15 | tensor[threadIdx.x] = threadIdx.x; 16 | } 17 | 18 | bool testAssignment1d(THCState* state, THCudaTensor* t) { 19 | DeviceTensor tensor = 20 | torchToDeviceTensor(state, t); 21 | 22 | const cudaDeviceProp& deviceProp = getDeviceProperties(0); 23 | 24 | if (deviceProp.maxThreadsDim[0] < tensor.getSize(0)) { 25 | // tensor too large to be covered exactly by threads in one block alone 26 | return false; 27 | } 28 | 29 | testAssignment1dKernel<<<1, tensor.getSize(0)>>>(tensor); 30 | 31 | return (cudaGetLastError() == cudaSuccess); 32 | } 33 | 34 | __global__ void testAssignment3dKernel(DeviceTensor tensor) { 35 | // Thread grid is already sized exactly for our tensor 36 | tensor[threadIdx.z][threadIdx.y][threadIdx.x] = 37 | tensor.getSize(0) * threadIdx.z + 38 | tensor.getSize(1) * threadIdx.y + 39 | tensor.getSize(2) * threadIdx.x; 40 | } 41 | 42 | bool testAssignment3d(THCState* state, THCudaTensor* t) { 43 | DeviceTensor tensor = torchToDeviceTensor(state, t); 44 | 45 | const cudaDeviceProp& deviceProp = getDeviceProperties(0); 46 | 47 | for (int i = 0; i < 3; ++i) { 48 | if (deviceProp.maxThreadsDim[i] < tensor.getSize(i)) { 49 | // tensor too large to be covered exactly by threads in one block alone 50 | return false; 51 | } 52 | } 53 | 54 | dim3 threadsPerBlock(tensor.getSize(2), 55 | tensor.getSize(1), 56 | tensor.getSize(0)); 57 | testAssignment3dKernel<<<1, threadsPerBlock>>>(tensor); 58 | 59 | return (cudaGetLastError() == cudaSuccess); 60 | } 61 | 62 | template 63 | bool verifyUpcast(DeviceTensor up, 64 | DeviceTensor orig) { 65 | int shift = NewDim - Dim; 66 | 67 | // Check extended dimensions size and stride 68 | for (int i = 0; i < shift; ++i) { 69 | if (up.getSize(i) != 1) { 70 | return false; 71 | } else if (up.getStride(i) != 72 | orig.getStride(0) * orig.getSize(0)) { 73 | return false; 74 | } 75 | } 76 | 77 | // Check original dimensions size and stride 78 | for (int i = shift; i < NewDim; ++i) { 79 | if (up.getSize(i) != orig.getSize(i - shift)) { 80 | return false; 81 | } else if (up.getStride(i) != orig.getStride(i - shift)) { 82 | return false; 83 | } 84 | } 85 | 86 | return true; 87 | } 88 | 89 | bool testUpcast(THCState* state, THCudaTensor* t) { 90 | DeviceTensor tensor = torchToDeviceTensor(state, t); 91 | 92 | if (!verifyUpcast(tensor.upcastOuter<4>(), tensor)) { 93 | return false; 94 | } else if (!verifyUpcast(tensor.upcastOuter<5>(), tensor)) { 95 | return false; 96 | } 97 | 98 | return true; 99 | } 100 | 101 | bool 
testDowncastTo2d(THCState* state, THCudaTensor* t) { 102 | DeviceTensor tensor = torchToDeviceTensor(state, t); 103 | DeviceTensor downTensor = tensor.downcastOuter<2>(); 104 | 105 | if (downTensor.getSize(0) != 106 | tensor.getSize(0) * tensor.getSize(1)) { 107 | return false; 108 | } else if (downTensor.getStride(0) != 109 | tensor.getSize(2) * tensor.getStride(2)) { 110 | return false; 111 | } else if (downTensor.getSize(1) != 112 | tensor.getSize(2)) { 113 | return false; 114 | } else if (downTensor.getStride(1) != 115 | tensor.getStride(2)) { 116 | return false; 117 | } 118 | 119 | return true; 120 | } 121 | 122 | bool testDowncastTo1d(THCState* state, THCudaTensor* t) { 123 | DeviceTensor tensor = torchToDeviceTensor(state, t); 124 | DeviceTensor downTensor = tensor.downcastOuter<1>(); 125 | 126 | if (downTensor.getSize(0) != 127 | tensor.getSize(0) * tensor.getSize(1) * tensor.getSize(2)) { 128 | return false; 129 | } else if (downTensor.getStride(0) != 130 | tensor.getStride(2)) { 131 | return false; 132 | } 133 | 134 | return true; 135 | } 136 | 137 | __global__ void testDowncastWritesKernel(DeviceTensor tensor) { 138 | // Thread grid is already sized exactly for our tensor 139 | tensor[threadIdx.x] = 1.0f; 140 | } 141 | 142 | bool testDowncastWrites(THCState* state, THCudaTensor* t) { 143 | DeviceTensor tensor = torchToDeviceTensor(state, t); 144 | DeviceTensor downTensor = tensor.downcastOuter<1>(); 145 | 146 | testDowncastWritesKernel<<<1, downTensor.getSize(0)>>>(downTensor); 147 | return (cudaGetLastError() == cudaSuccess); 148 | } 149 | 150 | } } } // namespace 151 | -------------------------------------------------------------------------------- /test/CudaTensorTestKernels.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | struct THCudaTensor; 5 | 6 | /// 7 | /// Collection of kernels for testing DeviceTensor<> 8 | /// 9 | 10 | namespace facebook { namespace deeplearning { namespace torch { 11 | 12 | /// Assign values to the tensor via CudaTensor based on position 13 | bool testAssignment1d(THCState* state, THCudaTensor* tensor); 14 | bool testAssignment3d(THCState* state, THCudaTensor* tensor); 15 | 16 | /// Test upcasting to a higher-dimensional tensor 17 | bool testUpcast(THCState* state, THCudaTensor* tensor); 18 | 19 | /// Downcast tests 20 | bool testDowncastTo2d(THCState* state, THCudaTensor* tensor); 21 | bool testDowncastTo1d(THCState* state, THCudaTensor* tensor); 22 | bool testDowncastWrites(THCState* state, THCudaTensor* tensor); 23 | 24 | } } } // namespace 25 | -------------------------------------------------------------------------------- /test/InputCentricConvolution_UpdateOutput.cuh: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 
2 | #pragma once 3 | 4 | struct THCudaTensor; 5 | struct THCState; 6 | 7 | namespace facebook { namespace deeplearning { namespace torch { namespace test { 8 | 9 | bool InputCentricRelayoutConvolution_UpdateOutput(THCState* state, 10 | THCudaTensor* inputTH, 11 | THCudaTensor* kernelsTH, 12 | long filterRowStride, 13 | long filterColStride, 14 | THCudaTensor* outputTH); 15 | 16 | } } } } 17 | -------------------------------------------------------------------------------- /test/ReferenceConvolutions.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "src/Tensor.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace facebook { namespace deeplearning { namespace torch { namespace test { 10 | 11 | /// 12 | /// Reference convolution/cross-correlation implementations 13 | /// 14 | 15 | /// Returns the output size based on the input and filter size and 16 | /// stride for a valid-only convolution or cross-correlation 17 | constexpr long 18 | getValidConvSize(long inputSize, long filterSize, long filterStride) { 19 | return ((inputSize - filterSize) / filterStride) + 1; 20 | } 21 | 22 | /// Returns the output size based on the input and filter size and 23 | /// stride for a reverse valid-only convolution or cross-correlation 24 | constexpr long 25 | getValidRevConvSize(long inputSize, long filterSize, long filterStride) { 26 | return inputSize - (filterSize - 1) * filterStride; 27 | } 28 | 29 | /// Returns the output size based on the input and filter size and 30 | /// stride for a full convolution or cross-correlation 31 | constexpr long 32 | getFullConvSize(long inputSize, long filterSize, long filterStride) { 33 | return (inputSize - 1) * filterStride + filterSize; 34 | } 35 | 36 | /// Input to output: 37 | /// 38 | /// input (batch x img planes x img row x img col) 39 | /// star (valid only) 40 | /// filters (filter planes x img planes x filter row x filter col) 41 | /// = 42 | /// output (batch x filter planes x 43 | /// getValidConvSize(img row, filter row, stride), 44 | /// getValidConvSize(img col, filter col, stride)) 45 | /// Optional input padding is expressed as 46 | /// on each innermost 2d plane. 47 | Tensor 48 | crossCorrelationValidOnly( 49 | const Tensor& input, 50 | const Tensor& filters, 51 | long filterRowStride, 52 | long filterColStride, 53 | const folly::Optional>& padding = 54 | folly::none); 55 | 56 | Tensor 57 | crossCorrelationValidOnlyInputCentric( 58 | const Tensor& input, 59 | const Tensor& filters, 60 | long filterRowStride, 61 | long filterColStride, 62 | const folly::Optional>& padding = 63 | folly::none); 64 | 65 | /// Output gradient to input gradient: 66 | /// 67 | /// output (batch x filter planes x 68 | /// getValidConvSize(img row, filter row, stride), 69 | /// getValidConvSize(img col, filter col, stride)) 70 | /// * (full) 71 | /// filters (filter planes x img planes x filter row x filter col) 72 | /// = 73 | /// input (batch x img planes x img row x img col) 74 | /// Optional input padding is expressed as 75 | /// on each innermost 2d plane. 
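/// Worked example (hypothetical shapes): a 16x3x32x32 input with 8x3x5x5
/// filters at stride 1 gives a valid output of 16x8x28x28, since
/// getValidConvSize(32, 5, 1) = (32 - 5) / 1 + 1 = 28; the full pass below
/// maps the output gradient back, getFullConvSize(28, 5, 1) = 27 * 1 + 5 = 32.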
76 | Tensor 77 | convolutionFull( 78 | const Tensor& output, 79 | const Tensor& filters, 80 | long filterRowStride, 81 | long filterColStride, 82 | const folly::Optional>& padding = 83 | folly::none); 84 | 85 | /// Output gradient to weights: 86 | /// 87 | /// input (batch x img planes x img row x img col) 88 | /// star (valid only) 89 | /// output (batch x filter planes x 90 | /// getValidRevConvSize(img row, filter row, stride), 91 | /// getValidRevConvSize(img col, filter col, stride)) 92 | /// = 93 | /// weight gradient (filter planes x img planes x filter row x filter col) 94 | /// Optional input padding is expressed as 95 | /// on each innermost 2d plane. Scale is a multiplicative factor 96 | /// applied pointwise to every output point 97 | Tensor 98 | crossCorrelationReverseValidOnly( 99 | const Tensor& input, 100 | const Tensor& output, 101 | long filterRowStride, 102 | long filterColStride, 103 | float scale, 104 | const folly::Optional>& padding = 105 | folly::none); 106 | 107 | } } } } // namespace 108 | -------------------------------------------------------------------------------- /test/TestUtils.h: -------------------------------------------------------------------------------- 1 | // Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | #pragma once 4 | #include "cuda/DeviceTensor.cuh" 5 | #include "src/CudaTensorUtils.h" 6 | #include "src/DeviceTensorUtils.h" 7 | #include "THCTensor.h" 8 | #include "src/fft/CuFFTConvolution_UpdateOutput.cuh" 9 | #include "src/fft/Utils.h" 10 | 11 | #include 12 | #include 13 | 14 | namespace facebook { namespace deeplearning { namespace torch { namespace test { 15 | 16 | // Constructs a full CUDA tensor of the same size as the input 17 | std::unique_ptr 18 | makeTHCudaTensorSameSize(THCState* state, const thpp::Tensor& t); 19 | 20 | // Constructs a full CUDA tensor with constant values 21 | thpp::Tensor 22 | makeRandomTestTensor(std::initializer_list sizeList); 23 | 24 | thpp::Tensor makeTestTensor(std::initializer_list sizeList, 25 | float constant); 26 | 27 | // Constructs a CUDA tensor by scaling the factor list 28 | thpp::Tensor makeTestTensor( 29 | std::initializer_list sizeList, 30 | std::initializer_list factorList, 31 | const folly::Optional>& padding = 32 | folly::none); 33 | 34 | // Constructs a full CUDA tensor by scaling {0.1f, 0.2f, 0.3f, 0.4f} 35 | thpp::Tensor makeTestTensor(std::initializer_list sizeList); 36 | 37 | 38 | bool isWithin(float a, float b, float relativeError = 1e-5f); 39 | 40 | // Returns true or false if the two tensors match within some relative 41 | // error; also returns the 2d slice where they first differ as a 42 | // string if they do. 43 | // PrecisionDebug controls how many digits are printed on error in the 44 | // returned string. 45 | // If compareInter is set to true, comparison will only be performed on the 46 | // intersection subtensors: 47 | // [0, min(reference.size(0), test.size(0))] x ... 
x 48 | // [0, min(reference.size(dim-1), test.size(dim-1))] 49 | // This is useful for kernels that write tail garbage 50 | std::pair 51 | compareTensors(const thpp::Tensor& reference, 52 | const thpp::Tensor& test, 53 | float relativeError = 1e-5f, 54 | int precisionDebug = 4, 55 | bool compareInter = false); 56 | 57 | // Constructs a full CUDA tensor of the same size as the input 58 | template 59 | std::unique_ptr 60 | makeTHCudaTensorSameSize(THCState* state, 61 | const cuda::DeviceTensor& t) { 62 | std::vector sizes; 63 | std::vector strides; 64 | for (int i = 0; i < Dim; ++i) { 65 | sizes.push_back(t.getSize(i)); 66 | strides.push_back(t.getStride(i)); 67 | } 68 | 69 | return makeTHCudaTensorFull(state, sizes, strides); 70 | } 71 | 72 | }}}} // namespace 73 | -------------------------------------------------------------------------------- /test/benchmark_fft.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | require('fb.luaunit') 3 | 4 | require 'cunn' 5 | 6 | require 'fbcunn' 7 | 8 | torch.setdefaulttensortype('torch.FloatTensor') 9 | 10 | local test = {} 11 | 12 | -- These are used for fast, exhaustive search over the parameters space 13 | -- Can be overridden by setting problemSizes to non-{} 14 | local batchList = { 15 | 128, 64, 32, 16 | } 17 | local filterList = { 18 | 128, 96, 64, 32, 24, 19 | } 20 | local planeList = { 21 | 128, 96, 64, 32, 24, 3 22 | } 23 | local inputRowList = { 24 | 128, 96, 64, 32, 16, 13 25 | } 26 | local inputColList = { 27 | 128, 96, 64, 32, 16, 13 28 | } 29 | local kernelRowList = { 30 | 11, 9, 7, 5, 3 31 | } 32 | local kernelColList = {} 33 | 34 | -- batch, filters, plane, row, col, kernelRow, kernelCol overrides 35 | -- the List arguments 36 | -- This is particularly useful to explore tradeoffs between cufft 37 | -- efficiency at various interpolation sizes and amount of work in 38 | -- transpose + mxm 39 | 40 | -- Soumith's benchmark sizes 41 | local fixedSizes = { 42 | -- {128, 96, 3, 128, 128, 11, 11}, 43 | -- {128, 64, 64, 64, 64, 9, 9}, 44 | -- {128, 128, 128, 32, 32, 9, 9}, 45 | -- {128, 128, 128, 16, 16, 7, 7}, 46 | -- {128, 384, 384, 13, 13, 3, 3}, 47 | {128, 96, 256, 31, 31, 5, 5}, -- 1 GPU 48 | {128, 96, 128, 31, 31, 5, 5}, -- 2 GPU 49 | {64, 96, 256, 31, 31, 5, 5}, -- 2 GPU 50 | {128, 96, 256, 21, 31, 5, 5}, -- 2 GPU, 27 / 2 = 14 + 4 + 3 51 | {128, 96, 64, 31, 31, 5, 5}, -- 4 GPU 52 | {32, 96, 256, 31, 31, 5, 5}, -- 4 GPU 53 | {128, 96, 256, 14, 31, 5, 5}, -- 4 GPU, 27 / 4 = 7 + 4 + 3 54 | {64, 96, 256, 21, 31, 5, 5}, -- 4 GPU, 27 / 2 = 14 + 4 + 3 55 | {128, 96, 128, 21, 31, 5, 5}, -- 4 GPU, 27 / 2 = 14 + 4 + 3 56 | {64, 96, 128, 31, 31, 5, 5}, -- 2 GPU 57 | } 58 | 59 | -- Running 76 81 84 8 9 92 88 60 | -- Running 176 3 9 8 1 13 54 61 | 62 | -- Set this to {} to run a small search around the fixedSizes 63 | local problemSizes = fixedSizes -- {} 64 | 65 | local problemSize = {} 66 | 67 | local function testLoop() 68 | -- Just allocate some dummy placeholder to get to the proper 69 | -- function in the lua module 70 | local net = nn.SpatialConvolutionCuFFT(1, 1, 1, 1) 71 | local input = torch.Tensor(1, 1, 1, 1):normal():cuda() 72 | 73 | if table.getn(problemSize) > 0 then 74 | batchList = {problemSize[1]} 75 | filterList = {problemSize[2]} 76 | planeList = {problemSize[3]} 77 | inputRowList = {problemSize[4]} 78 | inputColList = {problemSize[5]} 79 | kernelRowList = {problemSize[6]} 80 | kernelColList = {} 81 | end 82 | 83 | local batches = 
torch.Tensor(batchList):cuda() 84 | local filters = torch.Tensor(filterList):cuda() 85 | local planes = torch.Tensor(planeList):cuda() 86 | local inputRows = torch.Tensor(inputRowList):cuda() 87 | local inputCols = torch.Tensor(inputColList):cuda() 88 | local kernelRows = torch.Tensor(kernelRowList):cuda() 89 | local kernelCols = torch.Tensor(kernelColList):cuda() 90 | 91 | print('-------------------------------------------------------') 92 | net:explorePerformance(input, batches, filters, 93 | planes, inputRows, inputCols, kernelRows, kernelCols) 94 | 95 | net:cleanupBuffers(input) 96 | collectgarbage() 97 | end 98 | 99 | if table.getn(problemSizes) >= 1 then 100 | for i = 1, table.getn(problemSizes) do 101 | problemSize = problemSizes[i] 102 | testLoop() 103 | end 104 | else 105 | testLoop() 106 | end 107 | -------------------------------------------------------------------------------- /test/fb_test.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | 4 | include('test.lua') 5 | 6 | nn.testfbcunn() 7 | -------------------------------------------------------------------------------- /test/run_test_HSM_seed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | for tt in 1 2 4 8 16 32 3 | do 4 | export OMP_NUM_THREADS=$tt 5 | echo "" 6 | echo "" 7 | echo "number of threads $tt" 8 | _build/opt/deeplearning/torch/th.llar deeplearning/torch/layers/test/test_HSM_speed.lua 9 | done 10 | -------------------------------------------------------------------------------- /test/test_CuBLAS.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | require 'fb.luaunit' 3 | require 'fbtorch' 4 | require 'cunn' 5 | require 'fbcunn' 6 | 7 | torch.setdefaulttensortype('torch.FloatTensor') 8 | 9 | local fb_test = {} 10 | 11 | -- Let C = m-by-n and A = m-by-k 12 | -- Format is m, n, k, seqIter, batch, numHandles, numStreams 13 | local problemSize = { 14 | -- Sanity tests 15 | -- Trivial mxm, no batch, no iter 16 | {1, 1, 2, {}, {}, 0, 0}, 17 | {1, 1, 2, {}, {}, 0, 1}, 18 | {1, 1, 2, {}, {}, 1, 0}, 19 | {1, 1, 2, {}, {}, 1, 1}, 20 | {1, 1, 2, {}, {}, 16, 16}, 21 | -- 2x4 <- 2x8 * 8x4 as 1 iter, 1 batch 22 | {2, 4, 8, {1}, {1}, 1, 1}, 23 | -- 2x4 <- 2x8 * 8x4 as 1 iter, no batch 24 | {2, 4, 8, {1}, {}, 1, 1}, 25 | -- 2x4 <- 2x8 * 8x4 as no iter, 1 batch 26 | {2, 4, 8, {}, {1}, 1, 1}, 27 | -- 2x4 <- 2x8 * 8x4 as no iter, no batch 28 | {2, 4, 8, {}, {}, 1, 1}, 29 | -- 128x128 <- 128x128 * 128x128 as 4x4 iter, 4x4 batch 30 | {128, 128, 128, {4, 4}, {4, 4}, 1, 1}, 31 | {1024, 1024, 1024, {1, 1}, {1, 1}, 1, 1}, 32 | {1024, 1024, 1024, {}, {}, 1, 1}, 33 | -- Various way of performing temporal convolution of 512: 32 -> 16 34 | {16, 1024, 512, {}, {1}, 1, 1}, 35 | {16, 1024, 512, {}, {}, 1, 1}, 36 | {1, 1024, 512, {16}, {1}, 1, 1}, 37 | {1, 1024, 512, {1}, {16}, 1, 1}, 38 | {32 * 16, 1024, 512, {1}, {1}, 1, 1}, 39 | {1, 1024, 512, {16 * 32}, {1}, 1, 1}, 40 | {16, 1024, 512, {32}, {1}, 16, 1}, 41 | {16, 1024, 512, {1}, {32}, 0, 0}, 42 | {1, 1024, 512, {1}, {16 * 32}, 1, 1}, 43 | } 44 | 45 | -- This test exercises the performance of multi-handle + multi-stream on many 46 | -- small gemms. 
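-- For instance, {513, 513, 513, {53}, {}, 4, 4} below runs 53 iterated
-- 513x513x513 gemms over 4 cuBLAS handles and 4 streams, while the
-- {..., 0, 0} variant runs the same work on the default handle and stream.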
47 | local _testMultiHandlePerf = { 48 | {513, 513, 513, {53}, {}, 0, 0}, 49 | {513, 513, 513, {53}, {}, 1, 1}, 50 | {513, 513, 513, {53}, {}, 1, 4}, 51 | {513, 513, 513, {53}, {}, 4, 1}, 52 | {513, 513, 513, {53}, {}, 4, 4}, 53 | } 54 | 55 | local function concat(t1,t2) 56 | local res = {} 57 | for i=1,#t1 do 58 | res[#res + 1] = t1[i] 59 | end 60 | for i=1,#t2 do 61 | res[#res + 1] = t2[i] 62 | end 63 | return res 64 | end 65 | 66 | local function testLoop(problemSize) 67 | -- Just allocate some dummy placeholder to get to the proper 68 | -- function in the lua module 69 | local net = nn.CuBLASWrapper(true) 70 | 71 | local m = problemSize[1] 72 | local n = problemSize[2] 73 | local k = problemSize[3] 74 | local seqIter = problemSize[4] 75 | local batch = problemSize[5] 76 | local handles = problemSize[6] 77 | local streams = problemSize[7] 78 | local seqBatch = concat(seqIter, batch) 79 | local sA = torch.LongStorage(concat(seqBatch, {m, k})) 80 | local sB = torch.LongStorage(concat(seqBatch, {k, n})) 81 | local sC = torch.LongStorage(concat(seqBatch, {m, n})) 82 | local A = torch.Tensor(sA):cuda() 83 | local B = torch.Tensor(sB):cuda() 84 | local C = torch.Tensor(sC):cuda() 85 | 86 | cutorch.reserveBlasHandles(handles) 87 | cutorch.reserveStreams(streams) 88 | cutorch.synchronize() 89 | net:matmult(A, B, C, seqIter, batch) 90 | mytester:assert(true) 91 | 92 | cutorch.synchronize() 93 | collectgarbage() 94 | end 95 | 96 | function fb_test.testGEMMs() 97 | for i = 1, table.getn(_testMultiHandlePerf) do 98 | testLoop(_testMultiHandlePerf[i]) 99 | end 100 | for i = 1, table.getn(problemSize) do 101 | testLoop(problemSize[i]) 102 | end 103 | end 104 | 105 | mytester = torch.Tester() 106 | mytester:add(fb_test) 107 | mytester:run() 108 | -------------------------------------------------------------------------------- /test/test_DataParallel.lua: -------------------------------------------------------------------------------- 1 | local fboptim = require('fboptim') 2 | -- Copyright 2004-present Facebook. All Rights Reserved. 3 | 4 | require 'fb.luaunit' 5 | require 'optim' 6 | require 'fbcunn' 7 | require 'cunn' 8 | 9 | -- Hyper-params. We're targeting a toy problem that computes 10 | -- some function of its inputs. 11 | local inputWidth = 32 12 | local hiddenWidth = 512 13 | local nHidden = 2 14 | local outputWidth = 1 15 | local numGPUs = cutorch.getDeviceCount() 16 | 17 | local function targetFunction(x) 18 | -- admittedly tough for us to learn, but hey. 
19 | local retval = torch.Tensor(outputWidth) 20 | local sum = x:sum() 21 | retval[1] = math.sin(sum) 22 | return retval 23 | end 24 | 25 | local function genInput() 26 | return torch.randn(inputWidth) 27 | end 28 | 29 | local function genWideInput() 30 | return torch.randn(inputWidth * numGPUs) 31 | end 32 | 33 | local function getNarrowedInputRange(i) 34 | assert(type(i) == 'number') 35 | local rangeStart = 1 + ((i - 1) * inputWidth) 36 | local rangeEnd = rangeStart + (inputWidth) - 1 37 | return rangeStart, rangeEnd 38 | end 39 | 40 | local function getNarrowedInput(input, i) 41 | assert(torch.typename(input)) 42 | assert(type(i) == 'number') 43 | return input[{ {getNarrowedInputRange(i)} }] 44 | end 45 | 46 | local function genWideExample() 47 | local samp = genWideInput() 48 | local retval = torch.Tensor(outputWidth * numGPUs) 49 | for i = 1,numGPUs do 50 | retval[i] = targetFunction(getNarrowedInput(samp, i)) 51 | end 52 | return samp:cuda(), retval:cuda() 53 | end 54 | 55 | local function simpleModel() 56 | local seq = nn.Sequential() 57 | local pred = inputWidth 58 | for i = 1,nHidden do 59 | seq:add(nn.Linear(pred, hiddenWidth)) 60 | seq:add(nn.Tanh()) 61 | pred = hiddenWidth 62 | end 63 | seq:add(nn.Linear(hiddenWidth, outputWidth)) 64 | seq:add(nn.Tanh()) 65 | return seq 66 | end 67 | 68 | local function tensorsAreProbablySimilar(l, r, epsilon) 69 | epsilon = epsilon or 0.00001 70 | return math.abs(l:norm() - r:norm()) < epsilon 71 | end 72 | 73 | function testDataParallel() 74 | -- Set up models on each GPU. 75 | local dp = nn.DataParallel(1) 76 | local simpleModels = {} 77 | for i = 1,numGPUs do 78 | if i == 1 then 79 | simpleModels[i] = simpleModel() 80 | else 81 | simpleModels[i] = simpleModels[1]:clone() 82 | end 83 | dp:add(simpleModels[i]) 84 | end 85 | 86 | -- CPU models to cross-validate 87 | local cpuModels = {} 88 | local function syncCPUModels() 89 | for i = 1,numGPUs do 90 | cpuModels[i] = simpleModels[i]:clone() 91 | cpuModels[i] = cpuModels[i]:double() 92 | end 93 | end 94 | syncCPUModels() 95 | 96 | -- Check an input/output pair against the CPU models 97 | local function checkWideResult(inputs, outputs) 98 | local function checkOneResult(input, modIdx, expectedOutput) 99 | input = input:double() -- de-cudify 100 | assert(tensorsAreProbablySimilar(cpuModels[modIdx]:forward(input), 101 | expectedOutput)) 102 | end 103 | for j = 1, numGPUs do 104 | checkOneResult(getNarrowedInput(inputs, j), j, outputs[{ {j} }]) 105 | end 106 | end 107 | 108 | local function checkCPUModelsAreEquivalent() 109 | syncCPUModels() 110 | local input = genInput() 111 | local out = cpuModels[1]:forward(input) 112 | for j = 2, numGPUs do 113 | assert(tensorsAreProbablySimilar(out, cpuModels[j]:forward(input))) 114 | end 115 | end 116 | checkCPUModelsAreEquivalent() 117 | 118 | dp:cuda() 119 | 120 | -- Make sure forward produces same results as an individual copy 121 | for i=1, 10 do 122 | local inputs, targets = genWideExample() 123 | local outputs = dp:forward(inputs) 124 | syncCPUModels() 125 | checkWideResult(inputs, outputs) 126 | end 127 | 128 | local optimState = { 129 | learningRate = 1e-1, 130 | weightDecay = 1e-4, 131 | momentum = 0.9, 132 | learningRateDecay = 1e-7 133 | } 134 | 135 | local timer = torch.Timer() 136 | local opt = nn.Optim(dp, optimState) 137 | local criterion = nn.MSECriterion():cuda() 138 | 139 | local num_iteration = 10 140 | timer:reset() 141 | for i=1, num_iteration do 142 | local inputs, targets = genWideExample() 143 | local outputs = dp:forward(inputs) 144 | 
syncCPUModels() 145 | checkWideResult(inputs, outputs) 146 | opt:optimize(fboptim.sgd, inputs, targets, criterion) 147 | local out = dp:forward(inputs) 148 | local err = criterion:forward(out, targets) 149 | end 150 | checkCPUModelsAreEquivalent() 151 | 152 | -- Check only the speed for forward/backward. 153 | timer:reset(); 154 | for i=1, num_iteration do 155 | local inputs, targets = genWideExample() 156 | dp:forward(inputs) 157 | opt:optimize(fboptim.sgd, inputs, targets, criterion) 158 | end 159 | end 160 | 161 | LuaUnit:main() 162 | -------------------------------------------------------------------------------- /test/test_GroupKMaxPooling.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require('fb.luaunit') 4 | 5 | require('math') 6 | 7 | require('fbtorch') 8 | 9 | require('nn') 10 | 11 | require('fbcunn') 12 | require('fbnn') 13 | 14 | function run_GroupKMaxPooling_updateOutput(n, d, k) 15 | -- n = number of words 16 | -- d = dimension of embeddings 17 | -- k = k-max pooling 18 | local input = torch.randn(n, d) 19 | local kmax = nn.GroupKMaxPooling(k) 20 | 21 | local output = kmax:updateOutput(input) 22 | 23 | assert(output == kmax.output) 24 | assert(output:size(1) == k) 25 | assert(output:size(2) == input:size(2)) 26 | 27 | local norms = torch.norm(input, 2, 2) 28 | local _, kmax_indices = torch.sort(norms, 1) 29 | kmax_indices = kmax_indices[{{-k,-1}}] 30 | kmax_indices = torch.sort(kmax_indices, 1) 31 | 32 | local kmax_result = torch.Tensor(k, input:size(2)) 33 | for i = 1, kmax_indices:size(1) do 34 | kmax_result:select(1, i):copy(input:select(1, kmax_indices[i][1])) 35 | end 36 | 37 | assert(torch.sum(torch.eq(kmax_result, output)) == torch.numel(output)) 38 | end 39 | 40 | function test_GroupKMaxPooling_updateOutput() 41 | run_GroupKMaxPooling_updateOutput(10, 11, 4) 42 | end 43 | 44 | function run_GroupKMaxPooling_updateOutput_batch(b, n, d, k) 45 | -- b = batch size 46 | -- n = number of words 47 | -- d = dimension of embeddings 48 | -- k = k-max pooling 49 | local input = torch.randn(b, n, d) 50 | local kmax = nn.GroupKMaxPooling(k) 51 | 52 | local output = kmax:updateOutput(input) 53 | 54 | assert(output == kmax.output) 55 | assert(output:size(1) == b) 56 | assert(output:size(2) == k) 57 | assert(output:size(3) == input:size(3)) 58 | 59 | local norms = torch.norm(input, 2, 3):squeeze() 60 | local _, kmax_indices = torch.sort(norms, 2) 61 | kmax_indices = kmax_indices:sub(1, -1, -k, -1) 62 | kmax_indices = torch.sort(kmax_indices, 2) 63 | 64 | local kmax_result = torch.Tensor(input:size(1), k, input:size(3)) 65 | kmax_result:fill(0.0) 66 | 67 | for i = 1, kmax_indices:size(1) do 68 | for j = 1, kmax_indices:size(2) do 69 | kmax_result:select(1, i):select(1, j):copy( 70 | input:select(1, i):select(1, kmax_indices[i][j])) 71 | end 72 | end 73 | 74 | assert(torch.sum(torch.eq(kmax_result, output)) == torch.numel(output)) 75 | end 76 | 77 | function test_GroupKMaxPooling_updateOutput_batch() 78 | run_GroupKMaxPooling_updateOutput_batch(15, 10, 11, 4) 79 | end 80 | 81 | function run_GroupKMaxPooling_updateGradInput(n, d, k) 82 | -- n = number of words 83 | -- d = dimension of embeddings 84 | -- k = k-max pooling 85 | local input = torch.randn(n, d) 86 | 87 | local kmax = nn.GroupKMaxPooling(k) 88 | 89 | local output = kmax:updateOutput(input) 90 | 91 | local delta = torch.randn(output:size()) 92 | 93 | local gradInput = kmax:updateGradInput(input, delta) 94 | 95 | 
assert(gradInput == kmax.gradInput) 96 | 97 | assert(gradInput:sum() == delta:sum()) 98 | end 99 | 100 | function test_GroupKMaxPooling_updateGradInput() 101 | run_GroupKMaxPooling_updateGradInput(10, 11, 4) 102 | end 103 | 104 | 105 | function run_GroupKMaxPooling_updateGradInput_batch(b, n, d, k) 106 | -- b = batch size, n = number of words 107 | -- d = dimension of embeddings 108 | -- k = k-max pooling 109 | local input = torch.randn(b, n, d) 110 | 111 | local kmax = nn.GroupKMaxPooling(k) 112 | 113 | local output = kmax:updateOutput(input) 114 | 115 | local delta = torch.randn(output:size()) 116 | 117 | local gradInput = kmax:updateGradInput(input, delta) 118 | 119 | assert(gradInput == kmax.gradInput) 120 | 121 | assert(gradInput:sum() == delta:sum()) 122 | end 123 | 124 | function test_GroupKMaxPooling_updateGradInput_batch() 125 | run_GroupKMaxPooling_updateGradInput_batch(12, 10, 11, 4) 126 | end 127 | 128 | LuaUnit:main() 129 | -------------------------------------------------------------------------------- /test/test_HSM.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'fbcunn' 5 | require 'fbnn' 6 | 7 | local function test_finite_diff_gradInput(model, input, target) 8 | local eps = 1e-3 9 | local output = model:updateOutput(input, target) 10 | local gradInput = model:updateGradInput(input, target):clone() 11 | 12 | local gradInput2 = torch.Tensor(input:size()) 13 | if input:dim() == 1 then 14 | for i = 1,input:size(1) do 15 | input[i] = input[i] + eps 16 | local outputP = model:updateOutput(input, target) 17 | input[i] = input[i] - 2*eps 18 | local outputM = model:updateOutput(input, target) 19 | input[i] = input[i] + eps 20 | gradInput2[i] = (outputP - outputM) / (2*eps) 21 | end 22 | else 23 | assert(input:dim() == 2) 24 | for i = 1,input:size(1) do 25 | for j = 1,input:size(2) do 26 | input[i][j] = input[i][j] + eps 27 | local outputP = model:updateOutput(input, target) 28 | input[i][j] = input[i][j] - 2*eps 29 | local outputM = model:updateOutput(input, target) 30 | input[i][j] = input[i][j] + eps 31 | gradInput2[i][j] = (outputP - outputM) / (2*eps) 32 | end 33 | end 34 | end 35 | return (gradInput - gradInput2):abs():max() 36 | end 37 | 38 | local function test_finite_diff_accGrads(model, input, target, scale) 39 | local eps = 1e-3 40 | scale = scale or 1 41 | 42 | local w, dw = model:getParameters() 43 | 44 | dw:zero() 45 | local output = model:updateOutput(input, target) 46 | local gradInput = model:updateGradInput(input, target):clone() 47 | model:accGradParameters(input, target, scale) 48 | local gradParams = dw:clone() 49 | 50 | local gradParams2 = torch.Tensor(w:size(1)) 51 | for i = 1,w:size(1) do 52 | w[i] = w[i] + eps 53 | local outputP = model:updateOutput(input, target) 54 | w[i] = w[i] - 2*eps 55 | local outputM = model:updateOutput(input, target) 56 | w[i] = w[i] + eps 57 | gradParams2[i] = scale * (outputP - outputM) / (2*eps) 58 | end 59 | 60 | return (gradParams - gradParams2):abs():max() 61 | end 62 | 63 | for i = 1,100 do 64 | print("Iteration " .. 
i) 65 | local n_clusters = torch.random(10) 66 | local n_class = torch.random(50) + n_clusters - 1 67 | local mapping = {} 68 | local n_class_in_cluster = {} 69 | for i = 1, n_class do 70 | local cluster = torch.random(n_clusters) 71 | n_class_in_cluster[cluster] = n_class_in_cluster[cluster] or 0 72 | n_class_in_cluster[cluster] = n_class_in_cluster[cluster] + 1 73 | mapping[i] = {cluster, n_class_in_cluster[cluster]} 74 | end 75 | for i = 1,n_clusters do 76 | if n_class_in_cluster[i] == nil then 77 | n_class_in_cluster[i] = 1 78 | mapping[1+#mapping] = {i, 1} 79 | n_class = n_class + 1 80 | end 81 | end 82 | local input_size = torch.random(100) + 1 83 | local model = nn.HSM(mapping, input_size) 84 | 85 | local input = torch.randn(input_size) 86 | local target = torch.LongTensor(1) 87 | target[1] = torch.random(n_class) 88 | local err = test_finite_diff_gradInput(model, input, target) 89 | assert(err < 1e-2) 90 | err = test_finite_diff_accGrads(model, input, target) 91 | assert(err < 1e-2) 92 | local scale = torch.rand(1)[1] 93 | err = test_finite_diff_accGrads(model, input, target, scale) 94 | assert(err < 1e-2) 95 | 96 | local batch_size = torch.random(10) 97 | input = torch.randn(batch_size, input_size) 98 | target = torch.LongTensor(batch_size) 99 | for i = 1, batch_size do 100 | target[i] = torch.random(n_class) 101 | end 102 | err = test_finite_diff_gradInput(model, input, target); 103 | assert(err < 1e-2) 104 | err = test_finite_diff_accGrads(model, input, target) 105 | assert(err < 1e-2) 106 | err = test_finite_diff_accGrads(model, input, target, scale) 107 | assert(err < 1e-2) 108 | 109 | -- test directUpdate 110 | local w, dw = model:getParameters() 111 | dw:normal() 112 | local initdw = dw:clone() 113 | model:updateOutput(input, target) 114 | model:updateGradInput(input, target) 115 | model:accGradParameters(input, target, scale, false) 116 | local w1 = w:clone():add(dw) 117 | model:updateOutput(input, target) 118 | model:updateGradInput(input, target) 119 | model:accGradParameters(input, target, scale, true) 120 | w:add(initdw) 121 | err = w:add(-1, w1):abs():max() 122 | assert(err < 1e-5) 123 | end 124 | -------------------------------------------------------------------------------- /test/test_LinearNB.lua: -------------------------------------------------------------------------------- 1 | require 'nn' 2 | require 'cunn' 3 | require 'fbtorch' 4 | require 'fbcunn' 5 | 6 | local mytester = torch.Tester() 7 | local LinearNBTest = {} 8 | local jac = nn.Jacobian 9 | 10 | local precision = 1e-5 11 | 12 | function testAll(targettype) 13 | targettype = targettype or 'torch.DoubleTensor' 14 | local ini = math.random(3,5) 15 | local inj_vals = {math.random(3,5), 1} -- Also test the inj = 1 spatial case 16 | local input = torch.Tensor(ini):zero():type(targettype) 17 | 18 | for ind, inj in pairs(inj_vals) do 19 | local module = nn.LinearNB(ini, inj) 20 | if targettype == 'torch.CudaTensor' then 21 | module = module:cuda() 22 | end 23 | 24 | -- 1D 25 | local err = jac.testJacobian(module, input) 26 | mytester:assertlt(err, precision, 'error on state ') 27 | 28 | local err = jac.testJacobianParameters(module, input, module.weight, 29 | module.gradWeight) 30 | mytester:assertlt(err, precision, 'error on weight ') 31 | 32 | local err = jac.testJacobianUpdateParameters(module, input, module.weight) 33 | mytester:assertlt(err, precision, 'error on weight [direct update] ') 34 | 35 | for t,err in pairs(jac.testAllUpdate(module, input, 36 | 'weight', 'gradWeight')) do 37 | 
mytester:assertlt(err, precision, string.format( 38 | 'error on weight [%s]', t)) 39 | end 40 | 41 | -- 2D 42 | local nframe = math.random(50,70) 43 | local input = torch.Tensor(nframe, ini):zero():type(targettype) 44 | 45 | local err = jac.testJacobian(module,input) 46 | mytester:assertlt(err, precision, 'error on state ') 47 | 48 | local err = jac.testJacobianParameters(module, input, module.weight, 49 | module.gradWeight) 50 | mytester:assertlt(err,precision, 'error on weight ') 51 | 52 | local err = jac.testJacobianUpdateParameters(module, input, module.weight) 53 | mytester:assertlt(err,precision, 'error on weight [direct update] ') 54 | 55 | for t,err in pairs(jac.testAllUpdate(module, input, 56 | 'weight', 'gradWeight')) do 57 | mytester:assertlt(err, precision, string.format( 58 | 'error on weight [%s]', t)) 59 | end 60 | 61 | -- IO 62 | local ferr,berr = jac.testIO(module, input) 63 | mytester:asserteq(ferr, 0, torch.typename(module) 64 | .. ' - i/o forward err ') 65 | mytester:asserteq(berr, 0, torch.typename(module) 66 | .. ' - i/o backward err ') 67 | end 68 | end 69 | 70 | function LinearNBTest.testDouble() 71 | testAll() 72 | end 73 | 74 | 75 | mytester:add(LinearNBTest) 76 | mytester:run() 77 | -------------------------------------------------------------------------------- /test/test_OneBitDataParallel.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | require('fbcunn') 4 | require('fbnn') 5 | local TU = require('test.test_Util') 6 | local fboptim = require('fboptim') 7 | 8 | local function dp() 9 | return nn.OneBitDataParallel( 10 | 1, 11 | {momentum_rate=1.0, adagrad_learning_rate=1.0, min_elements=20} 12 | ) 13 | end 14 | 15 | 16 | function testDataParallelRunsForwardPass() 17 | local sim = TU.Sim { 18 | num_hidden = 2, 19 | output_width = 1, 20 | hidden_width = 512, 21 | input_width = 32, 22 | num_columns = 4, 23 | } 24 | 25 | local model, columns = sim:build_data_parallel(dp()) 26 | local inputs, _ = sim:gen_wide_example() 27 | local outputs = model:forward(inputs) 28 | 29 | for column_id = 1, sim.opts.num_columns do 30 | local column_input = sim:get_narrowed_input(inputs, column_id):double() 31 | print(column_input:size()) 32 | local gpu_output = outputs[{ {column_id} }] 33 | local cpu_output = columns[column_id]:forward(column_input) 34 | 35 | local norm_delta = TU.tensor_norm_difference(gpu_output, cpu_output) 36 | 37 | print(column_input:size(), gpu_output:size(), cpu_output:size()) 38 | print(gpu_output:norm(), cpu_output:norm()) 39 | assertTrue(norm_delta < 1E-5) 40 | end 41 | end 42 | 43 | function testDataParallelOnForwardPassIsEquivalentToSeparateColumns() 44 | local sim = TU.Sim { 45 | num_hidden = 2, 46 | output_width = 1, 47 | hidden_width = 512, 48 | input_width = 32, 49 | num_columns = 4, 50 | } 51 | 52 | local model, columns = sim:build_data_parallel(dp()) 53 | local inputs, _ = sim:gen_wide_example() 54 | local outputs = model:forward(inputs) 55 | 56 | for column_id = 1, sim.opts.num_columns do 57 | local column_input = sim:get_narrowed_input(inputs, column_id):double() 58 | print(column_input:size()) 59 | local gpu_output = outputs[{ {column_id} }] 60 | local cpu_output = columns[column_id]:forward(column_input) 61 | 62 | local norm_delta = 63 | TU.tensor_norm_difference(gpu_output, cpu_output) 64 | 65 | print(column_input:size(), gpu_output:size(), cpu_output:size()) 66 | print(gpu_output:norm(), cpu_output:norm()) 67 | assertTrue(norm_delta < 1E-5) 68 | end 69 | 
end 70 | 71 | function testDataParallelOnOptimLearns() 72 | local sim = TU.Sim { 73 | num_hidden = 1, 74 | output_width = 1, 75 | hidden_width = 500, 76 | input_width = 5, 77 | num_columns = 4, 78 | num_opt_rounds = 2, 79 | } 80 | 81 | local optim_state = { 82 | learningRate = 1e-1, 83 | weightDecay = 1e-4, 84 | momentum = 0.9, 85 | learningRateDecay = 1e-7 86 | } 87 | 88 | local model, _columns = sim:build_data_parallel(dp()) 89 | local opt = nn.Optim(model, optim_state) 90 | local criterion = nn.MSECriterion():cuda() 91 | 92 | for round = 1,sim.opts.num_opt_rounds do 93 | local inputs, targets = sim:gen_wide_example() 94 | local _outputs = model:forward(inputs) 95 | opt:optimize(fboptim.sgd, inputs, targets, criterion) 96 | local out = model:forward(inputs) 97 | print(out) 98 | local err = criterion:forward(out, targets) 99 | print(round, err) 100 | end 101 | end 102 | 103 | LuaUnit:main() 104 | -------------------------------------------------------------------------------- /test/test_OneBitQuantization.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require('fb.luaunit') 4 | require('fbcunn') 5 | 6 | local num_tries = 10 7 | 8 | function testExactQuantization() 9 | for tries = 1, num_tries do 10 | local q = nn.OneBitQuantization() 11 | local t = torch.CudaTensor(torch.random(50), torch.random(50)) 12 | 13 | print('Quantizing ' .. t:size(1) .. ' x ' .. t:size(2)) 14 | 15 | -- We will get exact quantization if there is only one positive 16 | -- and one negative value in each row. 17 | for row = 1, t:size(1) do 18 | local pos_value = torch.uniform(10) 19 | local neg_value = -torch.uniform(10) 20 | 21 | for col = 1, t:size(2) do 22 | local val = pos_value 23 | 24 | if torch.bernoulli(0.5) == 0 then 25 | val = neg_value 26 | end 27 | 28 | t[row][col] = val 29 | end 30 | end 31 | 32 | local quantized = q:quantize(t) 33 | local dequantized = 34 | q:dequantize(quantized, q.avg_pos, q.avg_neg, t:size(2)) 35 | 36 | assertTrue((dequantized:float() - t:float()):abs():max() < 1e-5) 37 | end 38 | end 39 | 40 | function testErrorDecaysToZero() 41 | for tries = 1, num_tries do 42 | -- In order to show that quantization error works, we should be 43 | -- able to send a matrix via quantization, and then successfully 44 | -- send the zero matrix. 45 | -- For each successive pass, the quantization error should diminish 46 | -- and on the receiving side, we should get something that approximates 47 | -- the original matrix. 48 | local q = nn.OneBitQuantization() 49 | 50 | -- Send two matrices 51 | local orig1 = 52 | torch.randn(10 + torch.random(30), 10 + torch.random(30)):cuda() 53 | local orig2 = torch.randn(orig1:size(1), orig1:size(2)):cuda() 54 | 55 | -- This is the signal that we wish to approximate 56 | local orig = orig1:float() + orig2:float() 57 | 58 | print('Quantizing ' .. orig:size(1) .. ' x ' .. orig:size(2)) 59 | 60 | -- pass `orig1` 61 | local quantized = q:quantize(orig1) 62 | local dequantized = 63 | q:dequantize(quantized, q.avg_pos, q.avg_neg, orig1:size(2)) 64 | 65 | -- dequantized will become the approximation to `orig` 66 | local approx = dequantized:float() 67 | 68 | -- pass `orig2` 69 | quantized = q:quantize(orig2) 70 | dequantized = 71 | q:dequantize(quantized, q.avg_pos, q.avg_neg, orig2:size(2)) 72 | approx:add(dequantized:float()) 73 | 74 | -- Now, after sending some signal, we will pass 0 a couple of times, in 75 | -- order to flush the quantization error. 
The number of passes required is 76 | -- related to the size of the original matrix and is also dependent upon 77 | -- floating point precision. 78 | local zeros = torch.CudaTensor(orig:size(1), orig:size(2)) 79 | zeros:zero() 80 | 81 | for passes = 1, 100 do 82 | quantized = q:quantize(zeros) 83 | dequantized = 84 | q:dequantize(quantized, q.avg_pos, q.avg_neg, zeros:size(2)) 85 | 86 | approx:add(dequantized:float()) 87 | end 88 | 89 | assertTrue((orig:float() - approx):abs():max() < 5e-4) 90 | end 91 | end 92 | 93 | LuaUnit:main() 94 | -------------------------------------------------------------------------------- /test/test_OneBitSGD.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | require('cutorch') 4 | require('cunn') 5 | require('fbcunn') 6 | 7 | local TU = require('test.test_Util') 8 | local pl = require('pl.import_into')() 9 | local _fbd = require('fb.debugger') 10 | local OBSGD = require('fbcunn.OneBitSGD') 11 | 12 | torch.setdefaulttensortype('torch.CudaTensor') 13 | 14 | function testQuantizerOnSimpleExample() 15 | local gradient = torch.Tensor({{1}, {-1}}) 16 | local accumulated = torch.Tensor():typeAs(gradient):resizeAs(gradient) 17 | local quantizer = nn.OneBitQuantization() 18 | local quantized, avg_pos, avg_neg = 19 | OBSGD.quantize_gradient( 20 | gradient, quantizer, accumulated) 21 | TU.assertTensorEquals(quantizer.quantization_error, quantizer.quantization_error:clone():zero()) 22 | TU.assertTensorEquals(avg_pos, torch.Tensor({{1}, {0}})) 23 | TU.assertTensorEquals(avg_neg, torch.Tensor({{0}, {-1}})) 24 | end 25 | 26 | function testQuantizationReducesNormOfMatrix() 27 | for _ = 1,50 do 28 | local gradient = torch.Tensor(100, 20):normal() 29 | local accumulated = torch.Tensor():typeAs(gradient):resizeAs(gradient) 30 | local quantizer = nn.OneBitQuantization() 31 | 32 | OBSGD.quantize_gradient( 33 | gradient, quantizer, accumulated) 34 | assertTrue(gradient:norm() > quantizer.quantization_error:norm()) 35 | end 36 | end 37 | 38 | local function build_agg() 39 | return OBSGD.OneBitAggregator( 40 | {momentum_rate=1.0, adagrad_learning_rate=1.0}, 41 | function() return torch.Tensor(5, 5):zero() end, 42 | function(dst, src) dst:copy(src) end, 43 | 1 44 | ) 45 | end 46 | 47 | 48 | function testOBSGDSmoothing() 49 | local agg = build_agg() 50 | 51 | local smoothed = 52 | agg:_smooth_gradient(agg.gradient_tensor_factory():fill(1)) 53 | TU.assertTensorEquals(smoothed, agg.gradient_tensor_factory():fill(1)) 54 | end 55 | 56 | function testOBSGDAveraging() 57 | local agg = build_agg() 58 | local num_columns = 5 59 | local gradients = pl.List.range(num_columns):map( 60 | function(i) return agg.gradient_tensor_factory():fill(i) end) 61 | local averaged_gradients = agg:_accumulate_quantized_gradients(gradients) 62 | TU.assertTensorEquals( 63 | averaged_gradients, 64 | agg.gradient_tensor_factory():fill((num_columns+1) / 2) 65 | ) 66 | end 67 | 68 | function testOBSGDAggregation() 69 | local agg = build_agg() 70 | 71 | local num_columns = 5 72 | local gradients = pl.List.range(num_columns):map( 73 | function(i) return agg.gradient_tensor_factory():fill(i) end) 74 | 75 | local averaged_gradients = agg:_accumulate_quantized_gradients(gradients) 76 | TU.assertTensorEquals( 77 | averaged_gradients, 78 | agg.gradient_tensor_factory():fill((num_columns+1) / 2) 79 | ) 80 | end 81 | 82 | 83 | function testOBSGDEndToEnd() 84 | local agg = build_agg() 85 | 86 | local num_columns = 5 87 | local gradients = 
pl.List.range(num_columns):map( 88 | function(i) return agg.gradient_tensor_factory():fill(i) end) 89 | 90 | local gradients_to_run = gradients:map(function(t) return t:clone() end) 91 | 92 | agg:run(gradients_to_run) 93 | 94 | TU.assertTensorEquals( 95 | agg.home_quantizer.quantization_error, 96 | agg.gradient_tensor_factory():zero() 97 | ) 98 | TU.assertTensorEquals( 99 | agg.adagrad_history, 100 | agg.gradient_tensor_factory():fill(9) 101 | ) 102 | TU.assertTensorEquals( 103 | agg.momentum_history, 104 | agg.gradient_tensor_factory():fill(1) 105 | ) 106 | 107 | -- Gradients get quantized to zero 108 | local after_expected = agg.gradient_tensor_factory():fill(1) 109 | for _, t in ipairs(pl.tablex.zip(gradients, gradients_to_run)) do 110 | local _before, after = table.unpack(t) 111 | TU.assertTensorEquals(after, after_expected) 112 | end 113 | end 114 | 115 | function testMomentumWorks() 116 | local gradient = torch.Tensor({1}) 117 | local momentum_history = torch.Tensor({50}) 118 | local momentum_rate = 5.0 119 | local new_gradient = 120 | OBSGD.momentum(gradient, momentum_rate, momentum_history) 121 | TU.assertTensorAlmostEquals(new_gradient, torch.Tensor({251})) 122 | end 123 | 124 | LuaUnit:main() 125 | -------------------------------------------------------------------------------- /test/test_SequentialCriterion.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'fb.luaunit' 5 | require 'nn' 6 | require 'fbcunn' 7 | require 'fbnn' 8 | 9 | local test_repeats = 100 10 | 11 | local function testSequentialCriterion_run(input_size, n_classes, 12 | module, crit, targettype) 13 | module = module:clone() 14 | crit = crit:clone() 15 | local modcrit = nn.SequentialCriterion(module:clone(), crit:clone()) 16 | targettype = targettype or torch.Tensor():type() 17 | 18 | local batch_size = torch.random(100) 19 | local input = torch.rand(batch_size, input_size) 20 | local target = 21 | torch.rand(batch_size):mul(n_classes):add(1):floor():type(targettype) 22 | 23 | local output1 = modcrit:forward(input, target) 24 | local z2 = module:forward(input) 25 | local output2 = crit:forward(z2, target) 26 | assertTrue(math.abs(output1-output2) < 1e-5) 27 | 28 | local gradInput1 = modcrit:updateGradInput(input, target) 29 | local derr_do2 = crit:updateGradInput(z2, target) 30 | local gradInput2 = module:updateGradInput(input, derr_do2) 31 | assertTrue(gradInput1:clone():add(-1, gradInput2):abs():max() < 1e-5) 32 | 33 | modcrit:zeroGradParameters() 34 | module:zeroGradParameters() 35 | if crit.zeroGradParameters then 36 | crit:zeroGradParameters() 37 | end 38 | modcrit:accGradParameters(input, target) 39 | if crit.accGradParameters then 40 | crit:accGradParameters(z2, target) 41 | end 42 | module:accGradParameters(input, derr_do2) 43 | modcrit:updateParameters(1) 44 | if crit.updateParameters then 45 | crit:updateParameters(1) 46 | end 47 | module:updateParameters(1) 48 | local output1 = modcrit:forward(input, target) 49 | local z2 = module:forward(input) 50 | local output2 = crit:forward(z2, target) 51 | assertTrue(math.abs(output1-output2) < 1e-5) 52 | end 53 | 54 | local function make_HSM(n_clusters, n_class, input_size) 55 | local mapping = {} 56 | local n_class_in_cluster = {} 57 | for i = 1, n_class do 58 | local cluster = torch.random(n_clusters) 59 | n_class_in_cluster[cluster] = n_class_in_cluster[cluster] or 0 60 | n_class_in_cluster[cluster] = 
n_class_in_cluster[cluster] + 1 61 | mapping[i] = {cluster, n_class_in_cluster[cluster]} 62 | end 63 | for i = 1,n_clusters do 64 | if n_class_in_cluster[i] == nil then 65 | n_class_in_cluster[i] = 1 66 | mapping[1+#mapping] = {i, 1} 67 | n_class = n_class + 1 68 | end 69 | end 70 | return nn.HSM(mapping, input_size) 71 | end 72 | 73 | function testSequentialCriterion() 74 | for i = 1, test_repeats do 75 | -- try with NLL 76 | local input_size = torch.random(200) 77 | local n_classes = torch.random(200) 78 | local module = nn.Linear(input_size, n_classes) 79 | local crit = nn.ClassNLLCriterion() 80 | testSequentialCriterion_run(input_size, n_classes, module, 81 | crit, 'torch.LongTensor') 82 | 83 | -- try with HSM 84 | local input1_size = torch.random(200) 85 | local input2_size = torch.random(200) 86 | local n_classes = torch.random(200) 87 | local module = nn.Sequential() 88 | module:add(nn.Linear(input1_size, input2_size)) 89 | module:add(nn.Threshold()) 90 | local crit = make_HSM(20, n_classes, input2_size) 91 | testSequentialCriterion_run(input1_size, n_classes, module, 92 | crit, 'torch.LongTensor') 93 | end 94 | end 95 | 96 | LuaUnit:main() 97 | -------------------------------------------------------------------------------- /test/test_SparseNLLCriterion.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | -- Author: Michael Mathieu 3 | 4 | require 'fb.luaunit' 5 | require 'cutorch' 6 | require 'nn' 7 | require 'fbcunn' 8 | require 'fbnn' 9 | 10 | local test_repeats = 5 11 | 12 | local function test_finite_diff_gradInput(model, input, target) 13 | local eps = 1e-3 14 | local output = model:updateOutput(input, target) 15 | local gradInput = model:updateGradInput(input, target):clone() 16 | 17 | local gradInput2 = torch.Tensor(input:size()) 18 | local outputP = torch.Tensor(output:size()) 19 | local outputM = torch.Tensor(output:size()) 20 | if input:dim() == 1 then 21 | for i = 1,input:size(1) do 22 | input[i] = input[i] + eps 23 | outputP:copy(model:updateOutput(input, target)) 24 | input[i] = input[i] - 2*eps 25 | outputM:copy(model:updateOutput(input, target)) 26 | input[i] = input[i] + eps 27 | gradInput2[i] = outputP:add(-1, outputM):div(2*eps) 28 | end 29 | else 30 | assert(input:dim() == 2) 31 | for i = 1,input:size(1) do 32 | for j = 1,input:size(2) do 33 | input[i][j] = input[i][j] + eps 34 | outputP:copy(model:updateOutput(input, target)) 35 | input[i][j] = input[i][j] - 2*eps 36 | outputM:copy(model:updateOutput(input, target)) 37 | input[i][j] = input[i][j] + eps 38 | gradInput2[i][j] = outputP:add(-1, outputM):div(2*eps) 39 | end 40 | end 41 | end 42 | gradInput2 = gradInput2:type(input:type()) 43 | return gradInput:add(-1, gradInput2):abs():max() 44 | end 45 | 46 | local function test_sparseNLL(K, n_classes, batch_size, cuda) 47 | local crit = nn.SparseNLLCriterion(K) 48 | local input = torch.randn(batch_size, n_classes) 49 | local targetP = torch.randn(batch_size, K):abs() 50 | local targetIdx = torch.LongTensor(batch_size, K) 51 | if cuda then 52 | crit = crit:cuda() 53 | input = input:cuda() 54 | targetP = targetP:cuda() 55 | targetIdx = torch.CudaTensor(targetIdx:size()):copy(targetIdx) 56 | end 57 | for i = 1, batch_size do 58 | targetP[i]:div(targetP[i]:sum()) 59 | local p = torch.randperm(n_classes) 60 | targetIdx[i]:copy(p[{{1,K}}]) 61 | end 62 | -- fprop 63 | local output_test = 0 64 | for i = 1, batch_size do 65 | for j = 1, K do 66 | output_test = output_test - 
input[i][targetIdx[i][j] ] * targetP[i][j] 67 | end 68 | end 69 | output_test = output_test / batch_size 70 | local fprop_err = 71 | math.abs(output_test - crit:forward(input, {targetP, targetIdx})[1]) 72 | 73 | --bprop 74 | local bprop_err = 75 | test_finite_diff_gradInput(crit, input, {targetP, targetIdx}) 76 | 77 | return fprop_err, bprop_err 78 | end 79 | 80 | function testSparseNLLCriterion() 81 | for k = 1, test_repeats do 82 | local n_classes = torch.random(100) 83 | local K = torch.random(n_classes) 84 | local batch_size = torch.random(32) 85 | local err1, err2 = test_sparseNLL(K, n_classes, batch_size, false) 86 | assertTrue(err1 < 1e-3) 87 | assertTrue(err2 < 1e-3) 88 | local err1, err2 = test_sparseNLL(K, n_classes, batch_size, true) 89 | assertTrue(err1 < 1e-3) 90 | assertTrue(err2 < 1e-3) 91 | end 92 | end 93 | 94 | LuaUnit:main() 95 | -------------------------------------------------------------------------------- /test/test_Util.lua: -------------------------------------------------------------------------------- 1 | require('fb.luaunit') 2 | require('fbtorch') 3 | require('cunn') 4 | require('optim') 5 | 6 | local M = {} 7 | 8 | local pl = require('pl.import_into')() 9 | 10 | function M.tensor_norm_difference(l, r) 11 | return math.abs(l:norm() - r:norm()) 12 | end 13 | 14 | function M.assertTensorEquals(a, b) 15 | assertEquals(0, (a:clone():add(b:clone():mul(-1))):abs():sum()) 16 | end 17 | 18 | function M.assertTensorAlmostEquals(a, b, eps) 19 | assertTrue((a:clone():add(b:clone():mul(-1))):norm() < (eps or 1E-10)) 20 | end 21 | 22 | local Sim = pl.class() 23 | M.Sim = Sim 24 | 25 | function Sim:_init(opts) 26 | self.opts = opts 27 | end 28 | 29 | function Sim:build_column() 30 | local seq = nn.Sequential() 31 | local pred = self.opts.input_width 32 | for i = 1,self.opts.num_hidden do 33 | seq:add(nn.Linear(pred, self.opts.hidden_width)) 34 | seq:add(nn.Tanh()) 35 | pred = self.opts.hidden_width 36 | end 37 | seq:add(nn.Linear(self.opts.hidden_width, self.opts.output_width)) 38 | seq:add(nn.Tanh()) 39 | return seq 40 | end 41 | 42 | function Sim:build_data_parallel(dp) 43 | local num_gpus = cutorch.getDeviceCount() 44 | local columns = {} 45 | 46 | for column_id = 1,self.opts.num_columns do 47 | local gpu_id = column_id % num_gpus 48 | if gpu_id == 0 then gpu_id = num_gpus end 49 | print(gpu_id) 50 | cutorch.withDevice( 51 | gpu_id, 52 | function() 53 | local column = self:build_column() 54 | table.insert(columns, column:clone()) 55 | dp:add(column:clone(), gpu_id) 56 | end 57 | ) 58 | end 59 | return dp:cuda(), columns 60 | end 61 | 62 | function Sim:target_function(x) 63 | -- admittedly tough for us to learn, but hey. 
64 | local retval = torch.Tensor(self.opts.output_width) 65 | local sum = x:sum() 66 | retval[1] = math.sin(sum) 67 | return retval 68 | end 69 | 70 | function Sim:gen_wide_input() 71 | return torch.randn(self.opts.input_width * self.opts.num_columns) 72 | end 73 | 74 | function Sim:get_narrowed_input_range(i) 75 | assert(type(i) == 'number') 76 | local range_start = 1 + ((i - 1) * self.opts.input_width) 77 | local range_end = range_start + (self.opts.input_width) - 1 78 | return range_start, range_end 79 | end 80 | 81 | function Sim:get_narrowed_input(input, i) 82 | assert(torch.typename(input)) 83 | assert(type(i) == 'number') 84 | return input[{ {self:get_narrowed_input_range(i)} }] 85 | end 86 | 87 | function Sim:gen_wide_example() 88 | local samp = self:gen_wide_input() 89 | local retval = torch.Tensor(self.opts.output_width * self.opts.num_columns) 90 | for i = 1,self.opts.num_columns do 91 | retval[i] = self:target_function(self:get_narrowed_input(samp, i)) 92 | end 93 | return samp:cuda(), retval:cuda() 94 | end 95 | 96 | return M 97 | -------------------------------------------------------------------------------- /test/test_WeightedLookupTable.lua: -------------------------------------------------------------------------------- 1 | -- Copyright 2004-present Facebook. All Rights Reserved. 2 | 3 | require('fbtorch') 4 | require('fb.luaunit') 5 | require('fbcunn') 6 | 7 | require('nn') 8 | 9 | local function all(tensor) 10 | return torch.sum(torch.ne(tensor, 0)) == tensor:numel() 11 | end 12 | 13 | local function almost_equal(t1, t2, tol) 14 | return torch.lt(torch.abs(t1 - t2), tol) 15 | end 16 | 17 | -- w = weighted 18 | -- u = unweighted 19 | -- e.g. 20 | -- wlut = weighted lookup table 21 | -- ulut = unweighted lookup table 22 | 23 | function test_WeightedLookupTable_forward() 24 | local embedding_dim = 4 25 | local table_size = 30 26 | local input_length = 9 27 | local tol = 1e-8 28 | 29 | local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() 30 | local ulut = nn.LookupTable(table_size, embedding_dim):cuda() 31 | ulut.weight:copy(wlut.weight) 32 | assert(all(torch.eq(wlut.weight, ulut.weight))) 33 | 34 | local uinput = torch.rand(input_length):mul(table_size):ceil() 35 | local weights = torch.rand(input_length, 1) 36 | local winput = torch.cat(uinput, weights, 2) 37 | 38 | local woutput = wlut:forward(winput:cuda()) 39 | local uoutput = ulut:forward(uinput:cuda()) 40 | weights = weights:cuda() 41 | local expected_woutput = torch.cmul(uoutput, weights:expandAs(uoutput)) 42 | 43 | assert(all(almost_equal(woutput:float(), expected_woutput:float(), tol))) 44 | end 45 | 46 | function test_WeightedLookupTable_accGradParameters() 47 | local embedding_dim = 4 48 | local table_size = 30 49 | local input_length = 9 50 | local tol = 1e-5 51 | 52 | local wlut = nn.WeightedLookupTable(table_size, embedding_dim):cuda() 53 | local ulut = nn.LookupTable(table_size, embedding_dim):cuda() 54 | ulut.weight:copy(wlut.weight) 55 | assert(all(torch.eq(wlut.weight, ulut.weight))) 56 | 57 | local uinput = torch.rand(input_length):mul(table_size):ceil() 58 | local weights = torch.range(1, input_length):reshape(input_length, 1) 59 | local winput = torch.cat(uinput, weights, 2) 60 | 61 | winput = winput:cuda() 62 | uinput = uinput:cuda() 63 | local woutput = wlut:forward(winput) 64 | local uoutput = ulut:forward(uinput) 65 | 66 | local wgradOutput = torch.randn(woutput:size()) 67 | local ugradOutput = torch.cmul(wgradOutput, weights:expandAs(wgradOutput)) 68 | 69 | 
wlut:accGradParameters(winput, wgradOutput:cuda(), 1) 70 | ulut:accGradParameters(uinput, ugradOutput:cuda(), 1) 71 | 72 | assert(all(almost_equal(wlut.gradWeight:float(), ulut.gradWeight:float(), tol))) 73 | end 74 | 75 | 76 | LuaUnit:main() 77 | --------------------------------------------------------------------------------
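A note on the one-bit scheme the tests above exercise: the error-feedback idea behind test_OneBitQuantization.lua and test_OneBitSGD.lua can be sketched in a few lines of plain torch. The sketch below is illustrative only, assuming per-row one-bit quantization that dequantizes positive entries to the row's average positive value and non-positive entries to the average negative value, as the tests describe; dequantizedRow and the surrounding loop are hypothetical helpers, not fbcunn API.

require 'torch'

-- Dequantize a row as one-bit quantization would: every positive entry
-- becomes the row's average positive value, every non-positive entry the
-- row's average non-positive value.
local function dequantizedRow(row)
   local posMask = row:gt(0)
   local negMask = row:le(0)
   local out = row:clone()
   if posMask:sum() > 0 then out[posMask] = row[posMask]:mean() end
   if negMask:sum() > 0 then out[negMask] = row[negMask]:mean() end
   return out
end

local rows, cols = 4, 8
local orig = torch.randn(rows, cols)      -- the signal we want to transmit
local err = torch.zeros(rows, cols)       -- carried quantization error
local received = torch.zeros(rows, cols)  -- receiver's running sum

-- Send `orig` once, then keep sending zeros; folding the carried error
-- into each payload lets the receiver's running sum approach `orig`.
for pass = 1, 50 do
   local payload = (pass == 1) and orig:clone() or torch.zeros(rows, cols)
   payload:add(err)
   for r = 1, rows do
      local deq = dequantizedRow(payload[r])
      err[r]:copy(payload[r] - deq)       -- remember what quantization lost
      received[r]:add(deq)
   end
end

-- The residual shrinks as the carried error is flushed over the passes,
-- which is the property testErrorDecaysToZero asserts.
print(('max abs residual: %g'):format((received - orig):abs():max()))

Quantizing the carried error together with each new payload is what makes the receiver's accumulated sum converge to the original signal even though each individual message is only one bit per entry plus two per-row averages.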