├── .gitignore ├── .gitmodules ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── doc ├── ContributorGuidelines.md ├── ImplementedDetails.md ├── MigrateStatusByFile.md ├── OlderChanges.md ├── img │ ├── gpupipelinemultiple.png │ ├── reduceall_pipelinestall.png │ └── singlegpuoperation.png └── torch │ ├── cuda-to-opencl.md │ ├── cutorch-apply.md │ └── torch.md ├── rocks └── cltorch-scm-1.rockspec └── src ├── CMakeLists.txt ├── FFI.lua ├── Random.lua ├── Storage.cpp ├── Tensor.cpp ├── Tensor.lua ├── TensorMath.lua ├── TensorOperator.c ├── Test.lua ├── UserKernel.cpp ├── UserKernel.h ├── cmake ├── build_EasyCL.cmake └── build_clBLAS.cmake ├── init.cpp ├── init.lua ├── lib ├── CMakeLists.txt ├── THCl.h ├── THClApply.cl ├── THClApply.cpp ├── THClApply.h ├── THClBlas.cpp ├── THClBlas.h ├── THClDeviceUtils.cl ├── THClDeviceUtils.cpp ├── THClDeviceUtils.h ├── THClGather.cl ├── THClGather.cpp ├── THClGeneral.cpp ├── THClGeneral.h ├── THClKernels.cpp ├── THClKernels.h ├── THClOperators.h ├── THClReduce.cl ├── THClReduce.cpp ├── THClReduce.h ├── THClReduceAll.cl ├── THClReduceAll.cpp ├── THClReduceAll.h ├── THClReduceApplyUtils.cl ├── THClReduceApplyUtils.cpp ├── THClReduceApplyUtils.h ├── THClScatter.cl ├── THClScatter.cpp ├── THClStorage.cpp ├── THClStorage.h ├── THClStorageCopy.cpp ├── THClStorageCopy.h ├── THClStorageGet.cl ├── THClStorageSet.cl ├── THClTensor.cpp ├── THClTensor.h ├── THClTensorCopy.cpp ├── THClTensorCopy.h ├── THClTensorIndex.cl ├── THClTensorIndex.cpp ├── THClTensorInfoCl.cl ├── THClTensorMasked.cl ├── THClTensorMasked.cpp ├── THClTensorMath.cpp ├── THClTensorMath.h ├── THClTensorMath2.cl ├── THClTensorMath2.cpp ├── THClTensorMathBlas.cpp ├── THClTensorMathCompare.cpp ├── THClTensorMathCompare.h ├── THClTensorMathCompareT.cpp ├── THClTensorMathPairwise.cpp ├── THClTensorMathPointwise.cpp ├── THClTensorMathPointwise.h ├── THClTensorMathScan.cl ├── THClTensorMathScan.cpp ├── THClTensorMathTransformReduce.cl ├── THClTensorMathTransformReduce.cpp ├── THClTypeParseTraits.cpp └── THClTypeParseTraits.h ├── test ├── run-test-device.sh ├── run-test-perf.sh ├── run-test-tensor.sh ├── test-device.lua ├── test-perf.lua ├── test-tensor.lua ├── test-zsh.zsh ├── test_userkernel.lua ├── unit_storage.lua └── unit_tensor.lua ├── torch ├── generic │ ├── Storage.cpp │ └── Tensor.cpp ├── utils.c └── utils.h ├── travis └── install-torch.sh └── util └── port.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | *_generated.* 7 | 8 | port/ 9 | /lua 10 | /luaT 11 | /TH 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | *.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | 36 | build/ 37 | share/ 38 | /share 39 | /back 40 | *~ 41 | 42 | generated/ 43 | non-templated/ 44 | templated-auto/ 45 | templates.manual/ 46 | 47 | build-*/ 48 | 49 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/EasyCL"] 2 | path = src/EasyCL 3 | url = https://github.com/hughperkins/EasyCL.git 4 | [submodule "src/clMathLibraries/clBLAS"] 5 | path = src/clMathLibraries/clBLAS 6 | url = https://github.com/hughperkins/clBLAS 7 | [submodule 
"src/boost-headers-lite"] 8 | path = src/boost-headers-lite 9 | url = https://github.com/hughperkins/boost-headers-lite 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: objective-c 2 | matrix: 3 | include: 4 | - env: OSX=10.11 5 | os: osx 6 | osx_image: osx10.11 7 | rvm: system 8 | 9 | before_install: 10 | - if [ -f ".git/shallow" ]; then travis_retry git fetch --unshallow; fi 11 | - whoami 12 | - pwd 13 | - bash src/travis/install-torch.sh 14 | - source ~/torch/install/bin/torch-activate 15 | - luajit -l torch -e 'print(torch.Tensor(3,2):uniform())' 16 | - mkdir ~/git 17 | - cd ~/git 18 | - cd ~/build/hughperkins/cltorch 19 | 20 | script: 21 | - cat ~/torch/install/bin/torch-activate 22 | - source ~/torch/install/bin/torch-activate 23 | - luarocks make rocks/cltorch-scm-1.rockspec 24 | - otool -L ~/torch/install/lib/lua/5.1/libcltorch.so 25 | - otool -l ~/torch/install/lib/lua/5.1/libcltorch.so | grep RPATH -A2 26 | - luajit -e 'require "cltorch"' 27 | - luajit -l cltorch -e "cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())" 28 | - 'luajit -e "require ''cltorch''; cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())"' 29 | - luajit -l cltorch -e "cltorch.setAllowNonGpus(1); props = cltorch.getDeviceProperties(1); for k,v in pairs(props) do print(k,v) end" 30 | - export TEST_EXCLUDES=test_blas,test_cumprod,test_cumsum,test_equals,test_indexcopy,test_indexfill,test_matrixwide,test_max2,test_mean,test_meanall,test_min2,test_norm,test_prod,test_prodall,test_sum_t,test_sumallt,test_reduceAll,test_sum,test_sum_t_offset,test_sumall 31 | - 'luajit -l cltorch -e "cltorch.setAllowNonGpus(1); cltorch.test()"' 32 | - zsh src/test/test-zsh.zsh 33 | - #zsh 34 | - #ps 35 | - #source ~/torch/install/bin/torch-activate 36 | - #luajit -e 'require "cltorch"' 37 | - #luajit -l cltorch -e "cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())" 38 | - #'luajit -e "require ''cltorch''; cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())"' 39 | - #luajit -l cltorch -e "cltorch.setAllowNonGpus(1); props = cltorch.getDeviceProperties(1); for k,v in pairs(props) do print(k,v) end" 40 | - #export TEST_EXCLUDES=test_blas,test_cumprod,test_cumsum,test_equals,test_indexcopy,test_indexfill,test_matrixwide,test_max2,test_mean,test_meanall,test_min2,test_norm,test_prod,test_prodall,test_sum_t,test_sumallt,test_reduceAll,test_sum,test_sum_t_offset,test_sumall 41 | 42 | notifications: 43 | email: 44 | on_success: never 45 | on_failure: never 46 | 47 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | OPTION(DEV_RUN_COG "cltorch maintainers only, otherwise set to 'OFF'." 
OFF)

if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
  set(ON_LINUX 1)
endif()
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
  set(ON_WINDOWS 1)
endif()

# https://cmake.org/Wiki/CMake_RPATH_handling
SET(CMAKE_INSTALL_RPATH "${Torch_INSTALL_LUA_CPATH_SUBDIR}/../..")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_MACOSX_RPATH TRUE)

#SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")

FIND_PACKAGE(Torch REQUIRED)

SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration")
SET(CMAKE_CXX_FLAGS "-std=c++0x -Wall")

SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/cmake")

if(UNIX)
  execute_process(COMMAND git --version RESULT_VARIABLE git_test ERROR_QUIET)
  if(${git_test} EQUAL 0)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
      message("using git setting USE_GIT 1")
      SET(USE_GIT 1)
    endif()
  endif()
endif()

if(UNIX)
  execute_process(COMMAND bash -c "${CMAKE_PREFIX_PATH}/bin/luarocks list distrocheck | grep distrocheck"
    RESULT_VARIABLE DISTROCHECK)
  if(${DISTROCHECK} EQUAL "1")
    message("CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}")
    execute_process(COMMAND bash -c "${CMAKE_PREFIX_PATH}/bin/luarocks list")
    message(FATAL_ERROR "
Please install cltorch from https://github.com/hughperkins/distro-cl")
  endif()
else() # windows?
  execute_process(COMMAND "luarocks list distrocheck | find /i \"distrocheck\""
    RESULT_VARIABLE DISTROCHECK)
  if(${DISTROCHECK} EQUAL "1")
    message("CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}")
    execute_process(COMMAND "luarocks list")
    message(FATAL_ERROR "
Please install cltorch from https://github.com/hughperkins/distro-cl")
  endif()
endif()

if(USE_GIT)
  message("using git")
  execute_process(COMMAND bash -c "echo $PWD"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
  execute_process(COMMAND git status
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
  execute_process(
    COMMAND git submodule update --init --force --recursive
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
  )
endif()

INCLUDE("${CMAKE_MODULE_PATH}/build_clBLAS.cmake")
INCLUDE_DIRECTORIES(${clBLAS_INCLUDE_DIRS})
#LINK_DIRECTORIES(${CLBLAS_LIBRARY_DIR})

INCLUDE("${CMAKE_MODULE_PATH}/build_EasyCL.cmake")
INCLUDE_DIRECTORIES(${EasyCL_INCLUDE_DIRS})
#LINK_DIRECTORIES(${EASYCL_LIBRARY_DIR})

ADD_SUBDIRECTORY(src)

INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src")
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src/lib")
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src/torch")

SET(src src/UserKernel.cpp src/init.cpp src/torch/utils.c src/Storage.cpp src/Tensor.cpp TensorMath.c
  src/TensorOperator.c)
SET(luasrc src/init.lua src/FFI.lua src/Tensor.lua src/Random.lua src/Test.lua src/test/unit_storage.lua src/test/unit_tensor.lua)

ADD_TORCH_WRAP(cltorchtensormathwrap src/TensorMath.lua)

ADD_TORCH_PACKAGE(cltorch "${src}" "${luasrc}")
#ADD_DEPENDENCIES(cltorch clBLAS)
#ADD_DEPENDENCIES(cltorch EasyCL)
ADD_DEPENDENCIES(cltorch EasyCL-external)
ADD_DEPENDENCIES(cltorchtensormathwrap EasyCL)
#add_custom_target(

execute_process(COMMAND git log -n 1 --pretty=%h OUTPUT_VARIABLE git_commit OUTPUT_STRIP_TRAILING_WHITESPACE)
#execute_process(COMMAND echo string
commit="${git_commit}" > ${CMAKE_CURRENT_SOURCE_DIR}/commit.h) 97 | file(GENERATE OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cltorch_commit_generated.h 98 | CONTENT "const char *cltorch_commit=\"${git_commit}\";\n" 99 | ) 100 | 101 | if(DEV_RUN_COG) 102 | add_custom_target( 103 | cog_cltorch 104 | python ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/cogapp/cog.py -q -I ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/cog-batteries -r ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/*.h 105 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 106 | ) 107 | add_dependencies(cltorch cog_cltorch) 108 | endif(DEV_RUN_COG) 109 | 110 | 111 | if(USE_GIT) 112 | add_custom_target( 113 | git_cltorch_rec 114 | git submodule update --init --force --recursive 115 | # COMMAND sed -i -e "s/-pedantic//" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 116 | # COMMAND sed -i -e "s/-Wall -Wextra/-w/" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 117 | # COMMAND sed -i -e "s/-Wstrict-prototypes/-w/" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 118 | # COMMAND sed -i -e "s/${CMAKE_CXX_FLAGS}/${CMAKE_CXX_FLAGS} -w /" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 119 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 120 | ) 121 | add_dependencies(cltorch git_cltorch_rec) 122 | add_dependencies(clBLAS-external git_cltorch_rec) 123 | add_dependencies(EasyCL-external git_cltorch_rec) 124 | endif() 125 | 126 | TARGET_LINK_LIBRARIES(cltorch luaT THCl) 127 | TARGET_LINK_LIBRARIES(cltorch EasyCL) 128 | TARGET_LINK_LIBRARIES(cltorch clBLAS) 129 | 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Hugh Perkins (Hugh Perkins) 2 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 3 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 4 | Copyright (c) 2011-2013 NYU (Clement Farabet) 5 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 6 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 7 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 8 | 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 20 | 21 | 3. Neither the names of NEC Laboratories American and IDIAP Research 22 | Institute nor the names of its contributors may be used to endorse or 23 | promote products derived from this software without specific prior 24 | written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/doc/ContributorGuidelines.md:
--------------------------------------------------------------------------------
# Contributor Guidelines

This doc describes some of the things I use as guidelines when writing cltorch.

Cutorch is an awesome, excellent high-performance implementation. Whenever a piece of functionality is available in cutorch, porting the cutorch implementation to cltorch tends to have several advantages:
* can integrate future improvements from cutorch
* development will tend to be much faster: probably at least 4-10 times faster to port something existing than to write it from scratch
* correctness will likely be high, without too many logical bugs
* performance will likely be reasonable

Some things are not directly portable. Notable examples are:
* Thrust
* CUDA templated kernels

Thrust is a CUDA-specific library. There is something similar for OpenCL, which is VexCL. For now, I prefer not to use VexCL, because it uses Boost, which, rightly or wrongly, I somehow feel is a bit of a sledgehammer to crack a nut, and really hard to build on Windows. My last experience with Boost was probably 11 years ago, so it might have changed :-P

So, generally, for Thrust, there are a few options I've used up till now:
* for the reduceall method, it turned out there was a Thrust-free implementation in the `goodies` branch of cutorch, so I ported that across, and it seems to work great :-)
* for the MSECriterion implementation in `clnn`, I simply wrote the operations on the Lua side, using the cltorch `pow` and similar implementations (see the sketch just below this list). I'm pretty sure performance will be ok
* Thrust is used all over the place, so some creativity will be required. Please get in touch if you come across a new situation, so we can discuss it together. I mean, you don't have to, but it could be a good idea :-)
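To make the `clnn` bullet concrete, here is a minimal sketch of the idea. This is an illustration, not the actual clnn code, so the function name and details are mine:

```
-- MSE forward computed purely from stock cltorch tensor ops, so no custom
-- kernel (and hence no Thrust port) is needed; each op runs as its own GPU kernel
local function mseForward(input, target)
   local diff = input - target                    -- elementwise subtract, on the GPU
   return diff:pow(2):sum() / input:nElement()    -- square in place, then reduce
end

local input = torch.Tensor(4, 3):uniform():cl()   -- FloatTensor -> ClTensor
local target = torch.Tensor(4, 3):uniform():cl()
print(mseForward(input, target))
```

The price is a few extra kernel launches and temporaries compared to a fused kernel, which for criterion-sized tensors is usually acceptable.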
CUDA C++ templates don't exist in OpenCL, at least not in the OpenCL 1.1 implementation I'm targeting. Interestingly, OpenCL kernels are compiled at runtime, which some might see as a disadvantage, but it actually gives us a lot of flexibility. And you can see that cltorch loads really quickly, whereas cutorch spends a while caching every possible kernel when loaded. This has good and bad points either way, really.

So... for the kernel templates, I quite like the compile-at-runtime approach. I'm using a Lua engine to instantiate the kernel templates for cltorch at runtime, at the point where they are first needed. They are reused across multiple calls. NVIDIA caches these compiled kernels to disk, where they can be reused almost immediately, even after restarting the process.

# Porting utilities

There is a Python script in `src/util` called 'port.py'. It can help do a first-cut port of files, or facilitate a meld of existing files. Run it as follows:
* in the parent directory of 'cltorch', clone the 'cutorch' repository
* currently, it expects the cutorch 'goodies2' branch to be cloned into a `cutorch-goodies2` directory, but obviously you can hack port.py a bit to change the exact directory
* from the cltorch directory, run `python src/util/port.py`
* A first-cut port of the files in ../cutorch-goodies2 will pop out in the `port` directory
  * .cuh files will become .h files
  * .cu files will become .cpp files
  * any kernels and kernel functions will plausibly be split into .cl files (with the same basename as the original .cu or .cuh file)

# Adding original functionality, not present in cutorch

For now, I haven't really come across this situation :-P The only brush I had with this was considering adding ClByteTensors, but for now, I've shelved the idea of implementing those initially in cltorch.

I think that, on the whole, for now, cutorch is the 'reference' implementation, and will probably remain so for a while. There is a whole team of incredibly talented, hard-working, motivated individuals maintaining and improving cutorch. For the foreseeable future, I think cltorch will be following cutorch, though you never know :-)

Therefore, on the whole, if I don't have to implement the original functionality myself, my recommendation would be: first add it to the cutorch side, then port it across to cltorch.

On the other hand, in fairness, if it was me, I'd probably do it on the cltorch side, and plausibly in a way totally unlikely to encourage a back-port into cutorch :-P So, anyway, if you want to implement something original in cltorch, perhaps you can discuss it with me, and on the torch7 newsgroup?

# Operator classes

On the subject of original functionality, or at least original implementations: in cutorch, operators, i.e. AddOp etc, are structs, which are injected directly into the nvcc compiler. In OpenCL, we don't have C++ in the kernels; it should be C99. Well, that doesn't mean we couldn't use structs, but we can't just take a struct from our .cpp/.h file and inject it into a kernel; we need to provide it as text. Thinking this through as I write, there's no particular reason why we can't provide structs to the OpenCL kernels.

Anyway... rightly or wrongly :-P what I've done for now is to change the operator structs into C++ classes, which derive from HasOperator1, HasOperator2, HasOperator3 and/or HasScalars. These are interfaces. HasOperator2 has a function called 'operator2()', which returns a string. The string will be injected into our OpenCL kernel templates. A sketch of the pattern follows below.

I think it works quite nicely, and it's easy to convert the structs into classes, and vice versa, though it is admittedly a slight deviation from the cutorch design.
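For illustration, here is a minimal sketch of that pattern. This is my own simplified rendering, not verbatim cltorch code, so treat the exact interface and signature as assumptions:

```
#include <string>

// Hedged sketch: an 'operator' carries its body as OpenCL source text,
// instead of as a __device__ operator() like the cutorch structs.
class HasOperator2 {
public:
  virtual ~HasOperator2() {}
  // returns a C99 snippet, not a computation; the string gets pasted
  // into the apply kernel template when the kernel is built at runtime
  virtual std::string operator2() const = 0;
};

class TensorAddOp : public HasOperator2 {
public:
  std::string operator2() const {
    return "*out += *in1";
  }
};
```

The apply machinery can then substitute that string into the .cl template, compile the result with the OpenCL driver, and cache the built kernel for subsequent calls.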
# cogapp

Oh yes, by the way, I'm using [cogapp](https://bitbucket.org/ned/cog) to help do some of the templating. It needs a Python environment. By default, it doesn't run, but if you want to modify any of the .cl files, you'll need to rerun stringify. To get this to work:
* make sure you have python available
* cd into the `build` directory, and do `ccmake ..`
* set option `DEV_RUN_COG` to `ON`
* and do `configure` then `generate`
* => from now on, cogapp will run automatically when you build, and reimport the .cl files into the corresponding .cpp files

--------------------------------------------------------------------------------
/doc/MigrateStatusByFile.md:
--------------------------------------------------------------------------------
# Migration status by file

Porting status by file, compared with the original cutorch files. Note that `.cpp` here could have been ported from `.c`, `.cpp`, or `.cu`.

| File | Migration status |
|---|---|
| THClTensorMathCompare.cpp | Done |
| THClTensorMathCompareT.cpp | Done |
| THClTensorMathPairwise.cpp | Done |
| THClTensor.h | Done |
| THClTensorCopy.h | Done |
| THClTensorMath.h | Done |
| THClTensor.cpp | 90% |
| THClTensorCopy.cpp | 50% |
| THClTensorMath.cpp | 50% |
| THClTensorIndex.cpp | 0% |
| THClTensorMath2.cpp | 20% |
| THClTensorMathBlas.cpp | 30% |
| THClBlas.cpp | 50% |
| THClReduce.* | 90% |
| THClReduceAll.* | 70% |
| THClGeneral.* | 30% |
| THClTensorMathTransformReduce.* | 0% |

--------------------------------------------------------------------------------
/doc/OlderChanges.md:
--------------------------------------------------------------------------------
# Older changes

This page contains older changes that have been moved from the [Recent Changes](https://github.com/hughperkins/cltorch#recent-changes) section on the main page.

For the most recent changes, please see [Recent Changes](https://github.com/hughperkins/cltorch#recent-changes)

* 23rd July:
  * Fixed memory leak on Intel HD Graphics
* 22nd July:
  * Performance improvement:
    * All per-element operations are around 2-5 times faster on NVIDIA and AMD now
    * Specifically, this means that times for Karpathy's [char-rnn](http://github.com/karpathy/char-rnn) are around 2-3 times faster on NVIDIA and AMD cards, compared to before
  * [colesbury](https://github.com/colesbury)'s pull request [#176](https://github.com/torch/cutorch/pull/176) ported to cltorch, 'Allow CudaTensors as indices'
  * [andresy](https://github.com/andresy)'s pull request [#203](https://github.com/torch/cutorch/pull/203) ported to cltorch, 'expose retain and free for CudaStorage/CudaTensor'
* 19th July:
  * Upgrade EasyCL version
  * Need to explicitly enable timing now (just in case it impacts performance)
  * DumpTimings now shows the count of the number of calls, as well as timings
* 18th July:
  * Added custom user kernels
* 16th July:
  * Did some cleaning:
    * source code now all in the `src` directory, to keep the front page on github clean
    * moved a bunch of stuff from this page to other pages, i.e. older changes, and the list of what works
  * 20x speed boost for the Apply kernel, and char-rnn, on an Intel HD5500 GPU
* 15th July:
  * can pass a point ClTensor now also to `:lt()`, `:gt()`, `:le()`, `:ge()`, `:eq()`, `:ne()`
  * added profiling:
    * `cltorch.setProfiling(1)` to enable (has a performance hit obviously, whilst enabled)
    * `cltorch.dumpProfiling()` to dump timings since the last dump
    * timings are cumulative over each kernel filename/kernelname combination
* 14th July:
  * created point tensors:
    * `:sum()` can return a point tensor, which stays on the GPU, eliminating a gpu pipeline stall; see the presentation above
    * `add()`, `csub()`, `mul` and `div` can all accept a point tensor in place of their scalar argument
    * `:prod()` can return a point tensor too now, as can `:max()`, `:min()`, `:all()`, and `:any()`
    * can pass a point ClTensor also to `:fill()` now
* 13th July:
  * possible to use tensors without calling `:setDevice()` to switch to their device first. Tested with `:sum()`, `:sum(1)`, and `:sum(2)` for now
* 12th July:
  * add `cltorch.about()`, to provide build information
* 10th July:
  * added cmin, cmax, for tensors and scalars (as per https://github.com/torch/cutorch/pull/198/files )
* 5th July:
  * fixed some Mac build/load issues, so it builds/loads on Mac now (thank you to mlajtos, szagouyko, centime, luo123n, and pdhvip for their enormous help with fixing this :-) )
  * getDeviceProperties and so on now only show GPU and APU devices, and ignore pure CPU devices (pure CPU devices are not supported by cltorch at this time)
  * added `cltorch.test()`, which runs unit tests
* 4th July:
  * `torch.save` and `torch.load` implemented
* 27th June:
  * fixed more bugs involving Tensor copy. Hopefully should be fixed permanently now :-P
  * added `cltorch.dumpTimings()`, which will dump cumulative timings for various parts of the engine. It's mostly for usage by maintainers / optimizers.
  * massive optimization for anything involving apply, reduce, reduceall, index etc => this makes the lstm script at [karpathy/char-rnn](https://github.com/karpathy/char-rnn) run significantly faster when using OpenCL now :-)
* 26th June:
  * add addcmul, and unit test
  * add addcdiv, and unit test
  * added `apply2` and `apply3` as synonyms for `map` and `map2`
  * can use `x`, `y`, `z` instead of `*out`, `*in1` and `*in2`, in `apply`, `map`, etc
  * fix a buffer copy bug (note: implies updating EasyCL, and rebuilding EasyCL, see notes on updating above)
* 25th June:
  * added bernoulli (generates on host-side for now, but I guess this is fast enough for many things?)
* 24th June:
  * added tests for `gather`, and removed some spam
  * added `scatter` (for both tensor and float source)
* 23rd June:
  * Fixed a bug where operations such as apply and map on tensors with non-zero offset didn't work correctly (ie, `fill` etc after `narrow` or similar)
  * Added `gather`
* 22nd June:
  * Under the hood:
    * Moved marking a buffer dirty, ie modified on the GPU, from [THClTensorMathBlas.cpp](https://github.com/hughperkins/cltorch/blob/9133fb4f0a23a86c48dcb5dc9cc7d44f44850a3f/lib/THCl/THClTensorMathBlas.cpp#L202) to [THClBlas.cpp](https://github.com/hughperkins/cltorch/blob/9133fb4f0a23a86c48dcb5dc9cc7d44f44850a3f/lib/THCl/THClBlas.cpp#L424)
    * This fixes a bug in [clnn](https://github.com/hughperkins/clnn), where the results of a convolutional layer were not being written back to the output tensor
  * tests pass now on an AMD gpu (actually I managed to scrounge access to a W9100 :-D )
* 21st June:
  * Under the hood:
    * Upgraded the new THClKernels class to handle `THClTensorInfo`
    * migrated Reduce, ReduceAll, etc to use THClKernels
    * upgraded EasyCL to handle `uint`, `long`, `ulong`
  * added `cltorch.finish()` and `cltorch.synchronize()`; both do the same thing, which is a `clFinish()`, on the current device
  * made it possible to require both cutorch and cltorch, as long as one requires cutorch followed by cltorch, in that order
* 20th June:
  * renamed the new `sub` method to `csub` so it doesn't collide with the existing `sub`
  * added `cltorch.setTrace(1|0)`, which prints out every allocate or copy of gpu buffers (named 'wrapper's)
  * removed the `set` and `get` methods, because they cause repeated gpu buffer copies (actually, get is not too bad, but does copy the whole buffer; set copies the whole buffer, repeatedly :-P )
  * modified `ClStorage.__string__` to first copy the whole storage to a FloatStorage, once, then convert this to a string, rather than using the now non-existent `get`
  * `torch.ClTensor{3,5,2}` will now first create this as a `FloatTensor` then call `copy` on this, to convert the whole Tensor/Storage to a `ClTensor` (avoids repeated `set` calls)
  * added `normall`, ie can do `torch.norm(c)`, `torch.norm(c, exponent)`
  * added `prod`, `prod(1)`, `prod(2)`
  * `max(1)` and `min(1)` now return the indices too, as well as the max. Ditto for dimension 2.
  * added `:all()` and `:any()`
  * added `:indexFill()`
  * added `:indexCopy()`
  * added `:indexSelect()`
  * added `torch.cumsum(x,2)` and `torch.cumsum(x,1)`
  * added `torch.cumprod(x,2)` and `torch.cumprod(x,1)`
  * Under the hood:
    * created the new THClKernels class:
      * handles THClTensor kernel input
      * provides a `run` method that takes a dim3 `grid` and `block` input, as for cutorch kernel launches
    * migrated TensorIndexed to use THClKernels
* 19th June:
  * fixed a compile bug in EasyCL, when lua5.2/5.3 header files are present (not tested yet)
  * added an `a:sub(b)` method, which does element-wise subtraction of b from a, and puts the results in a
  * migrated to a new version of EasyCL, with one fewer waitforevents, to try to boost perf a bit
  * added `apply`, `map`, `map2` :-) (which run on the GPU, at full speed)
  * added 2-pass reduceall, ie can do reduceall on much larger tensors now
* 18th June:
  * fixed a bug in clBLAS sger that meant that sger crashed on even tiny 5x5 matrices on nvidia, using either rowmajor or columnmajor :-) https://github.com/clMathLibraries/clBLAS/pull/109
    * note that you will need to `git submodule update`, and `rm -Rf build/clBLAS`, in order to pick up the new version of clBLAS
  * moved clBLAS initialization code out of inner loops => huge speed boost
  * added the `:neg()` operator, which negates the tensor (like `-` but without reallocation, I think)
* 15th-17th June:
  * pow(x,y) no longer returns undefined values for x containing, or being, negative
  * pow(x,y) now uses `pown` when y is an exact integer scalar (ie where (float)((int)y) == y)
  * when no OpenCL-enabled devices are enabled, we now raise a THError, with a clear error message, rather than throwing a C++ exception, with no error message output
  * under the hood: added cltorch.getState()
  * renamed libTHCL.so to libTHCl.so
  * added the THCl include files to the `install` section
  * masked fill works now
  * torch.addr works now
* 15th June:
  * C:t() working
* 14th June:
  * ReduceAll working :-) For now this means: sometensor:sum() works
  * sometensor:sum(1) and sometensor:sum(2) working too now :-)
  * A:min(), A:max() added
  * created unit tests, in the [test](test) directory, [cltorch-unit-tensor.lua](test/cltorch-unit-tensor.lua), which pass
* 13th June:
  * added `cltorch.setDevice`/`cltorch.getDevice`, see [test-device.lua](test/test-device.lua) for an example
  * added the EasyCL includes to the EasyCL install section, to remove build errors with "EasyCL.h" not found, etc

--------------------------------------------------------------------------------
/doc/img/gpupipelinemultiple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hughperkins/cltorch/3b47f53f36e9c752620672213203fec22d86bc20/doc/img/gpupipelinemultiple.png
--------------------------------------------------------------------------------
/doc/img/reduceall_pipelinestall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hughperkins/cltorch/3b47f53f36e9c752620672213203fec22d86bc20/doc/img/reduceall_pipelinestall.png
--------------------------------------------------------------------------------
/doc/img/singlegpuoperation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hughperkins/cltorch/3b47f53f36e9c752620672213203fec22d86bc20/doc/img/singlegpuoperation.png
--------------------------------------------------------------------------------
/doc/torch/cuda-to-opencl.md:
--------------------------------------------------------------------------------
# CUDA to OpenCL

Useful CUDA intro/info:
- http://www.nvidia.com/docs/IO/116711/sc11-cuda-c-basics.pdf
- http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels
- http://developer.amd.com/tools-and-sdks/opencl-zone/opencl-resources/programming-in-opencl/porting-cuda-applications-to-opencl/

notes:
- `__global__` marks a kernel, equivalent to OpenCL `kernel`
- `mykernel<<<num_workgroups, workgroup_size>>>(param1, param2, ...)`, with the triple angle brackets, is a *kernel launch* (equivalent to OpenCL `run(dims, num_workgroups * workgroup_size, workgroup_size)` (ish...))
- `num_workgroups` and `workgroup_size` can be integers, or `dim3`
- where there are 4 launch parameters, the fourth is the stream, i.e. `<<<num_workgroups, workgroup_size, 0, stream>>>`
- `__shared__` means local memory, i.e. `__local` in OpenCL
- `__syncthreads()` is like `barrier(CLK_LOCAL_MEM_FENCE)` in OpenCL
- `cudaDeviceSynchronize()` is like `clFinish()`
- `__device__` means a function that can be called from a kernel
- `__host__` means a function that can be called from the host, i.e. from the C/C++ main program
- possible to add both `__device__` and `__host__`, just to be really confusing :-P

## Indexing

| CUDA | OpenCL |
|---|---|
| gridDim | get_num_groups() |
| blockDim | get_local_size() |
| blockIdx | get_group_id() |
| threadIdx | get_local_id() |
| | get_global_id() |
| | get_global_size() |
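To make the table concrete, here is the same trivial elementwise add written both ways. This pair is my own illustration (not from the cltorch sources), using the mappings above:

```
// CUDA: one thread per element
__global__ void add(float *out, const float *a, const float *b, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = a[i] + b[i];
}
// launch: add<<<(n + 255) / 256, 256>>>(out, a, b, n);
```

```
// OpenCL equivalent: get_global_id(0) replaces the blockIdx/blockDim/threadIdx arithmetic
kernel void add(global float *out, global const float *a, global const float *b, int n) {
  int i = get_global_id(0);
  if (i < n) out[i] = a[i] + b[i];
}
// enqueued with a global size rounded up to a multiple of the workgroup size
```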
--------------------------------------------------------------------------------
/doc/torch/cutorch-apply.md:
--------------------------------------------------------------------------------
# cutorch-apply

## THCApply

```
typedef struct THCudaTensor
{
  long *size;
  long *stride;
  int nDimension;

  THCudaStorage *storage;
  long storageOffset;
  int refcount;

  char flag;

} THCudaTensor;
```

From THCReduceApplyUtils.h:
```
#define MAX_CUTORCH_DIMS 25
```

From THCReduceApplyUtils.h:
```
// CUDA kernel argument that defines tensor layout
template <typename IndexType>
struct TensorInfo {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  TensorInfo(THCState* state, THCudaTensor* t, int reduceDim = -1);

  // Contiguous tensors of more than one dimension are collapsed down
  // to one tensor
  // note: since both __host__ and __device__, this is available from both main
  // c++ code, and from kernels
  __host__ __device__ inline bool isContiguous() const {
    return (dims == 1 && strides[0] == 1);
  }

  float* data;
  IndexType sizes[MAX_CUTORCH_DIMS];
  IndexType strides[MAX_CUTORCH_DIMS];
  int dims;
};
```

```
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
template <typename IndexType, int Dims>
struct IndexToOffset {
  static __host__ __device__ IndexType get(
    IndexType linearId,
    const TensorInfo<IndexType>& info) {
    IndexType offset = 0;

    // Use static dims
    for (int i = Dims - 1; i >= 0; --i) {
      IndexType curDimIndex = linearId % info.sizes[i];
      IndexType curDimOffset = curDimIndex * info.strides[i];
      offset += curDimOffset;

      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};
```

```
// This is the kernel entry point, since it is marked with `__global__`
template <typename Op, typename IndexType, int ADims, int BDims, int CDims>
__global__ void
THCudaTensor_pointwiseApply3(TensorInfo<IndexType> a,
                             TensorInfo<IndexType> b,
                             TensorInfo<IndexType> c,
                             IndexType totalElements,
                             Op op)
```

```
// This is a normal C++ host-side method, not a kernel or anything
// It happens to launch the kernel though, ie launches
// THCudaTensor_pointwiseApply3, above
template <typename Op>
bool THCudaTensor_pointwiseApply3(THCState* state,
                                  THCudaTensor* a,
                                  THCudaTensor* b,
                                  THCudaTensor* c,
                                  const Op& op,
                                  TensorArgType aType = ReadWrite,
                                  TensorArgType bType = ReadOnly,
                                  TensorArgType cType = ReadOnly) {
  ...
  // triple angle brackets, so this is a kernel *launch*
  THCudaTensor_pointwiseApply3<Op, TYPE, A, B, C>
    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
      aInfo, bInfo, cInfo, (TYPE) totalElements, op);
  ...
}
```
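The `IndexToOffset` loop is just mixed-radix arithmetic: peel off one coordinate per dimension with `%` and `/`, and weight each coordinate by its stride. A hedged, host-only sketch (my illustration, not repo code) that traces it for a transposed 2x3 view:

```
#include <cstdio>

// Same logic as IndexToOffset::get, minus the CUDA qualifiers and templates.
unsigned indexToOffset(unsigned linearId, int dims,
                       const unsigned *sizes, const unsigned *strides) {
  unsigned offset = 0;
  for (int i = dims - 1; i >= 0; --i) {
    offset += (linearId % sizes[i]) * strides[i];  // coordinate i, weighted by its stride
    linearId /= sizes[i];
  }
  return offset;
}

int main() {
  // a 2x3 transposed view of a contiguous 3x2 buffer: sizes {2,3}, strides {1,2}
  unsigned sizes[] = {2, 3}, strides[] = {1, 2};
  for (unsigned id = 0; id < 6; ++id)
    printf("linearId %u -> offset %u\n", id, indexToOffset(id, 2, sizes, strides));
  // prints offsets 0,2,4,1,3,5: a row-major walk of the transposed view
  return 0;
}
```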
--------------------------------------------------------------------------------
/doc/torch/torch.md:
--------------------------------------------------------------------------------
# torch

## root /

* init.c
  * contains luaopen_libtorch
  * creates `torch` table
  * calls torch_(type)(Storage|Tensor)_init
* init.lua
  * requires libtorch
  * includes (Tensor|File|FFI|...).lua
  * defines torch.(type|class|include|...)
* Storage.c
  * torch_Storage_(NAME) => torch_(Type)Storage_(NAME)
  * torch_Storage => torch.(Type)Storage
  * includes generic/Storage.c and THGenerateAllTypes.h
* Tensor.c
  * includes generic/Tensor.c and THGenerateAllTypes.h
* Tensor.lua
  * A few Tensor utility methods, like print, expand, totable
  * Tensor.(typeAs|byte|char|short|int|long|float|double) methods

## /generic

* Storage.c
  * handles the lua types/interfaces, then calls THStorage methods
  * for char, byte, int, double, float, ...:
    * torch_(Type)Storage_new
      * calls TH(Type)Storage_new, with the appropriate size
      * calls TH(Type)Storage_set, with values
    * torch_(Type)Storage_free
      * calls TH(Type)Storage_free
    * torch_(Type)Storage_resize
      * calls TH(Type)Storage_resize
      * (does little else)
    * torch_(Type)Storage_copy
      * if statement, based on the source type, passed in as an argument
      * calls the appropriate THStorage_copy, based on the source type
    * torch_(Type)Storage_fill
      * calls TH(Type)Storage_fill
    * torch_(Type)Storage_newindx
      * calls TH(Type)Storage_set
    * torch_(Type)Storage_index
      * calls TH(Type)Storage_get
    * torch_(Type)Storage_factory
      * calls TH(Type)Storage_new
    * torch_(type)Storage_init
      * registers the methods above
* Tensor.c
  * for char, byte, int, double, float, ...:
    * torch_(Type)Tensor_new
      * creates a THLongStorage to read the size of each dimension
      * creates a new TH(Type)Storage, and resizes it
      * calls TH(Type)Storage_set on each item

## lib/TH

* TH.h
  * includes lib/TH/TH*.h
* THGeneral.c/h
  * THAlloc
  * THFree
  * THRealloc
  * THSetArgErrorHandler
  * (a few others)
* THStorage.h
  * THStorage => TH(Type)Storage
  * THStorage_(Name) => TH(Type)Storage_(Name)
  * includes lib/TH/generic/THStorage.h and THGenerateAllTypes.h
  * includes lib/TH/generic/THStorageCopy.h and THGenerateAllTypes.h
* THStorage.c
  * includes lib/TH/generic/THStorage.c and all types
  * includes lib/TH/generic/THStorageCopy.c and all types
* THTensor.c/h
  * generates all types for lib/TH/generic/THTensor*.h/c

## lib/TH/generic

* THStorage.c/h
  * TH(Type)Storage_(new,newWithSize,newWithAllocator,free,newWithData,
    resize, fill, set, get)
* THTensor.c/h
  * TH(Type)Tensor_(storage,storageOffset,size,stride,data,rawInit,
    new,newWithTensor,newWithStorage,newWithSize,newClone, resize, ...)

# cutorch

## root /

* init.lua
  * require libcutorch
  * include Tensor.lua, FFI.lua, test.lua
* init.c
  * defines and registers global functions, eg synchronize, getNumStreams, setDevice
  * calls cutorch_Cuda(Storage|Tensor|TensorMath|TensorOperator)_init(L)
  * initializes THCState, and stores it as _state
* Storage.c
  * calls generic/Storage.c for Real=Cuda
  * defines cutorch_CudaStorage_copy for each src type (Cuda|Byte|...)
  * defines cutorch_(Type)Storage_copy for all src types
  * registers the copy methods as the 'copy' method of torch.ByteStorage etc
  * seems like, since generic/Storage.c just calls the appropriate THCuda method, generic/Storage.c doesn't need much modification?
* Tensor.c
  * as for Storage.c: include generic/Tensor.c for every type, just overwrite the `copy` methods
* Tensor.lua
  * injects a `cuda()` method into each of the other Tensor types
  * adds 'double()', 'float()' etc methods to the torch.CudaTensor type
* FFI.lua
  * almost empty
  * contains the structs, ie:
    * THCState
    * THCudaStorage
    * THCudaTensor
  * adds `cdata` and `data` methods (?) to Storage and Tensor

## torch/generic

* Storage.c and Tensor.c from torch/generic, mostly unchanged, but modified in places... eg THCState instead of THState, and cutorch_getState instead of checkudata; and that's not the only difference :-(

## lib/THC

* THC.h
  * includes lib/THC/TH*.h
* THCGeneral.h/c
  * includes cuda.h etc
  * defines THAssert
  * defines THC_API, THC_EXTERNC
  * defines the THCState struct
  * implementation of global functions, like:
    * THCudaInit
    * THCudaBlas_init
    * THCState_getNumDevices
* THCStorage.c/h/cu
  * defines the THCudaStorage struct, containing allocator, refcount, ..
  * defines THCudaStorage_(new,set,get,free,fill,resize,data)
  * `fill` and `resize` are in the .cu, presumably because these need kernels (seems like the .cu is just more definitions of what is in the .h file though, some in the .c, some in the .cu)
  * other methods just use cudaMalloc, cudaFree, cudaMemcpy, etc
* THCTensor.c/h/cu
  * various methods like retain, free, set1d/2d/..., get1d/2d/...,
    squeeze, storage, new, data, lots of `new` methods, `resize` methods
  * meld shows there's basically no difference between the .c file and the original torch one, in lib/TH/generic/THTensor.c
  * the .cu has two functions:
    * THCudaTensor_getDevice
    * THCudaTensor_getTextureObject
* (No lib/THC/generic)
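The `torch_Storage_(NAME) => torch_(Type)Storage_(NAME)` expansion near the top is the trick that lets one generic file compile once per type. A minimal standalone sketch of the mechanism (my illustration; TH's real macros live in THGeneral.h and differ in detail):

```
#include <stdio.h>

/* two-step concat so that Real gets macro-expanded before token pasting */
#define TH_CONCAT_4_EXPAND(a, b, c, d) a##b##c##d
#define TH_CONCAT_4(a, b, c, d) TH_CONCAT_4_EXPAND(a, b, c, d)
#define torch_Storage_(NAME) TH_CONCAT_4(torch_, Real, Storage_, NAME)

/* simulate one inclusion of the "generic" file, with Real/real bound to Float/float */
#define Real Float
#define real float
real torch_Storage_(get)(const real *data, long i) { /* becomes torch_FloatStorage_get */
  return data[i];
}
#undef real
#undef Real

int main(void) {
  float data[] = {1.5f, 2.5f};
  printf("%f\n", torch_FloatStorage_get(data, 1)); /* prints 2.500000 */
  return 0;
}
```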
--------------------------------------------------------------------------------
/rocks/cltorch-scm-1.rockspec:
--------------------------------------------------------------------------------
package = "cltorch"
version = "scm-1"

source = {
   url = "git://github.com/hughperkins/cltorch.git",
}

description = {
   summary = "OpenCL backend for Torch",
   detailed = [[
   ]],
   homepage = "https://github.com/hughperkins/cltorch",
   license = "BSD"
}

dependencies = {
   "torch >= 7.0",
}

build = {
   type = "command",
   build_command = [[
cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install
   ]],
   platforms = {
      windows = {
         build_command = [[
cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.."
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install 29 | ]] 30 | } 31 | }, 32 | install_command = "cd build" 33 | } 34 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | ADD_SUBDIRECTORY(lib) 4 | 5 | -------------------------------------------------------------------------------- /src/FFI.lua: -------------------------------------------------------------------------------- 1 | local ok, ffi = pcall(require, 'ffi') 2 | if ok then 3 | 4 | local cdefs = [[ 5 | 6 | typedef struct THClStorage 7 | { 8 | int device; 9 | float *data; 10 | void *cl; 11 | void *wrapper; 12 | long size; 13 | int refcount; 14 | char flag; 15 | void *allocator; 16 | void *allocatorContext; 17 | struct THClStorage *view; 18 | } THClStorage; 19 | 20 | typedef struct THClTensor 21 | { 22 | long *size; 23 | long *stride; 24 | int nDimension; 25 | 26 | THClStorage *storage; 27 | long storageOffset; 28 | int refcount; 29 | 30 | char flag; 31 | 32 | int device; 33 | } THClTensor; 34 | 35 | ]] 36 | ffi.cdef(cdefs) 37 | 38 | local Storage = torch.getmetatable('torch.ClStorage') 39 | local Storage_tt = ffi.typeof('THClStorage**') 40 | 41 | rawset(Storage, "cdata", function(self) return Storage_tt(self)[0] end) 42 | rawset(Storage, "data", function(self) return Storage_tt(self)[0].data end) 43 | -- Tensor 44 | local Tensor = torch.getmetatable('torch.ClTensor') 45 | local Tensor_tt = ffi.typeof('THClTensor**') 46 | 47 | rawset(Tensor, "cdata", function(self) return Tensor_tt(self)[0] end) 48 | 49 | rawset(Tensor, "data", 50 | function(self) 51 | self = Tensor_tt(self)[0] 52 | return self.storage ~= nil and self.storage.data + self.storageOffset or nil 53 | end 54 | ) 55 | 56 | end 57 | -------------------------------------------------------------------------------- /src/Random.lua: -------------------------------------------------------------------------------- 1 | -- I reckon that generating on host side and copying to gpu 2 | -- will work just fine for many scenarios, eg dropout 3 | -- I cite as evidence the answer by Klerik at 4 | -- http://stackoverflow.com/questions/9912143/how-to-get-a-random-number-in-opencl 5 | 6 | function torch.ClTensor:bernoulli(p) 7 | if p ~= nil then 8 | self:copy(torch.Tensor(self:size()):bernoulli(p)) 9 | else 10 | self:copy(torch.Tensor(self:size()):bernoulli()) 11 | end 12 | return self 13 | end 14 | 15 | function torch.ClTensor:uniform(a, b) 16 | if a == nil then 17 | a = 0 18 | end 19 | if b == nil then 20 | b = 1 21 | end 22 | self:copy(torch.Tensor(self:size()):uniform(a, b)) 23 | return self 24 | end 25 | 26 | -------------------------------------------------------------------------------- /src/Storage.cpp: -------------------------------------------------------------------------------- 1 | #include "torch/utils.h" 2 | #include "THCl.h" 3 | //#include "THFile.h" 4 | #include "luaT.h" 5 | 6 | extern "C" { 7 | void cltorch_ClStorage_init(lua_State* L); 8 | } 9 | 10 | #define EXCEPT_TO_THERROR(method) \ 11 | try { \ 12 | method; \ 13 | } catch(exception &e) { \ 14 | THError("Something went wrong: %s", e.what()); \ 15 | } 16 | 17 | /* everything is as the generic Storage.c, except few things (see below) */ 18 | 19 | #define real float 20 | #define Real Cl 21 | #define TH_GENERIC_FILE "generic/Storage.c" 22 | 23 | #define torch_Storage_(NAME) TH_CONCAT_4(torch_,Real,Storage_,NAME) 24 | 25 | #define 
THFile_readRealRaw(file, data, size) \ 26 | { \ 27 | float *fdata = (float*)THAlloc(sizeof(float)*size); \ 28 | THError("Not implemented"); \ 29 | THFile_readFloatRaw(file, fdata, size); \ 30 | /* THClCheck(clMemcpy(data, fdata, size * sizeof(float), clMemcpyHostToDevice));*/ \ 31 | THFree(fdata); \ 32 | } 33 | 34 | #define THFile_writeRealRaw(file, data, size) \ 35 | { \ 36 | float *fdata = (float*)THAlloc(sizeof(float)*size); \ 37 | THError("Not implemented"); \ 38 | /* THClCheck(clMemcpy(fdata, data, size * sizeof(float), clMemcpyDeviceToHost));*/ \ 39 | THFile_writeFloatRaw(file, fdata, size); \ 40 | THFree(fdata); \ 41 | } 42 | 43 | #define torch_Storage TH_CONCAT_STRING_3(torch.,Real,Storage) 44 | 45 | #include "generic/Storage.cpp" 46 | 47 | #undef real 48 | #undef Real 49 | #undef TH_GENERIC_FILE 50 | 51 | /* now we overwrite some methods specific to ClStorage */ 52 | 53 | static int cltorch_ClStorage_copy(lua_State *L) 54 | { 55 | THClState *state = cltorch_getstate(L); 56 | THClStorage *storage = static_cast(luaT_checkudata(L, 1, "torch.ClStorage")); 57 | void *src; 58 | if( (src = luaT_toudata(L, 2, "torch.ClStorage")) ) { 59 | EXCEPT_TO_THERROR(THClStorage_copy(state, storage, static_cast(src))); 60 | } else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) ) { 61 | EXCEPT_TO_THERROR(THClStorage_copyByte(state, storage, static_cast(src))); 62 | } else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) ) { 63 | EXCEPT_TO_THERROR(THClStorage_copyChar(state, storage, static_cast(src))); 64 | } else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) ) { 65 | EXCEPT_TO_THERROR(THClStorage_copyShort(state, storage, static_cast(src))); 66 | } else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) ) { 67 | EXCEPT_TO_THERROR(THClStorage_copyInt(state, storage, static_cast(src))); 68 | } else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) ) { 69 | EXCEPT_TO_THERROR(THClStorage_copyLong(state, storage, static_cast(src))); 70 | } else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) ) { 71 | EXCEPT_TO_THERROR(THClStorage_copyFloat(state, storage, static_cast(src))); 72 | } else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) { 73 | EXCEPT_TO_THERROR(THClStorage_copyDouble(state, storage, static_cast(src))); 74 | } else if( (src = luaT_toudata(L, 2, "torch.ClStorage")) ) { 75 | EXCEPT_TO_THERROR(THClStorage_copyCl(state, storage, static_cast(src))); 76 | } else 77 | luaL_typerror(L, 2, "torch.*Storage"); 78 | 79 | lua_settop(L, 1); 80 | return 1; 81 | } 82 | 83 | #define CL_IMPLEMENT_STORAGE_COPY(TYPEC) \ 84 | static int cltorch_##TYPEC##Storage_copy(lua_State *L) \ 85 | { \ 86 | TH##TYPEC##Storage *storage = static_cast(luaT_checkudata(L, 1, "torch." #TYPEC "Storage")); \ 87 | void *src; \ 88 | if( (src = luaT_toudata(L, 2, "torch." 
#TYPEC "Storage")) ) \ 89 | TH##TYPEC##Storage_copy(storage, static_cast(src)); \ 90 | else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) ) \ 91 | TH##TYPEC##Storage_copyByte(storage, static_cast(src)); \ 92 | else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) ) \ 93 | TH##TYPEC##Storage_copyChar(storage, static_cast(src)); \ 94 | else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) ) \ 95 | TH##TYPEC##Storage_copyShort(storage, static_cast(src)); \ 96 | else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) ) \ 97 | TH##TYPEC##Storage_copyInt(storage, static_cast(src)); \ 98 | else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) ) \ 99 | TH##TYPEC##Storage_copyLong(storage, static_cast(src)); \ 100 | else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) ) \ 101 | TH##TYPEC##Storage_copyFloat(storage, static_cast(src)); \ 102 | else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) \ 103 | TH##TYPEC##Storage_copyDouble(storage, static_cast(src)); \ 104 | else if( (src = luaT_toudata(L, 2, "torch.ClStorage")) ) \ 105 | TH##TYPEC##Storage_copyCl(cltorch_getstate(L), storage, static_cast(src)); \ 106 | else \ 107 | luaL_typerror(L, 2, "torch.*Storage"); \ 108 | \ 109 | lua_settop(L, 1); \ 110 | return 1; \ 111 | } 112 | 113 | CL_IMPLEMENT_STORAGE_COPY(Byte) 114 | CL_IMPLEMENT_STORAGE_COPY(Char) 115 | CL_IMPLEMENT_STORAGE_COPY(Short) 116 | CL_IMPLEMENT_STORAGE_COPY(Int) 117 | CL_IMPLEMENT_STORAGE_COPY(Long) 118 | CL_IMPLEMENT_STORAGE_COPY(Float) 119 | CL_IMPLEMENT_STORAGE_COPY(Double) 120 | 121 | void cltorch_ClStorage_init(lua_State* L) 122 | { 123 | /* the standard stuff */ 124 | torch_ClStorage_init(L); 125 | 126 | /* the copy methods */ 127 | { 128 | int i; 129 | 130 | const char* tnames[8] = {"torch.ByteStorage", 131 | "torch.CharStorage", 132 | "torch.ShortStorage", 133 | "torch.IntStorage", 134 | "torch.LongStorage", 135 | "torch.FloatStorage", 136 | "torch.DoubleStorage", 137 | "torch.ClStorage"}; 138 | 139 | static int (*funcs[8])(lua_State*) = {cltorch_ByteStorage_copy, 140 | cltorch_CharStorage_copy, 141 | cltorch_ShortStorage_copy, 142 | cltorch_IntStorage_copy, 143 | cltorch_LongStorage_copy, 144 | cltorch_FloatStorage_copy, 145 | cltorch_DoubleStorage_copy, 146 | cltorch_ClStorage_copy}; 147 | 148 | for(i = 0; i < 8; i++) 149 | { 150 | luaT_pushmetatable(L, tnames[i]); 151 | lua_pushcfunction(L, funcs[i]); 152 | lua_setfield(L, -2, "copy"); 153 | lua_pop(L, 1); 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/Tensor.lua: -------------------------------------------------------------------------------- 1 | -- this is misleading, since it copies onto cpu, and does it on cpu 2 | function torch.ClTensor.apply(self, func) 3 | local x = torch.FloatTensor(self:size()):copy(self) 4 | x:apply(func) 5 | self:copy(x) 6 | return self 7 | end 8 | 9 | local function Tensor__type(self,type) 10 | local current = torch.typename(self) 11 | if not type then return current end 12 | if type ~= current then 13 | local new = torch.getmetatable(type).new() 14 | if self:nElement() > 0 then 15 | new:resize(self:size()):copy(self) 16 | end 17 | return new 18 | else 19 | return self 20 | end 21 | end 22 | local function Tensor__typeAs(self,tensor) 23 | return self:type(tensor:type()) 24 | end 25 | local function Tensor__cl(self) 26 | return self:type('torch.ClTensor') 27 | end 28 | local function Tensor__double(self) 29 | return self:type('torch.DoubleTensor') 30 | end 31 | local function Tensor__float(self) 32 
| return self:type('torch.FloatTensor') 33 | end 34 | 35 | local function Tensor__byte(self) 36 | return self:type('torch.ByteTensor') 37 | end 38 | 39 | local function Tensor__char(self) 40 | return self:type('torch.CharTensor') 41 | end 42 | 43 | local function Tensor__int(self) 44 | return self:type('torch.IntTensor') 45 | end 46 | 47 | local function Tensor__short(self) 48 | return self:type('torch.ShortTensor') 49 | end 50 | 51 | local function Tensor__long(self) 52 | return self:type('torch.LongTensor') 53 | end 54 | 55 | rawset(torch.getmetatable('torch.DoubleTensor'), 'cl', Tensor__cl) 56 | rawset(torch.getmetatable('torch.FloatTensor'), 'cl', Tensor__cl) 57 | rawset(torch.getmetatable('torch.ByteTensor'), 'cl', Tensor__cl) 58 | rawset(torch.getmetatable('torch.CharTensor'), 'cl', Tensor__cl) 59 | rawset(torch.getmetatable('torch.IntTensor'), 'cl', Tensor__cl) 60 | rawset(torch.getmetatable('torch.ShortTensor'), 'cl', Tensor__cl) 61 | rawset(torch.getmetatable('torch.LongTensor'), 'cl', Tensor__cl) 62 | rawset(torch.getmetatable('torch.ClTensor'), 'cl', Tensor__cl) 63 | 64 | rawset(torch.getmetatable('torch.ClTensor'), 'type', Tensor__type) 65 | rawset(torch.getmetatable('torch.ClTensor'), 'typeAs', Tensor__typeAs) 66 | rawset(torch.getmetatable('torch.ClTensor'), 'double', Tensor__double) 67 | rawset(torch.getmetatable('torch.ClTensor'), 'float', Tensor__float) 68 | rawset(torch.getmetatable('torch.ClTensor'), 'byte', Tensor__byte) 69 | rawset(torch.getmetatable('torch.ClTensor'), 'char', Tensor__char) 70 | rawset(torch.getmetatable('torch.ClTensor'), 'int', Tensor__int) 71 | rawset(torch.getmetatable('torch.ClTensor'), 'short', Tensor__short) 72 | rawset(torch.getmetatable('torch.ClTensor'), 'long', Tensor__long) 73 | 74 | do 75 | local metatable = torch.getmetatable('torch.ClTensor') 76 | -- hmmm, maybe these are running on cpu? 
:-P 77 | for _,func in pairs{'expand', 'expandAs', 'view', 'viewAs', 'repeatTensor', 78 | 'permute', 'split', 'chunk'} do 79 | rawset(metatable, func, torch[func]) 80 | end 81 | end 82 | 83 | -------------------------------------------------------------------------------- /src/TensorOperator.c: -------------------------------------------------------------------------------- 1 | #include "torch/utils.h" 2 | #include "luaT.h" 3 | #include "THCl.h" 4 | 5 | static int cltorch_ClTensorOperator___add__(lua_State *L) 6 | { 7 | THClTensor *tensor1 = luaT_toudata(L, 1, "torch.ClTensor"); 8 | THClTensor *tensor2 = luaT_toudata(L, 2, "torch.ClTensor"); 9 | THClTensor *r; 10 | THClState *state = cltorch_getstate(L); 11 | THAssert(THClTensor_checkGPU(state, 2, tensor1, tensor2)); 12 | 13 | if(!tensor1 && !tensor2) 14 | luaL_error(L, "expecting two Tensors or one Tensor and one number"); 15 | else 16 | { 17 | int device = -1; 18 | if(tensor1) { 19 | device = tensor1->storage->device; 20 | } else { 21 | device = tensor2->storage->device; 22 | } 23 | r = THClTensor_newv2(state, device); 24 | luaT_pushudata(L, r, "torch.ClTensor"); 25 | 26 | if(!tensor1 && tensor2) 27 | { 28 | THClTensor_resizeAs(state, r, tensor2); 29 | THClTensor_copy(state, r, tensor2); 30 | THClTensor_add(state, r, r, luaL_checknumber(L, 1)); 31 | } 32 | else if(tensor1 && !tensor2) 33 | { 34 | THClTensor_resizeAs(state, r, tensor1); 35 | THClTensor_copy(state, r, tensor1); 36 | THClTensor_add(state, r, r, luaL_checknumber(L, 2)); 37 | } 38 | else 39 | { 40 | THClTensor_resizeAs(state, r, tensor1); 41 | THClTensor_copy(state, r, tensor1); 42 | THClTensor_cadd(state, r, r, 1, tensor2); 43 | } 44 | } 45 | return 1; 46 | } 47 | 48 | static int cltorch_ClTensorOperator___sub__(lua_State *L) 49 | { 50 | THClTensor *tensor1 = luaT_toudata(L, 1, "torch.ClTensor"); 51 | THClTensor *tensor2 = luaT_toudata(L, 2, "torch.ClTensor"); 52 | THClTensor *r; 53 | THClState *state = cltorch_getstate(L); 54 | THAssert(THClTensor_checkGPU(state, 2, tensor1, tensor2)); 55 | 56 | if(!tensor1 && !tensor2) 57 | luaL_error(L, "expecting two Tensors or one Tensor and one number"); 58 | else 59 | { 60 | int device = -1; 61 | if(tensor1) { 62 | device = tensor1->storage->device; 63 | } else { 64 | device = tensor2->storage->device; 65 | } 66 | r = THClTensor_newv2(state, device); 67 | luaT_pushudata(L, r, "torch.ClTensor"); 68 | 69 | if(!tensor1 && tensor2) 70 | { 71 | THClTensor_resizeAs(state, r, tensor2); 72 | THClTensor_fill(state, r, luaL_checknumber(L, 1)); 73 | THClTensor_cadd(state, r, r, -1, tensor2); 74 | } 75 | else if(tensor1 && !tensor2) 76 | { 77 | THClTensor_resizeAs(state, r, tensor1); 78 | THClTensor_copy(state, r, tensor1); 79 | THClTensor_add(state, r, r, -luaL_checknumber(L, 2)); 80 | } 81 | else 82 | { 83 | THClTensor_resizeAs(state, r, tensor1); 84 | THClTensor_copy(state, r, tensor1); 85 | THClTensor_cadd(state, r, r, -1, tensor2); 86 | } 87 | } 88 | return 1; 89 | } 90 | 91 | static int cltorch_ClTensorOperator___unm__(lua_State *L) 92 | { 93 | THClTensor *tensor = luaT_checkudata(L, 1, "torch.ClTensor"); 94 | THClTensor *r; 95 | THClState *state = cltorch_getstate(L); 96 | THAssert(THClTensor_checkGPU(state, 1, tensor)); 97 | 98 | r = THClTensor_newv2(state, tensor->storage->device); 99 | luaT_pushudata(L, r, "torch.ClTensor"); 100 | THClTensor_resizeAs(state, r, tensor); 101 | THClTensor_copy(state, r, tensor); 102 | THClTensor_mul(state, r, r, -1); 103 | 104 | return 1; 105 | } 106 | 107 | static int 
cltorch_ClTensorOperator___mul__(lua_State *L)
108 | {
109 | THClTensor *tensor1 = luaT_toudata(L, 1, "torch.ClTensor");
110 | THClTensor *tensor2 = luaT_toudata(L, 2, "torch.ClTensor");
111 | THClTensor *r;
112 | THClState *state = cltorch_getstate(L);
113 | THAssert(THClTensor_checkGPU(state, 2, tensor1, tensor2));
114 |
115 | if(!tensor1 && !tensor2)
116 | luaL_error(L, "expecting two Tensors or one Tensor and one number");
117 | else
118 | {
119 | int device = -1;
120 | if(tensor1) {
121 | device = tensor1->storage->device;
122 | } else {
123 | device = tensor2->storage->device;
124 | }
125 | r = THClTensor_newv2(state, device);
126 | luaT_pushudata(L, r, "torch.ClTensor");
127 |
128 | if(!tensor1 && tensor2)
129 | {
130 | THClTensor_resizeAs(state, r, tensor2);
131 | THClTensor_copy(state, r, tensor2);
132 | THClTensor_mul(state, r, r, luaL_checknumber(L, 1));
133 | }
134 | else if(tensor1 && !tensor2)
135 | {
136 | THClTensor_resizeAs(state, r, tensor1);
137 | THClTensor_copy(state, r, tensor1);
138 | THClTensor_mul(state, r, r, luaL_checknumber(L, 2));
139 | }
140 | else
141 | {
142 | int dimt = tensor1->nDimension;
143 | int dims = tensor2->nDimension;
144 |
145 | if(dimt == 1 && dims == 1)
146 | lua_pushnumber(L, THClTensor_dot(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */
147 | else if(dimt == 2 && dims == 1)
148 | {
149 | THClTensor_resize1d(state, r, tensor1->size[0]);
150 | THClTensor_zero(state, r);
151 | THClTensor_addmv(state, r, 1, r, 1, tensor1, tensor2);
152 | }
153 | else if(dimt == 2 && dims == 2)
154 | {
155 | THClTensor_resize2d(state, r, tensor1->size[0], tensor2->size[1]);
156 | THClTensor_zero(state, r);
157 | THClTensor_addmm(state, r, 1, r, 1, tensor1, tensor2);
158 | }
159 | else
160 | luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
161 | }
162 | }
163 | return 1;
164 | }
165 |
166 | static int cltorch_ClTensorOperator___div__(lua_State *L)
167 | {
168 | THClTensor *tensor = luaT_checkudata(L, 1, "torch.ClTensor");
169 | THClTensor *r;
170 | THClState *state = cltorch_getstate(L);
171 | THAssert(THClTensor_checkGPU(state, 1, tensor));
172 |
173 | luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected");
174 |
175 | r = THClTensor_newv2(state, tensor->storage->device);
176 | luaT_pushudata(L, r, "torch.ClTensor");
177 |
178 | THClTensor_resizeAs(state, r, tensor);
179 | THClTensor_copy(state, r, tensor);
180 | THClTensor_mul(state, r, r, 1/lua_tonumber(L, 2));
181 |
182 | return 1;
183 | }
184 |
185 | static const struct luaL_Reg cltorch_ClTensorOperator__ [] = {
186 | {"__add__", cltorch_ClTensorOperator___add__},
187 | {"__sub__", cltorch_ClTensorOperator___sub__},
188 | {"__unm__", cltorch_ClTensorOperator___unm__},
189 | {"__mul__", cltorch_ClTensorOperator___mul__},
190 | {"__div__", cltorch_ClTensorOperator___div__},
191 | {NULL, NULL}
192 | };
193 |
194 | void cltorch_ClTensorOperator_init(lua_State *L)
195 | {
196 | luaT_pushmetatable(L, "torch.ClTensor");
197 | luaL_setfuncs(L, cltorch_ClTensorOperator__, 0);
198 | lua_pop(L, 1);
199 | }
200 |
-------------------------------------------------------------------------------- /src/Test.lua: --------------------------------------------------------------------------------
1 | function cltorch.test()
2 | print('running tests...')
3 | -- luaunit = require('luaunit')
4 |
5 | require('cltorch.unit_storage')
6 | print('after requiring cltorch.unit_storage')
7 | -- test_basic()
8 | local res = cltorch.tests.storage.test()
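-- The metamethods registered in TensorOperator.c above give ClTensor the
-- usual arithmetic sugar. A minimal illustration (a sketch only; it assumes
-- at least one OpenCL device, and the values are made up):
--
--   local a = torch.Tensor{{1, 2}, {3, 4}}:cl()
--   local b = torch.Tensor{{5, 6}, {7, 8}}:cl()
--   print(a + b)   -- elementwise, via __add__
--   print(a - 3)   -- tensor and number, via __sub__
--   print(-a)      -- via __unm__
--   print(a * b)   -- 2D x 2D dispatches to THClTensor_addmm (matrix product)
--   print(a / 2)   -- via __div__, implemented as multiplication by 1/2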
9 | print('res', res)
10 | assert(res == true)
11 |
12 | require('cltorch.unit_tensor')
13 | print('after requiring cltorch.unit_tensor')
14 | -- test_basic()
15 | res = cltorch.tests.tensor.test()
16 | assert(res == true)
17 |
18 | print('all tests finished')
19 | end
20 |
21 |
-------------------------------------------------------------------------------- /src/UserKernel.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | extern "C" {
4 | #include "lua.h"
5 | void cltorch_UserKernel_init(lua_State *L);
6 | }
7 |
8 |
-------------------------------------------------------------------------------- /src/cmake/build_EasyCL.cmake: --------------------------------------------------------------------------------
1 | INCLUDE(ExternalProject)
2 |
3 | message("CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}")
4 | ExternalProject_Add(
5 | EasyCL-external
6 | STAMP_DIR ${CMAKE_BINARY_DIR}/EasyCL/stamp
7 | SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL
8 | PREFIX ${CMAKE_BINARY_DIR}/EasyCL
9 | INSTALL_DIR ${CMAKE_INSTALL_PREFIX}
10 | CMAKE_CACHE_ARGS
11 | -DBUILD_TESTS:BOOL=OFF
12 | -DPROVIDE_LUA_ENGINE:BOOL=OFF
13 | -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX}
14 | -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo
15 | )
16 |
17 | ADD_LIBRARY(EasyCL SHARED IMPORTED)
18 | ADD_DEPENDENCIES(EasyCL EasyCL-external)
19 | #SET(EASYCL_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/include/deepcl ${CMAKE_INSTALL_PREFIX}/include/easycl )
20 | SET(EasyCL_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include )
21 | #SET(EasyCL_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clew${CMAKE_SHARED_LIBRARY_SUFFIX})
22 | #SET(EasyCL_FOUND TRUE)
23 |
24 | #set_property(TARGET EasyCL
25 | # PROPERTY INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
26 | #)
27 |
28 | set_target_properties(EasyCL PROPERTIES
29 | # INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
30 | IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
31 | )
32 |
33 | #set_property(TARGET EasyCL
34 | # PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
35 | # IMPORTED_LINK_INTERFACE_LIBRARIES_DEBUG "EasyCL;clBLAS;/usr/lib/x86_64-linux-gnu/libjpeg.so"
36 | # IMPORTED_LOCATION_DEBUG "/home/ubuntu/git/DeepCL/build/libDeepCL.so"
37 | # IMPORTED_SONAME_DEBUG "libDeepCL.so"
38 | #)
39 |
40 | #set_target_properties(EasyCL PROPERTIES
41 | # IMPORTED_LINK_INTERFACE_LIBRARIES_DEBUG "EasyCL;clBLAS;/usr/lib/x86_64-linux-gnu/libjpeg.so"
42 | # IMPORTED_LOCATION_DEBUG "/home/ubuntu/git/DeepCL/build/libDeepCL.so"
43 | # IMPORTED_SONAME_DEBUG "libDeepCL.so"
44 | #)
45 |
46 | add_custom_target(easycl_delete_stamp ALL
47 | COMMAND ${CMAKE_COMMAND} -E remove_directory "${CMAKE_BINARY_DIR}/EasyCL/stamp"
48 | )
49 | add_dependencies(EasyCL-external easycl_delete_stamp)
50 |
51 |
-------------------------------------------------------------------------------- /src/cmake/build_clBLAS.cmake: --------------------------------------------------------------------------------
1 | INCLUDE(ExternalProject)
2 |
3 | message("CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}")
4 | ExternalProject_Add(
5 | clBLAS-external
6 |
#GIT_REPOSITORY git@github.com:clMathLibraries/clBLAS.git 7 | #GIT_TAG master 8 | #DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/clMathLibraries/clBLAS 9 | #GIT_SUBMODULES clBLAS 10 | STAMP_DIR ${CMAKE_BINARY_DIR}/clBLAS/stamp 11 | SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src 12 | PREFIX ${CMAKE_BINARY_DIR}/clBLAS 13 | INSTALL_DIR ${CMAKE_INSTALL_PREFIX} 14 | #CONFIGURE_COMMAND ${CMAKE_COMMAND} -Wno-dev "-G${CMAKE_GENERATOR}" 15 | #-DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER} 16 | #"-DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} -w -fPIC" 17 | #-DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER} 18 | #"-DCMAKE_C_FLAGS:STRING=${CMAKE_C_FLAGS} -w -fPIC" 19 | #-DCMAKE_BUILD_TYPE:STRING=Release 20 | CMAKE_CACHE_ARGS 21 | -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} 22 | -DOPENCL_INCLUDE_DIRS:STRING=${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include;${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include/proxy-opencl 23 | -DOPENCL_LIBRARIES:STRING=${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clew${CMAKE_SHARED_LIBRARY_SUFFIX} 24 | -DBUILD_SHARED_LIBS:BOOL=ON 25 | -DBUILD_CLIENT:BOOL=OFF 26 | -DBUILD_TEST:BOOL=OFF 27 | -DBUILD_KTEST:BOOL=OFF 28 | # -DADD_RPATH:BOOL=ON 29 | -DCMAKE_MACOSX_RPATH:BOOL=ON 30 | -DSUFFIX_LIB:STRING= 31 | -DCORR_TEST_WITH_ACML:BOOL=OFF 32 | -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo 33 | ) 34 | 35 | #ExternalProject_Get_Property(clBLAS-external install_dir) 36 | ADD_LIBRARY(clBLAS SHARED IMPORTED) 37 | #SET_TARGET_PROPERTIES(clBLAS PROPERTIES IMPORTED_LOCATION ${clBLAS_location}) 38 | ADD_DEPENDENCIES(clBLAS clBLAS-external) 39 | #SET(CLBLAS_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/include) 40 | SET(clBLAS_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include/proxy-opencl) 41 | #SET(CLBLAS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX}) 42 | #SET(CLBLAS_FOUND ON) 43 | 44 | set_target_properties(clBLAS PROPERTIES 45 | # INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX} 46 | MACOSX_RPATH TRUE 47 | INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib 48 | INSTALL_RPATH_USE_LINK_PATH TRUE 49 | IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX} 50 | ) 51 | 52 | 53 | add_custom_target(clblas_delete_stamp clBLAS-external 54 | ${CMAKE_COMMAND} -E remove_directory "${CMAKE_BINARY_DIR}/clBLAS/stamp" 55 | ) 56 | 57 | -------------------------------------------------------------------------------- /src/init.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "EasyCL.h" 4 | using namespace std; 5 | 6 | #include "util/StatefulTimer.h" 7 | 8 | #include "cltorch_commit_generated.h" 9 | 10 | //#include "THClTensorRandom.h" 11 | 12 | extern "C" { 13 | #include "lua.h" 14 | #include "utils.h" 15 | #include "luaT.h" 16 | int luaopen_libcltorch( lua_State *L ); 17 | extern void cltorch_ClStorage_init(lua_State* L); 18 | extern void cltorch_ClTensor_init(lua_State* L); 19 | extern void cltorch_ClTensorMath_init(lua_State* L); 20 | extern void cltorch_ClTensorOperator_init(lua_State* L); 21 | extern void cltorch_UserKernel_init(lua_State*L); 22 | } 23 | 24 | #include "THClGeneral.h" 25 | #include "THClStorage.h" 26 | 27 | namespace cltorch { 28 | void setProperty(lua_State *L, string name, int value) 29 | { 30 | 
lua_pushnumber(L, value); 31 | lua_setfield(L, -2, name.c_str()); 32 | } 33 | void setProperty(lua_State *L, string name, string value) 34 | { 35 | lua_pushstring(L, value.c_str()); 36 | lua_setfield(L, -2, name.c_str()); 37 | } 38 | static int cltorch_setAllowNonGpus(lua_State *L) 39 | { 40 | THClState *state = cltorch_getstate(L); 41 | int allowNonGpus = luaL_checknumber(L, 1); 42 | THClSetAllowNonGpus(state, allowNonGpus); 43 | return 0; 44 | } 45 | static int cltorch_getDeviceCount(lua_State *L) 46 | { 47 | THClState *state = cltorch_getstate(L); 48 | int count = THClState_getNumDevices(state); 49 | lua_pushnumber(L, count); 50 | return 1; 51 | } 52 | static int cltorch_getDevice(lua_State *L) 53 | { 54 | THClState *state = cltorch_getstate(L); 55 | int device = state->currentDevice; 56 | lua_pushnumber(L, device+1); 57 | return 1; 58 | } 59 | static int cltorch_setDevice(lua_State *L) 60 | { 61 | THClState *state = cltorch_getstate(L); 62 | if(state->initialized == 0) { 63 | THCl_initializeState(state); 64 | } 65 | int device = luaL_checknumber(L, 1) - 1; 66 | if(device < 0 || device >= state->allocatedDevices) { 67 | THError("Device doesnt exist"); 68 | } 69 | state->currentDevice = device; 70 | return 0; 71 | } 72 | static int cltorch_synchronize(lua_State *L) 73 | { 74 | THClState *state = cltorch_getstate(L); 75 | EasyCL *cl = THClState_getClv2(state, state->currentDevice); 76 | cl->finish(); 77 | return 0; 78 | } 79 | static int cltorch_getDeviceProperties(lua_State *L) 80 | { 81 | THClState *state = cltorch_getstate(L); 82 | int device = (int)luaL_checknumber(L, 1)-1; 83 | int count = THClState_getNumDevices(state); 84 | if(device < 0 || device >= count) { 85 | THError("Device doesnt exist"); 86 | } 87 | 88 | easycl::DeviceInfo deviceInfo; 89 | if(state->allowNonGpus) { 90 | deviceInfo = easycl::DevicesInfo::getDeviceInfo( device ); 91 | } else { 92 | deviceInfo = easycl::DevicesInfo::getGpuInfo( device ); 93 | } 94 | lua_newtable(L); 95 | 96 | setProperty(L, "maxWorkGroupSize", deviceInfo.maxWorkGroupSize); 97 | setProperty(L, "platformVendor", deviceInfo.platformVendor); 98 | string deviceTypeString = ""; 99 | if( deviceInfo.deviceType == 4 ) { 100 | deviceTypeString = "GPU"; 101 | } 102 | if( deviceInfo.deviceType == 2 ) { 103 | deviceTypeString = "CPU"; 104 | } 105 | if( deviceInfo.deviceType == 8 ) { 106 | deviceTypeString = "Accelerator"; 107 | } 108 | setProperty(L, "deviceType", deviceTypeString); 109 | setProperty(L, "globalMemSizeMB", deviceInfo.globalMemSize / 1024 / 1024); 110 | setProperty(L, "localMemSizeKB", deviceInfo.localMemSize / 1024); 111 | setProperty(L, "globalMemCachelineSizeKB", deviceInfo.globalMemCachelineSize / 1024 ); 112 | setProperty(L, "maxMemAllocSizeMB", deviceInfo.maxMemAllocSize / 1024 / 1024); 113 | setProperty(L, "maxComputeUnits", deviceInfo.maxComputeUnits); 114 | setProperty(L, "maxWorkGroupSize", deviceInfo.maxWorkGroupSize); 115 | setProperty(L, "deviceName", deviceInfo.deviceName); 116 | setProperty(L, "openClCVersion", deviceInfo.openClCVersion); 117 | setProperty(L, "deviceVersion", deviceInfo.deviceVersion); 118 | setProperty(L, "maxClockFrequency", deviceInfo.maxClockFrequency); 119 | 120 | return 1; 121 | } 122 | 123 | static int cltorch_getState(lua_State *L) 124 | { 125 | lua_getglobal(L, "cltorch"); 126 | lua_getfield(L, -1, "_state"); 127 | lua_remove(L, -2); 128 | return 1; 129 | } 130 | static int cltorch_dumpTimings(lua_State *L) 131 | { 132 | StatefulTimer::timeCheck("before dump"); 133 | StatefulTimer::dump( true ); 134 | 
StatefulTimer::timeCheck("after dump"); 135 | return 0; 136 | } 137 | // note: this is global, not per-device 138 | static int cltorch_setEnableTiming(lua_State *L) 139 | { 140 | int trace = luaL_checknumber(L, 1); 141 | StatefulTimer::setEnabled(trace); 142 | if(trace) { 143 | cout << "Timing activated" << endl; 144 | } else { 145 | cout << "Timing disabled" << endl; 146 | } 147 | return 0; 148 | } 149 | static int cltorch_dumpProfiling(lua_State *L) 150 | { 151 | THClState *state = cltorch_getstate(L); 152 | EasyCL *cl = THClState_getClv2(state, state->currentDevice); 153 | cl->dumpProfiling(); 154 | return 0; 155 | } 156 | // if you turn this to 1, you will see all copies of data between 157 | // host and gpu 158 | // useful for checking we're not doing this too often... 159 | static int cltorch_setTrace(lua_State *L) 160 | { 161 | THClState *state = cltorch_getstate(L); 162 | int trace = luaL_checknumber(L, 1); 163 | state->trace = trace; 164 | return 0; 165 | } 166 | static int cltorch_setProfiling(lua_State *L) 167 | { 168 | THClState *state = cltorch_getstate(L); 169 | int trace = luaL_checknumber(L, 1); 170 | EasyCL *cl = THClState_getClv2(state, state->currentDevice); 171 | cl->setProfiling(trace); 172 | if(trace) { 173 | cout << "Profiling activated" << endl; 174 | } else { 175 | cout << "Profiling disabled" << endl; 176 | } 177 | return 0; 178 | } 179 | static int cltorch_setAddFinish(lua_State *L) 180 | { 181 | THClState *state = cltorch_getstate(L); 182 | int addFinish = luaL_checknumber(L, 1); 183 | state->addFinish = addFinish; 184 | if(addFinish) { 185 | cout << "AddFinish activated" << endl; 186 | } else { 187 | cout << "AddFinish disabled" << endl; 188 | } 189 | return 0; 190 | } 191 | static int cltorch_setDetailedTimings(lua_State *L) 192 | { 193 | THClState *state = cltorch_getstate(L); 194 | int detailedTimings = luaL_checknumber(L, 1); 195 | state->detailedTimings = detailedTimings; 196 | return 0; 197 | } 198 | static int cltorch_about(lua_State *L) 199 | { 200 | cout << "cltorch. 
OpenCL backend for Torch" << endl; 201 | cout << "Built from commit " << cltorch_commit << endl; 202 | cout << "More info, doc: https://github.com/hughperkins/cltorch" << endl; 203 | cout << "Issues: https://github.com/hughperkins/cltorch/issues" << endl; 204 | return 0; 205 | } 206 | 207 | static const struct luaL_Reg cltorch_stuff__ [] = { 208 | {"setAllowNonGpus", cltorch_setAllowNonGpus}, 209 | {"getDevice", cltorch_getDevice}, 210 | {"setDevice", cltorch_setDevice}, 211 | {"synchronize", cltorch_synchronize}, 212 | {"finish", cltorch_synchronize}, 213 | {"getDeviceCount", cltorch_getDeviceCount}, 214 | {"getDeviceProperties", cltorch_getDeviceProperties}, 215 | {"getState", cltorch_getState}, 216 | {"setTrace", cltorch_setTrace}, 217 | {"setAddFinish", cltorch_setAddFinish}, 218 | {"dumpTimings", cltorch_dumpTimings}, 219 | {"setProfiling", cltorch_setProfiling}, 220 | {"setEnableTiming", cltorch_setEnableTiming}, 221 | {"setDetailedTimings", cltorch_setDetailedTimings}, 222 | {"setTiming", cltorch_setEnableTiming}, 223 | {"dumpProfiling", cltorch_dumpProfiling}, 224 | {"about", cltorch_about}, 225 | {NULL, NULL} 226 | }; 227 | } 228 | 229 | int luaopen_libcltorch( lua_State *L ) { 230 | try { 231 | lua_newtable(L); 232 | luaL_setfuncs(L, cltorch::cltorch_stuff__, 0); 233 | 234 | THClState* state = (THClState*)malloc(sizeof(THClState)); 235 | THClInit(state); 236 | 237 | cltorch_ClStorage_init(L); 238 | cltorch_ClTensor_init(L); 239 | cltorch_ClTensorMath_init(L); 240 | cltorch_ClTensorOperator_init(L); 241 | cltorch_UserKernel_init(L); 242 | 243 | lua_pushlightuserdata(L, state); 244 | lua_setfield(L, -2, "_state"); 245 | } catch(runtime_error &e) { 246 | THError("Something went wrong: %s", e.what()); 247 | } 248 | return 1; 249 | } 250 | 251 | -------------------------------------------------------------------------------- /src/init.lua: -------------------------------------------------------------------------------- 1 | require "torch" 2 | 3 | -- check we are installed from distro, otherwise error message and exit... 
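-- For reference, the C functions registered from init.cpp above form the
-- module-level API. A minimal usage sketch (assumes at least one OpenCL
-- device; device indices are 1-based on the Lua side, see
-- cltorch_getDevice/cltorch_setDevice above):
--
--   require 'cltorch'
--   print(cltorch.getDeviceCount())
--   cltorch.setDevice(1)
--   local props = cltorch.getDeviceProperties(1)
--   for k, v in pairs(props) do print(k, v) end
--   cltorch.synchronize()  -- 'cltorch.finish' is registered as an alias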
4 | 5 | require 'os' 6 | 7 | xpcall(function() 8 | require 'distrocheck' 9 | end, function() 10 | print('Please install cltorch from distro, per instructions at https://github.com/hughperkins/cltorch') 11 | os.exit(1) 12 | end) 13 | 14 | -- store old copy functions, in case cutorch has been loaded 15 | -- note that this only works if cutorch is loaded first 16 | 17 | local torchtypes = {} 18 | table.insert(torchtypes, torch.DoubleTensor) 19 | table.insert(torchtypes, torch.FloatTensor) 20 | table.insert(torchtypes, torch.IntTensor) 21 | table.insert(torchtypes, torch.ByteTensor) 22 | table.insert(torchtypes, torch.CharTensor) 23 | table.insert(torchtypes, torch.ShortTensor) 24 | table.insert(torchtypes, torch.LongTensor) 25 | 26 | for i,torchtype in ipairs(torchtypes) do 27 | torchtype.cloldcopy = torchtype.copy 28 | end 29 | 30 | cltorch = paths.require("libcltorch") 31 | 32 | for i,torchtype in ipairs(torchtypes) do 33 | torchtype.clnewcopy = torchtype.copy 34 | end 35 | 36 | for i,torchtype in ipairs(torchtypes) do 37 | torchtype.copy = function (self, two) 38 | if(torch.type(two) == "torch.ClTensor") then 39 | torchtype.clnewcopy(self, two) 40 | else 41 | torchtype.cloldcopy(self, two) 42 | end 43 | return self 44 | end 45 | end 46 | 47 | -- convert to FloatStorage first, rather than repeatedly 48 | -- calling 'get' on ClStorage 49 | function torch.ClStorage.__tostring__(self) 50 | floatstorage = torch.FloatStorage(self:size()) 51 | floatstorage:copy(self) 52 | return string.gsub(floatstorage:__tostring__(), 'FloatStorage', 'ClStorage') 53 | end 54 | 55 | function torch.ClTensor.__tostring__(self) 56 | if self:size():size() ~= 0 then 57 | return torch.FloatTensor.__tostring__(self) 58 | else 59 | return tostring(self:s()) .. '\n[torch.ClTensor of 0 dimensions]' 60 | end 61 | end 62 | 63 | --torch.ClStorage.__tostring__ = torch.FloatStorage.__tostring__ 64 | --torch.ClTensor.__tostring__ = torch.FloatTensor.__tostring__ 65 | 66 | include('Test.lua') 67 | include('Tensor.lua') 68 | include('Random.lua') 69 | include('FFI.lua') 70 | --include('test.lua') 71 | 72 | --local unpack = unpack or table.unpack 73 | 74 | return cltorch 75 | 76 | -------------------------------------------------------------------------------- /src/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | OPTION(DEV_RUN_COG "cltorch maintainers only, otherwise set to 'OFF'." 
OFF) 4 | 5 | SET(CMAKE_C_FLAGS "-std=c99") 6 | #SET(src 7 | # THCGeneral.c THCAllocator.c THCStorage.c THCStorageCopy.c THCTensor.c THCTensorCopy.c) 8 | SET(src 9 | THClGeneral.cpp THClStorage.cpp THClStorageCopy.cpp THClTensor.cpp THClTensorCopy.cpp THClTensorMath.cpp 10 | THClTensorMathPointwise.cpp THClReduceApplyUtils.cpp THClApply.cpp 11 | THClTensorMathCompare.cpp THClTensorMathCompareT.cpp 12 | THClTensorMathPairwise.cpp THClTensorMath2.cpp 13 | THClBlas.cpp THClTensorMathBlas.cpp THClBlas.cpp THClReduce.cpp 14 | THClTypeParseTraits.cpp THClReduceAll.cpp THClDeviceUtils.cpp 15 | THClTensorMasked.cpp THClTensorMathTransformReduce.cpp 16 | THClTensorIndex.cpp THClKernels.cpp THClTensorMathScan.cpp THClGather.cpp 17 | THClScatter.cpp ) 18 | set(src-cl) 19 | 20 | message("CLBLAS_INCLUDE_DIRS ${CLBLAS_INCLUDE_DIRS}") 21 | INCLUDE_DIRECTORIES(${CLBLAS_INCLUDE_DIRS}) 22 | message("${CMAKE_CURRENT_SOURCE_DIR}/src/boost-headers-lite") 23 | INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../boost-headers-lite) 24 | 25 | add_definitions(-DCL_USE_DEPRECATED_OPENCL_1_1_APIS) # this affects clew... 26 | 27 | ADD_LIBRARY(THCl SHARED ${src} ${src-cl}) 28 | TARGET_LINK_LIBRARIES(THCl TH ) 29 | #message("DEEPCL_LIBRARIES ${EASYCL_LIBRARIES}") 30 | #TARGET_LINK_LIBRARIES( THCl ${EASYCL_LIBRARIES} ) 31 | target_link_libraries(THCl EasyCL) 32 | target_link_libraries(THCl clew) 33 | TARGET_LINK_LIBRARIES( THCl clBLAS) 34 | #add_dependencies( THCl clBLAS ) 35 | #add_dependencies( THCl EasyCL ) 36 | add_dependencies( THCl EasyCL-external ) 37 | add_dependencies( THCl clBLAS-external ) 38 | 39 | if(DEV_RUN_COG) 40 | add_custom_target( 41 | cog_thcl 42 | python ${CMAKE_CURRENT_SOURCE_DIR}/../EasyCL/thirdparty/cogapp/cog.py -q -I ${CMAKE_CURRENT_SOURCE_DIR}/../EasyCL/cog-batteries -r ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h 43 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 44 | ) 45 | add_dependencies( THCl cog_thcl ) 46 | endif(DEV_RUN_COG) 47 | 48 | INSTALL(FILES 49 | THCl.h 50 | THClGeneral.h 51 | THClBlas.h 52 | THClStorage.h 53 | THClStorageCopy.h 54 | THClTensor.h 55 | THClTensorCopy.h 56 | # THClTensorRandom.h 57 | THClTensorMath.h 58 | # THClTensorConv.h 59 | # THClTensorSort.h 60 | THClApply.h 61 | THClReduce.h 62 | THClReduceApplyUtils.h 63 | THClKernels.h 64 | THClOperators.h 65 | # THClAllocator.h 66 | DESTINATION "${Torch_INSTALL_INCLUDE_SUBDIR}/THCl") 67 | 68 | INSTALL(TARGETS THCl 69 | RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" 70 | LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" 71 | ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") 72 | 73 | -------------------------------------------------------------------------------- /src/lib/THCl.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_INC 2 | #define THCL_INC 3 | 4 | #include "THClGeneral.h" 5 | //#include "THClAllocator.h" 6 | //#include "THClBlas.h" 7 | #include "THClStorage.h" 8 | #include "THClStorageCopy.h" 9 | #include "THClTensor.h" 10 | #include "THClTensorCopy.h" 11 | //#include "THClTensorRandom.h" 12 | #include "THClTensorMath.h" 13 | //#include "THClTensorConv.h" 14 | //#include "THClTensorSort.h" 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/lib/THClApply.cl: -------------------------------------------------------------------------------- 1 | // OpenCL kernels.... 
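// For orientation, a sketch of what this template might expand to for two
// tensors in the dims == -2 case (the branch below that treats a tensor as a
// flat array -- an assumption for this sketch), one scalar, no point tensors,
// and operation "*out = *in1 + val1". Illustrative only; real kernels are
// rendered from this file at runtime, per operation:
//
//   static inline void op(global float *out, global float *in1, float val1) {
//     *out = *in1 + val1;
//   }
//   kernel void THClTensor_pointwiseApplyD(
//       int offset_1, global float *data_1,
//       int offset_2, global float *data_2,
//       float val1, int totalElements) {
//     int linearIndex = get_global_id(0);
//     if(linearIndex < totalElements) {
//       int derived_offset_1 = linearIndex + offset_1;
//       int derived_offset_2 = linearIndex + offset_2;
//       op(&(data_1[derived_offset_1]), &(data_2[derived_offset_2]), val1);
//     }
//   }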
2 | 3 | // expected templated values: 4 | // dims (vector of unique dimension values) 5 | // operation 6 | // dim1 7 | // dim2 8 | // dim3 9 | // ... dimD 10 | // num_input_tensors 11 | // include_scalar_input 12 | // 13 | // maybe should add: 14 | // IndexType (hardcoded to int for now) 15 | // MAX_CUTORCH_DIMS (hardcoded to 25 for now) 16 | 17 | // (Ported from cutorch's THCApply.cuh) 18 | 19 | // Maximum number of dimensions allowed for cutorch 20 | // #define MAX_CUTORCH_DIMS 25 21 | 22 | // Enum that indicates whether tensor arguments are read/write or 23 | // read-only 24 | //enum TensorArgType { ReadWrite, ReadOnly }; 25 | 26 | {% 27 | local total_opsize = num_tensors 28 | if include_scalar_input then 29 | total_opsize = total_opsize + 1 30 | end 31 | %} 32 | 33 | static inline void op( global float *out 34 | {% for t=1,(num_tensors-1) do %} 35 | , global float *in{{t}} 36 | {% end %} 37 | {% for s=1,(num_scalars) do %} 38 | , float val{{s}} 39 | {% end %} 40 | {% for pt=1,num_point_tensors do %} 41 | , global float *pointTensor{{pt}} 42 | {% end %} 43 | ) { 44 | {{operation}}; 45 | } 46 | 47 | kernel void 48 | THClTensor_pointwiseApplyD( 49 | {% for t=1,num_tensors do %} 50 | int offset_{{t}}, 51 | {% local thisdims = loadstring('return dims' .. t)() %} 52 | {% for d=1,thisdims do %} 53 | int size_{{t}}_{{d}}, 54 | int stride_{{t}}_{{d}}, 55 | {% end %} 56 | global float*data_{{t}}, 57 | {% end %} 58 | {% for i=1,num_scalars do %} 59 | float val{{i}}, 60 | {% end %} 61 | {% for i=1,num_point_tensors do %} 62 | global float *pointTensor{{i}}, 63 | {% end %} 64 | int totalElements) { 65 | int linearIndex = get_global_id(0); 66 | if(linearIndex < totalElements ) { 67 | {% if declare_linear_index then %} 68 | int thisLinearId; 69 | {% end %} 70 | {% for t=1,num_tensors do %} 71 | {% local thisdims = loadstring('return dims' .. t)() %} 72 | {% if thisdims == -2 then %} 73 | int derived_offset_{{t}} = linearIndex + offset_{{t}}; 74 | {% else %} 75 | {{IndexType}} derived_offset_{{t}} = offset_{{t}}; 76 | thisLinearId = linearIndex; 77 | {% for d=thisdims,1,-1 do %} // bake this in.... 78 | derived_offset_{{t}} += (thisLinearId % size_{{t}}_{{d}}) * stride_{{t}}_{{d}}; 79 | {% if d > 0 then %} 80 | thisLinearId /= size_{{t}}_{{d}}; 81 | {% end %} 82 | {% end %} 83 | 84 | {% end %} 85 | {% end %} 86 | 87 | op( 88 | {% for t=1,num_tensors do %} 89 | {% if t > 1 then %} , {% end %} 90 | &(data_{{t}}[derived_offset_{{t}}]) 91 | {% end %} 92 | 93 | {% for s=1,num_scalars do %} 94 | , val{{s}} 95 | {% end %} 96 | 97 | {% for pt=1,num_point_tensors do %} 98 | , pointTensor{{pt}} 99 | {% end %} 100 | ); 101 | } 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClApply.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_APPLY_INC 2 | #define THCL_APPLY_INC 3 | 4 | #include "THClGeneral.h" 5 | #include "THClTensor.h" 6 | #include "THClOperators.h" 7 | #include "THClReduceApplyUtils.h" 8 | 9 | // 10 | // This file contains pointwise operation functions and kernels that 11 | // work on both contiguous and non-contiguous tensor arguments of 12 | // arbitrary (up to MAX_CLTORCH_DIMS) dimensioned arguments without 13 | // copying or temporary storage. 
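//
// A typical call site might look like the sketch below. TensorAddOp here is
// hypothetical, and assumes the HasOperator2 interface (declared in
// THClOperators.h, not shown here) exposes the operation as an OpenCL
// expression string in which 'out' and 'in1' name the kernel-side arguments:
//
//   class TensorAddOp : public HasOperator2 {
//   public:
//     std::string operator2() const { return "*out += *in1"; }
//   };
//
//   TensorAddOp op;
//   bool ok = THClTensor_pointwiseApply2(state, dst, src, &op);
//   // a false return means the tensors could not be handled,
//   // e.g. more than MAX_CLTORCH_DIMS dimensions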
14 | // 15 | 16 | int getWorkgroupSize(THClState *state, int device); 17 | dim3 getApplyBlock(THClState *state, int device); 18 | dim3 getApplyGrid(THClState* state, int device, long totalElements); 19 | 20 | bool THClTensor_pointwiseApply1(THClState* state, 21 | THClTensor* a, 22 | const HasOperator1 *op, 23 | TensorArgType aType = ReadWrite); 24 | bool THClTensor_pointwiseApply2(THClState* state, 25 | THClTensor* a, 26 | THClTensor* b, 27 | const HasOperator2 *op, 28 | TensorArgType aType = ReadWrite, 29 | TensorArgType bType = ReadOnly); 30 | bool THClTensor_pointwiseApply3(THClState* state, 31 | THClTensor* a, 32 | THClTensor* b, 33 | THClTensor* c, 34 | const HasOperator3 *op, 35 | TensorArgType aType = ReadWrite, 36 | TensorArgType bType = ReadOnly, 37 | TensorArgType cType = ReadOnly); 38 | 39 | #endif // THCL_APPLY_INC 40 | 41 | -------------------------------------------------------------------------------- /src/lib/THClBlas.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_BLAS_INC 2 | #define THCL_BLAS_INC 3 | 4 | #include "THClGeneral.h" 5 | 6 | //class THClTensor; 7 | struct THClTensor; 8 | class CLWrapper; 9 | 10 | typedef struct THClBlasState { 11 | // cublasHandle_t* handles; 12 | // cublasHandle_t* current_handle; 13 | // int n_devices; 14 | } THClBlasState; 15 | 16 | /* Level 1 */ 17 | THCL_API void THClBlas_swap(THClState *state, long n, float *x, long incx, float *y, long incy); 18 | THCL_API void THClBlas_scal(THClState *state, long n, float a, float *x, long incx); 19 | THCL_API void THClBlas_copy(THClState *state, long n, float *x, long incx, float *y, long incy); 20 | THCL_API void THClBlas_axpy(THClState *state, long n, float a, float *x, long incx, float *y, long incy); 21 | THCL_API float THClBlas_dot(THClState *state, long n, 22 | CLWrapper *xwrapper, long xoffset, long incx, 23 | CLWrapper *ywrapper, long yoffset, long incy); 24 | 25 | /* Level 2 */ 26 | THCL_API void THClBlas_gemv(THClState *state, char trans, long m, long n, float alpha, 27 | THClTensor *a, long lda, 28 | THClTensor *x, long incx, 29 | float beta, 30 | THClTensor *y, long incy); 31 | 32 | THCL_API void THClBlas_ger(THClState *state, long m, long n, float alpha, 33 | THClTensor *x, long incx, 34 | THClTensor *y, long incy, 35 | THClTensor *a, long lda); 36 | 37 | /* Level 3 */ 38 | THCL_API void THClBlas_gemm(THClState *state, char transa, char transb, 39 | long m, long n, long k, float alpha, 40 | THClTensor *a, long lda, THClTensor *b, long ldb, float beta, THClTensor *c, long ldc); 41 | 42 | THCL_API void THClBlas_gemmBatched(THClState *state, char transa, char transb, long m, long n, long k, 43 | float alpha, CLWrapper *aWrapper, long lda, CLWrapper *bWrapper, long ldb, 44 | float beta, CLWrapper *cWrapper, long ldc, long batchCount); 45 | 46 | #endif 47 | 48 | -------------------------------------------------------------------------------- /src/lib/THClDeviceUtils.cl: -------------------------------------------------------------------------------- 1 | static inline {{IndexType}} THClCeilDiv({{IndexType}} a, {{IndexType}} b) { 2 | return (a + b - 1) / b; 3 | } 4 | 5 | -------------------------------------------------------------------------------- /src/lib/THClDeviceUtils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "THClDeviceUtils.h" 4 | 5 | #define DEFINE_THCLCEILDIV(TYPE) \ 6 | TYPE THClCeilDiv(TYPE a, TYPE b) { \ 7 | return (a + b - 1) / b; \ 8 | } 9 | 10 | 
DEFINE_THCLCEILDIV(uint32); 11 | DEFINE_THCLCEILDIV(uint64); 12 | DEFINE_THCLCEILDIV(int32); 13 | DEFINE_THCLCEILDIV(int64); 14 | 15 | //template uint64 THClCeilDiv(uint64 a, uint64 b); 16 | //template uint32 THClCeilDiv(uint32 a, uint32 b); 17 | //template int64 THClCeilDiv(int64 a, int64 b); 18 | //template int32 THClCeilDiv(int32 a, int32 b); 19 | 20 | std::string THClDeviceUtils_getKernelTemplate() { 21 | // [[[cog 22 | // import stringify 23 | // stringify.write_kernel( "kernel", "THClDeviceUtils.cl" ) 24 | // ]]] 25 | // generated using cog, from THClDeviceUtils.cl: 26 | const char * kernelSource = 27 | "static inline {{IndexType}} THClCeilDiv({{IndexType}} a, {{IndexType}} b) {\n" 28 | " return (a + b - 1) / b;\n" 29 | "}\n" 30 | "\n" 31 | ""; 32 | // [[[end]]] 33 | return kernelSource; 34 | } 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/lib/THClDeviceUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_DEVICE_UTILS_INC 2 | #define THCL_DEVICE_UTILS_INC 3 | 4 | #include "THClGeneral.h" 5 | #include 6 | 7 | /** 8 | Computes ceil(a / b) 9 | */ 10 | //template 11 | //T THClCeilDiv(T a, T b); 12 | 13 | #define DECLARE_THCLCEILDIV(TYPE) \ 14 | TYPE THClCeilDiv(TYPE a, TYPE b); 15 | 16 | DECLARE_THCLCEILDIV(uint32); 17 | DECLARE_THCLCEILDIV(uint64); 18 | DECLARE_THCLCEILDIV(int32); 19 | DECLARE_THCLCEILDIV(int64); 20 | 21 | std::string THClDeviceUtils_getKernelTemplate(); 22 | 23 | #endif // THCL_DEVICE_UTILS_INC 24 | 25 | -------------------------------------------------------------------------------- /src/lib/THClGather.cl: -------------------------------------------------------------------------------- 1 | // probably should put this on its own somewhere, so we 2 | // dont have to either ocpy/paste, or include entire THClReduceApplyUtils 3 | typedef struct TensorInfoCl { 4 | unsigned int sizes[{{MAX_CLTORCH_DIMS}}]; 5 | unsigned int strides[{{MAX_CLTORCH_DIMS}}]; 6 | int offset; 7 | int dims; 8 | } TensorInfoCl; 9 | 10 | kernel void THClTensor_kernel_gather( 11 | global TensorInfoCl *dst_info, global float*dst_data, 12 | global const TensorInfoCl *src_info, global float*src_data, 13 | int dim, 14 | global const TensorInfoCl *idx_info, global float*idx_data, 15 | int totalElements 16 | ) 17 | { 18 | for (int _linearId = get_global_id(0); 19 | _linearId < totalElements; 20 | _linearId += get_global_size(0)) { 21 | 22 | // plan is: 23 | // based on our linearIndex, this gets us a spot in the index 24 | // tensor 25 | // this is also a spot in the tgt_data (at least, if we can 26 | // convert into actual coordinates, then it is the coordinates 27 | // in the target tensor 28 | // the coordinates in the source are teh same, except that 29 | // we replace that of dimension dim with the value from 30 | // the index tensor 31 | // 32 | // so, everything hinges on us getting the coordinates, I think? 
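// (concretely: peel coordinates off linearId starting from the last
// dimension, e.g. with sizes {2, 3} a linearId of 4 splits as 4 % 3 = 1,
// then 4 / 3 = 1, giving coordinates (1, 1); each coordinate then
// contributes coord * stride to that tensor's offset -- worked example
// added for orientation)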
33 | // so, lets do that :-) 34 | int idxOffset = idx_info->offset; 35 | int srcOffset = src_info->offset; 36 | int dstOffset = dst_info->offset; 37 | int linearId = _linearId; // copy it, since we'll modify it 38 | // for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbkaed loop for now, to 39 | // get it working 40 | int curDimIndex; 41 | {% for d=dims-1,0,-1 do %} 42 | curDimIndex = linearId % idx_info->sizes[{{d}}]; 43 | idxOffset += curDimIndex * idx_info->strides[{{d}}]; 44 | dstOffset += curDimIndex * dst_info->strides[{{d}}]; 45 | if( {{d}} != dim ) { // this only matters for the source, the others are 46 | // unaffected by which dimension we are on. I think. 47 | srcOffset += curDimIndex * src_info->strides[{{d}}]; 48 | } 49 | linearId /= idx_info->sizes[{{d}}]; 50 | {% end %} 51 | // } 52 | // now we have the idxoffset. get the value at that location 53 | int idxValue = idx_data[idxOffset] - 1; // subtract 1, because 1-based 54 | // then use this to get the final value for srcOffset 55 | srcOffset += idxValue * src_info->strides[dim]; 56 | // get the value... 57 | float value = src_data[srcOffset]; 58 | // and save it up... 59 | dst_data[dstOffset] = value; 60 | // thats it? 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /src/lib/THClGather.cpp: -------------------------------------------------------------------------------- 1 | #include "THClTensorMath.h" 2 | #include "THClGeneral.h" 3 | //#include "THClBlas.h" 4 | #include "THClTensorCopy.h" 5 | //#include "THClTensorRandom.h" 6 | #include "THClApply.h" 7 | #include "THClReduce.h" 8 | #include "THClKernels.h" 9 | #include "THClReduceApplyUtils.h" 10 | 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | static std::string getTemplate(); 16 | 17 | THCL_API void THClTensor_gather(THClState *state, THClTensor *self, THClTensor *src, long dim, THClTensor *index) { 18 | StatefulTimer::timeCheck("THClTensor_kernel_Gather START"); 19 | 20 | // src will be ndims 21 | // index will be ndims too, though one of the dims should have length 1 22 | // self will be ndims 23 | int nDims = src->nDimension; 24 | 25 | THArgCheck(nDims >= 2, 2, "Tensors should have at least 2 dimensions"); // I guess? 26 | THArgCheck(src->nDimension == nDims, 2, "All tensors should have same number of dims"); 27 | THArgCheck(index->nDimension == nDims, 4, "All tensors should have same number of dims"); 28 | THArgCheck(dim < nDims, 4, "dim out of bounds"); 29 | THArgCheck(dim >= 0, 4, "dim out of bounds"); 30 | THArgCheck(nDims < MAX_CLTORCH_DIMS, 2, "Tensors should have less than %i dimensions", MAX_CLTORCH_DIMS); // I guess? 
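// Semantics, for orientation (a worked example; values made up; index values
// are 1-based, the kernel subtracts 1): each output element takes the source
// element whose coordinate along `dim` is replaced by the corresponding index
// value. For 2D tensors and dim == 0, self[i][j] = src[ index[i][j] ][ j ],
// so src = {{1, 2}, {3, 4}} with index = {{2, 1}, {1, 2}} yields
// self = {{3, 2}, {1, 4}}.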
31 | 32 | // THLongStorage *newSize; 33 | 34 | for( int i = 0; i < nDims; i++ ) { 35 | if( i != dim ) { 36 | THArgCheck(THClTensor_size(state, src, i) == THClTensor_size(state, index, i), 3, ("index tensor must have same dimensions as source tensor, but dimension " + easycl::toString(i) + " doesnt match").c_str()); 37 | } 38 | } 39 | 40 | const int device = src->storage->device; 41 | 42 | // hmmm ,I wonder if we need this any more, after migration to TensorMath.lua 43 | // if( self != src ) { 44 | // newSize = THLongStorage_newWithSize(index->nDimension); 45 | // THLongStorage_rawCopy(newSize, index->size); 46 | // THClTensor_resize(state, self, newSize, NULL); 47 | // THLongStorage_free(newSize); 48 | // } 49 | 50 | // since self is write-only, and index and src are read-only, ie none are read-write 51 | // so, we dnot need to worry about contiguity (at least, not from point of view of correctness) 52 | 53 | 54 | std::string uniqueName = __FILE__ ":gather:" + easycl::toString(nDims); 55 | EasyCL *cl = THClTensor_getCl(state, src); 56 | CLKernel *kernel = 0; 57 | if(cl->kernelExists(uniqueName)) { 58 | kernel = cl->getKernel(uniqueName); 59 | StatefulTimer::timeCheck("Apply3 1aa"); 60 | } else { 61 | TemplatedKernel kernelBuilder(cl); 62 | kernelBuilder.set("IndexType", "unsigned int"); 63 | kernelBuilder.set("dims", nDims); 64 | kernelBuilder.set("MAX_CLTORCH_DIMS", MAX_CLTORCH_DIMS); 65 | kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getTemplate(), "THClTensor_kernel_gather" ); 66 | } 67 | 68 | TensorInfoCl selfInfoCl(self); 69 | TensorInfoCl srcInfoCl(src); 70 | TensorInfoCl indexInfoCl(index); 71 | 72 | const dim3 block = getApplyBlock(state, device); 73 | 74 | long totalElements = THClTensor_nElement(state, index); 75 | dim3 grid = getApplyGrid(state, device, totalElements); 76 | 77 | THClKernels k(state, kernel); 78 | kernel->in(1, &selfInfoCl); 79 | kernel->out(self->storage->wrapper); 80 | kernel->in(1, &srcInfoCl); 81 | kernel->in(src->storage->wrapper); 82 | k.in((int)dim); 83 | kernel->in(1, &indexInfoCl); 84 | kernel->in(index->storage->wrapper); 85 | if( totalElements > ( 1l << 30 )) { 86 | throw std::runtime_error("Error: out of bounds for totalelements=" + easycl::toString(totalElements)); 87 | } 88 | k.in( (int)totalElements ); 89 | k.run(grid, block); 90 | 91 | if(state->addFinish) cl->finish(); 92 | StatefulTimer::timeCheck("THClTensor_kernel_Gather END"); 93 | } 94 | 95 | static std::string getTemplate() { 96 | // [[[cog 97 | // import stringify 98 | // stringify.write_kernel( "kernel", "THClGather.cl" ) 99 | // ]]] 100 | // generated using cog, from THClGather.cl: 101 | const char * kernelSource = 102 | "// probably should put this on its own somewhere, so we\n" 103 | "// dont have to either ocpy/paste, or include entire THClReduceApplyUtils\n" 104 | "typedef struct TensorInfoCl {\n" 105 | " unsigned int sizes[{{MAX_CLTORCH_DIMS}}];\n" 106 | " unsigned int strides[{{MAX_CLTORCH_DIMS}}];\n" 107 | " int offset;\n" 108 | " int dims;\n" 109 | "} TensorInfoCl;\n" 110 | "\n" 111 | "kernel void THClTensor_kernel_gather(\n" 112 | " global TensorInfoCl *dst_info, global float*dst_data,\n" 113 | " global const TensorInfoCl *src_info, global float*src_data,\n" 114 | " int dim,\n" 115 | " global const TensorInfoCl *idx_info, global float*idx_data,\n" 116 | " int totalElements\n" 117 | ")\n" 118 | "{\n" 119 | " for (int _linearId = get_global_id(0);\n" 120 | " _linearId < totalElements;\n" 121 | " _linearId += get_global_size(0)) {\n" 122 | "\n" 123 | " // plan is:\n" 
124 | " // based on our linearIndex, this gets us a spot in the index\n" 125 | " // tensor\n" 126 | " // this is also a spot in the tgt_data (at least, if we can\n" 127 | " // convert into actual coordinates, then it is the coordinates\n" 128 | " // in the target tensor\n" 129 | " // the coordinates in the source are teh same, except that\n" 130 | " // we replace that of dimension dim with the value from\n" 131 | " // the index tensor\n" 132 | " //\n" 133 | " // so, everything hinges on us getting the coordinates, I think?\n" 134 | " // so, lets do that :-)\n" 135 | " int idxOffset = idx_info->offset;\n" 136 | " int srcOffset = src_info->offset;\n" 137 | " int dstOffset = dst_info->offset;\n" 138 | " int linearId = _linearId; // copy it, since we'll modify it\n" 139 | "// for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbkaed loop for now, to\n" 140 | " // get it working\n" 141 | " int curDimIndex;\n" 142 | " {% for d=dims-1,0,-1 do %}\n" 143 | " curDimIndex = linearId % idx_info->sizes[{{d}}];\n" 144 | " idxOffset += curDimIndex * idx_info->strides[{{d}}];\n" 145 | " dstOffset += curDimIndex * dst_info->strides[{{d}}];\n" 146 | " if( {{d}} != dim ) { // this only matters for the source, the others are\n" 147 | " // unaffected by which dimension we are on. I think.\n" 148 | " srcOffset += curDimIndex * src_info->strides[{{d}}];\n" 149 | " }\n" 150 | " linearId /= idx_info->sizes[{{d}}];\n" 151 | " {% end %}\n" 152 | "// }\n" 153 | " // now we have the idxoffset. get the value at that location\n" 154 | " int idxValue = idx_data[idxOffset] - 1; // subtract 1, because 1-based\n" 155 | " // then use this to get the final value for srcOffset\n" 156 | " srcOffset += idxValue * src_info->strides[dim];\n" 157 | " // get the value...\n" 158 | " float value = src_data[srcOffset];\n" 159 | " // and save it up...\n" 160 | " dst_data[dstOffset] = value;\n" 161 | " // thats it?\n" 162 | " }\n" 163 | "}\n" 164 | "\n" 165 | ""; 166 | // [[[end]]] 167 | return kernelSource; 168 | } 169 | 170 | -------------------------------------------------------------------------------- /src/lib/THClGeneral.cpp: -------------------------------------------------------------------------------- 1 | #include "THClGeneral.h" 2 | #include "TH.h" 3 | 4 | #include 5 | #include "EasyCL.h" 6 | #include 7 | #include "DeviceInfo.h" 8 | 9 | //using namespace easycl; 10 | 11 | //#include "THCTensorRandom.h" 12 | //#include "THCBlas.h" 13 | //#include "THCAllocator.h" 14 | 15 | /* Size of scratch space available in global memory per each SM + stream */ 16 | #define FLOATS_PER_SCRATCH_SPACE 4 17 | #define GLOBAL_SCRATCH_SPACE_PER_SM_STREAM (FLOATS_PER_SCRATCH_SPACE) * sizeof(float) 18 | 19 | void THCl_initializeState(THClState *state) { 20 | if(state->initialized) { 21 | return; 22 | } 23 | state->initialized = 1; 24 | if(state->allowNonGpus) { 25 | state->allocatedDevices = easycl::DevicesInfo::getNumDevices(); 26 | } else { 27 | state->allocatedDevices = easycl::DevicesInfo::getNumGpus(); 28 | } 29 | state->clByDevice = new EasyCL *[state->allocatedDevices]; 30 | state->scratchSpaceByDevice = new THClScratchSpace *[state->allocatedDevices]; 31 | state->trace = 0; 32 | state->detailedTimings = 0; 33 | state->addFinish = 0; 34 | // state->workgroupSizeByDevice = new int[state->allocatedDevices]; 35 | state->deviceInfoByDevice = (DeviceInfo **)new easycl::DeviceInfo *[state->allocatedDevices]; 36 | for(int i = 0; i < state->allocatedDevices; i++) { 37 | state->clByDevice[i] = 0; 38 | state->scratchSpaceByDevice[i] = 0; 39 | 
state->deviceInfoByDevice[i] = 0; 40 | } 41 | state->currentDevice = 0; 42 | //state->cl = EasyCL::createForFirstGpuOtherwiseCpu(); // obviously this should change... 43 | 44 | cl_int err; 45 | 46 | err = clblasSetup(); 47 | if (err != CL_SUCCESS) { 48 | THError("clblasSetup() failed with %d", err); 49 | } 50 | } 51 | void THClSetAllowNonGpus(THClState *state, int allowNonGpus) { 52 | if(state->initialized) { 53 | THError("cannot set allowNonGpus after initialization done"); 54 | } else { 55 | state->allowNonGpus = allowNonGpus; 56 | } 57 | } 58 | void THClInit(THClState* state) 59 | { 60 | state->initialized = 0; 61 | state->allowNonGpus = 0; 62 | state->trace = 0; 63 | state->detailedTimings = 0; 64 | state->addFinish = 0; 65 | state->currentDevice = 0; 66 | state->allocatedDevices = 0; 67 | 68 | state->clByDevice = 0; 69 | state->scratchSpaceByDevice = 0; 70 | state->deviceInfoByDevice = 0; 71 | } 72 | 73 | void THClShutdown(THClState* state) 74 | { 75 | if(state->initialized == 0) { 76 | return; 77 | } 78 | 79 | clblasTeardown(); 80 | for( int i = 0; i < state->allocatedDevices; i++ ) { 81 | delete state->clByDevice[i]; 82 | delete state->scratchSpaceByDevice[i]->wrapper; 83 | delete[] state->scratchSpaceByDevice[i]->data; 84 | delete (easycl::DeviceInfo*)state->deviceInfoByDevice[i]; 85 | } 86 | delete[] (easycl::DeviceInfo**)state->deviceInfoByDevice; 87 | delete[] state->clByDevice; 88 | delete[] state->scratchSpaceByDevice; 89 | state->initialized = 0; 90 | // delete[] state->workgroupSizeByDevice 91 | 92 | printf("THClShutdown() done\n"); 93 | printf("*******************************************\n"); 94 | } 95 | 96 | std::ostream &operator<<( std::ostream &os, const dim3 &obj ) { 97 | os << "dim3{" << obj.vec[0] << ", " << obj.vec[1] << ", " << obj.vec[2] << "}"; 98 | return os; 99 | } 100 | 101 | int THClState_getNumDevices(THClState* state) { 102 | if(state->initialized == 0) { 103 | THCl_initializeState(state); 104 | } 105 | return state->allocatedDevices; 106 | } 107 | void THClState_setDevice(THClState* state, int device) { 108 | if(state->initialized == 0) { 109 | THCl_initializeState(state); 110 | } 111 | state->currentDevice = device; 112 | } 113 | int THClState_getDevice(THClState* state) { 114 | if(state->initialized == 0) { 115 | THCl_initializeState(state); 116 | } 117 | return state->currentDevice; 118 | } 119 | EasyCL *THClState_getCl(THClState* state ) { 120 | if(state->initialized == 0) { 121 | THCl_initializeState(state); 122 | } 123 | return THClState_getClv2(state, state->currentDevice); 124 | } 125 | EasyCL *THClState_getCl(THClState* state, int *p_device) { 126 | if(state->initialized == 0) { 127 | THCl_initializeState(state); 128 | } 129 | if( p_device != 0 ) { 130 | *p_device = state->currentDevice; 131 | } 132 | return THClState_getClv2(state, state->currentDevice); 133 | } 134 | EasyCL *THClState_getClv2(THClState* state, int device) { 135 | if(!state->initialized) { 136 | THCl_initializeState(state); 137 | } 138 | if(state->allocatedDevices == 0) { 139 | THError("No OpenCL-enabled devices available"); 140 | } 141 | if(state->currentDevice >= state->allocatedDevices || state->currentDevice < 0) { 142 | THError("Please use setDevice to choose an available device first"); 143 | } 144 | if( state->clByDevice[device] == 0 ) { 145 | EasyCL *cl = 0; 146 | if(state->allowNonGpus) { 147 | cl = EasyCL::createForIndexedDevice(device); 148 | } else { 149 | cl = EasyCL::createForIndexedGpu(device); 150 | } 151 | state->clByDevice[device] = cl; 152 | THClScratchSpace 
*scratch = new THClScratchSpace(); 153 | scratch->data = new float[FLOATS_PER_SCRATCH_SPACE]; 154 | scratch->wrapper = cl->wrap(FLOATS_PER_SCRATCH_SPACE, scratch->data); 155 | scratch->wrapper->createOnDevice(); 156 | state->scratchSpaceByDevice[device] = scratch; 157 | state->deviceInfoByDevice[device] = (DeviceInfo *)new easycl::DeviceInfo(); 158 | if(state->allowNonGpus) { 159 | *((easycl::DeviceInfo *)state->deviceInfoByDevice[device]) = easycl::DevicesInfo::getDeviceInfo( device ); 160 | } else { 161 | *((easycl::DeviceInfo *)state->deviceInfoByDevice[device]) = easycl::DevicesInfo::getGpuInfo( device ); 162 | } 163 | } 164 | return state->clByDevice[device]; 165 | } 166 | 167 | THClScratchSpace* THClState_getDeviceScratchSpace(THClState* state, int device, int stream) 168 | { 169 | if(state->initialized == 0) { 170 | THCl_initializeState(state); 171 | } 172 | if( stream != 0 ) { 173 | THError("%d is not a stream", stream); 174 | } 175 | return state->scratchSpaceByDevice[device]; 176 | } 177 | 178 | size_t THClState_getCurrentDeviceScratchSpaceSize(THClState* state) 179 | { 180 | if(state->initialized == 0) { 181 | THCl_initializeState(state); 182 | } 183 | int device = state->currentDevice; 184 | return THClState_getDeviceScratchSpaceSize(state, device); 185 | } 186 | 187 | size_t THClState_getDeviceScratchSpaceSize(THClState* state, int device) 188 | { 189 | if(state->initialized == 0) { 190 | THCl_initializeState(state); 191 | } 192 | 193 | return GLOBAL_SCRATCH_SPACE_PER_SM_STREAM; // true currently since we only have 194 | // one stream per device, currently 195 | } 196 | 197 | -------------------------------------------------------------------------------- /src/lib/THClGeneral.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_GENERAL_INC 2 | #define THCL_GENERAL_INC 3 | 4 | #include "THGeneral.h" 5 | #include "THAllocator.h" 6 | #undef log1p 7 | 8 | #ifdef __cplusplus 9 | # define THCL_EXTERNC extern "C" 10 | # define THCL_EXTERNCPP extern 11 | #else 12 | # define THCL_EXTERNC extern 13 | #endif 14 | 15 | #ifdef WIN32 16 | # ifdef THCL_EXPORTS 17 | # define THCL_API THCL_EXTERNC __declspec(dllexport) 18 | # else 19 | # define THCL_API THCL_EXTERNC __declspec(dllimport) 20 | # endif 21 | #else 22 | # define THCL_API THCL_EXTERNC 23 | #endif 24 | 25 | #ifdef __cplusplus 26 | #ifdef WIN32 27 | #ifdef THCL_EXPORTS 28 | #define THCL_API_CPP THCL_EXTERNCPP __declspec(dllexport) 29 | #else 30 | #define THCL_API_CPP THCL_EXTERNCPP __declspec(dllimport) 31 | #endif 32 | #else 33 | #define THCL_API_CPP THCL_EXTERNCPP 34 | #endif 35 | #endif 36 | 37 | //// from http://stackoverflow.com/questions/295120/c-mark-as-deprecated 38 | //#ifdef __GNUC__ 39 | //#define DEPRECATED __attribute__((deprecated)) 40 | //#elif defined(_MSC_VER) 41 | //#define DEPRECATED __declspec(deprecated) 42 | //#else 43 | //#pragma message("WARNING: You need to implement DEPRECATED for this compiler") 44 | //#define DEPRECATED 45 | //#endif 46 | 47 | #ifdef __GNUC__ 48 | #define DEPRECATED_POST __attribute__((deprecated)) 49 | #endif 50 | 51 | #ifdef __cplusplus 52 | //#define PTR_CLASS class* 53 | #define PTR_EASYCL EasyCL* 54 | #define PTR_DEVICEINFO DeviceInfo* 55 | #define PTR_CLWRAPPER CLWrapper* 56 | class EasyCL; 57 | class CLWrapper; 58 | class DeviceInfo; 59 | #else 60 | //#define PTR_CLASS struct* 61 | #define PTR_EASYCL struct EasyCL* 62 | #define PTR_DEVICEINFO struct DeviceInfo* 63 | #define PTR_CLWRAPPER struct CLWrapper* 64 | #endif 65 | 66 | #ifdef 
__cplusplus 67 | #include 68 | #endif // __cplusplus 69 | 70 | typedef struct THClScratchSpace { 71 | PTR_CLWRAPPER wrapper; 72 | float *data; 73 | } THClScratchSpace; 74 | 75 | /* Global state to be held in the cltorch table. */ 76 | typedef struct THClState 77 | { 78 | int initialized; 79 | int allowNonGpus; 80 | int allocatedDevices; 81 | int currentDevice; 82 | int trace; // default 0; set to 1 to see message for every gpu buffer alloc, delete, 83 | // or device <-> host transfer 84 | int addFinish; // default 0, should we add clFinish() after any kernel, enqueue, etc? 85 | // (good for debugging stuff, bad for perf) 86 | int detailedTimings; 87 | struct THClScratchSpace**scratchSpaceByDevice; // for now, do one 'stream' per device 88 | // can improve later... 89 | PTR_DEVICEINFO *deviceInfoByDevice; 90 | // int *workgroupSizeByDevice; 91 | PTR_EASYCL *clByDevice; 92 | // EasyCL *getCl(); 93 | } THClState; 94 | 95 | THCL_API void THCl_initializeState(THClState* state); 96 | THCL_API void THClSetAllowNonGpus(THClState *state, int allowNonGpus); 97 | THCL_API void THClInit(THClState* state); 98 | THCL_API void THClShutdown(THClState* state); 99 | //THCL_API void THClEnablePeerToPeerAccess(THClState* state); 100 | 101 | /* State manipulators and accessors */ 102 | THCL_API int THClState_getNumDevices(THClState* state); 103 | THCL_API void THClState_setDevice(THClState* state, int device); 104 | THCL_API int THClState_getDevice(THClState* state); 105 | THCL_API PTR_EASYCL THClState_getCl(THClState* state) DEPRECATED_POST; 106 | THCL_API PTR_EASYCL THClState_getClAndDevice(THClState* state, int *p_device) DEPRECATED_POST; 107 | THCL_API PTR_EASYCL THClState_getClv2(THClState* state, int device); 108 | 109 | //THCL_API void THClState_reserveStreams(THClState* state, int numStreams); 110 | //THCL_API int THClState_getNumStreams(THClState* state); 111 | 112 | //THCL_API cudaStream_t THClState_getDeviceStream(THClState *state, int device, int stream); 113 | //THCL_API cudaStream_t THClState_getCurrentStream(THClState *state); 114 | //THCL_API int THClState_getCurrentStreamIndex(THClState *state); 115 | //THCL_API void THClState_setStream(THClState *state, int device, int stream); 116 | //THCL_API void THClState_setStreamForCurrentDevice(THClState *state, int stream); 117 | 118 | //THCL_API void THClState_reserveBlasHandles(THClState* state, int numHandles); 119 | //THCL_API int THClState_getNumBlasHandles(THClState* state); 120 | 121 | //THCL_API clblasHandle_t THClState_getDeviceBlasHandle(THClState *state, int device, int handle); 122 | //THCL_API clblasHandle_t THClState_getCurrentBlasHandle(THClState *state); 123 | //THCL_API int THClState_getCurrentBlasHandleIndex(THClState *state); 124 | //THCL_API void THClState_setBlasHandle(THClState *state, int device, int handle); 125 | //THCL_API void THClState_setBlasHandleForCurrentDevice(THClState *state, int handle); 126 | 127 | /* For the current device and stream, returns the allocated scratch space */ 128 | THCL_API struct THClScratchSpace* THClState_getCurrentDeviceScratchSpace(THClState* state) DEPRECATED_POST; 129 | THCL_API struct THClScratchSpace* THClState_getDeviceScratchSpace(THClState* state, int device, int stream); 130 | THCL_API size_t THClState_getCurrentDeviceScratchSpaceSize(THClState* state) DEPRECATED_POST; 131 | THCL_API size_t THClState_getDeviceScratchSpaceSize(THClState* state, int device); 132 | 133 | //#define THClCheck(err) __THClCheck(err, __FILE__, __LINE__) 134 | //#define THCublasCheck(err) __THCublasCheck(err, 
__FILE__, __LINE__) 135 | 136 | //THCL_API void __THClCheck(cudaError_t err, const char *file, const int line); 137 | //THCL_API void __THCublasCheck(clblasStatus_t status, const char *file, const int line); 138 | 139 | typedef unsigned long long uint64; 140 | typedef unsigned int uint32; 141 | typedef long long int64; 142 | typedef int int32; 143 | 144 | // define dim3, since this came from cuda in cutorch 145 | #ifdef __cplusplus 146 | class dim3 { 147 | public: 148 | uint32 vec[3]; 149 | size_t vec_for_cl[3]; 150 | // size_t vec_size_t[3]; 151 | dim3() { 152 | vec[0] = 1; 153 | vec[1] = 1; 154 | vec[2] = 1; 155 | } 156 | dim3( uint32 x ) { 157 | vec[0] = x; 158 | vec[1] = 1; 159 | vec[2] = 1; 160 | } 161 | dim3( uint32 x, uint32 y ) { 162 | vec[0] = x; 163 | vec[1] = y; 164 | vec[2] = 1; 165 | } 166 | dim3( uint32 x, uint32 y, uint32 z ) { 167 | vec[0] = x; 168 | vec[1] = y; 169 | vec[2] = z; 170 | } 171 | inline uint32 x() { 172 | return vec[0]; 173 | } 174 | inline uint32 y() { 175 | return vec[1]; 176 | } 177 | inline uint32 z() { 178 | return vec[2]; 179 | } 180 | size_t const *as_size_t() { 181 | for( int i = 0; i < 3; i++ ) { 182 | vec_for_cl[i] = vec[i]; 183 | } 184 | return vec_for_cl; 185 | } 186 | }; 187 | 188 | std::ostream &operator<<( std::ostream &os, const dim3 &obj ); 189 | 190 | //typedef struct _dim3 { 191 | // int x; 192 | // int y; 193 | // int z; 194 | // _dim3( int x ) { 195 | // this->x = x; 196 | // y = 1; 197 | // z = 1; 198 | // } 199 | //} dim3; 200 | #endif // __cplusplus 201 | 202 | // seems that min is really inconsistent across standard libraires, lets just make our own ... :-/ 203 | static inline int THCl_min( int a, int b ) { 204 | return a < b ? a : b; 205 | } 206 | 207 | #endif 208 | 209 | -------------------------------------------------------------------------------- /src/lib/THClKernels.cpp: -------------------------------------------------------------------------------- 1 | #include "THClKernels.h" 2 | #include "EasyCL.h" 3 | #include "THClTensor.h" 4 | #include 5 | #include "THClReduceApplyUtils.h" 6 | #include "CLKernel_structs.h" 7 | 8 | #include 9 | using namespace std; 10 | 11 | // Constructor 12 | THClKernels::THClKernels(THClState *state, CLKernel *kernel) : 13 | state(state), 14 | kernel(kernel) { 15 | } 16 | THClKernels::~THClKernels() { 17 | for( int i = 0; i < (int)tensorInfoCls.size(); i++ ) { 18 | delete tensorInfoCls[i]; 19 | } 20 | } 21 | // CLTensors ===================== 22 | THClKernels *THClKernels::in(THClTensor *tensor) { 23 | try { 24 | kernel->in(THClTensor_wrapper(state, tensor)); 25 | kernel->in((int)THClTensor_storageOffset(state, tensor)); 26 | } catch( runtime_error &e ) { 27 | THError(e.what()); 28 | } 29 | return this; 30 | } 31 | THClKernels *THClKernels::inout(THClTensor *tensor) { 32 | try { 33 | kernel->inout(THClTensor_wrapper(state, tensor)); 34 | kernel->in((int)THClTensor_storageOffset(state, tensor)); 35 | } catch( runtime_error &e ) { 36 | THError(e.what()); 37 | } 38 | return this; 39 | } 40 | THClKernels *THClKernels::out(THClTensor *tensor) { 41 | try { 42 | kernel->out(THClTensor_wrapper(state, tensor)); 43 | kernel->in((int)THClTensor_storageOffset(state, tensor)); 44 | } catch( runtime_error &e ) { 45 | THError(e.what()); 46 | } 47 | return this; 48 | } 49 | // CLTensors v2 ===================== 50 | THClKernels *THClKernels::inv2(THClTensor *tensor) { 51 | try { 52 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensor); 53 | kernel->in(1, tensorInfoCl); 54 | kernel->in(THClTensor_wrapper(state, 
tensor)); 55 | tensorInfoCls.push_back(tensorInfoCl); 56 | } catch( runtime_error &e ) { 57 | THError(e.what()); 58 | } 59 | return this; 60 | } 61 | THClKernels *THClKernels::inoutv2(THClTensor *tensor) { 62 | try { 63 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensor); 64 | kernel->in(1, tensorInfoCl); 65 | kernel->inout(THClTensor_wrapper(state, tensor)); 66 | tensorInfoCls.push_back(tensorInfoCl); 67 | } catch( runtime_error &e ) { 68 | THError(e.what()); 69 | } 70 | return this; 71 | } 72 | THClKernels *THClKernels::outv2(THClTensor *tensor) { 73 | try { 74 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensor); 75 | kernel->in(1, tensorInfoCl); 76 | kernel->out(THClTensor_wrapper(state, tensor)); 77 | tensorInfoCls.push_back(tensorInfoCl); 78 | } catch( runtime_error &e ) { 79 | THError(e.what()); 80 | } 81 | return this; 82 | } 83 | // scalars ================== 84 | THClKernels *THClKernels::in(int value) { 85 | try { 86 | kernel->in(value); 87 | } catch( runtime_error &e ) { 88 | THError(e.what()); 89 | } 90 | return this; 91 | } 92 | THClKernels *THClKernels::in(float value) { 93 | try { 94 | kernel->in(value); 95 | } catch( runtime_error &e ) { 96 | THError(e.what()); 97 | } 98 | return this; 99 | } 100 | // CLTensorInfos ================ 101 | template< typename IndexType > 102 | THClKernels *THClKernels::in(TensorInfotensorInfo) { 103 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensorInfo); 104 | kernel->in(1, tensorInfoCl); 105 | kernel->in(tensorInfo.wrapper); 106 | tensorInfoCls.push_back(tensorInfoCl); 107 | return this; 108 | } 109 | template< typename IndexType > 110 | THClKernels *THClKernels::inout(TensorInfotensorInfo) { 111 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensorInfo); 112 | kernel->in(1, tensorInfoCl); 113 | kernel->inout(tensorInfo.wrapper); 114 | tensorInfoCls.push_back(tensorInfoCl); 115 | return this; 116 | } 117 | template< typename IndexType > 118 | THClKernels *THClKernels::out(TensorInfotensorInfo) { 119 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensorInfo); 120 | if( !tensorInfo.wrapper->isOnDevice() ) { 121 | tensorInfo.wrapper->createOnDevice(); 122 | } 123 | kernel->in(1, tensorInfoCl); 124 | kernel->out(tensorInfo.wrapper); 125 | tensorInfoCls.push_back(tensorInfoCl); 126 | return this; 127 | } 128 | // CLWrapper =============== 129 | THClKernels *THClKernels::in(CLWrapper *wrapper) { 130 | try { 131 | kernel->in(wrapper); 132 | } catch( runtime_error &e ) { 133 | THError(e.what()); 134 | } 135 | return this; 136 | } 137 | THClKernels *THClKernels::inout(CLWrapper *wrapper) { 138 | try { 139 | kernel->inout(wrapper); 140 | } catch( runtime_error &e ) { 141 | THError(e.what()); 142 | } 143 | return this; 144 | } 145 | THClKernels *THClKernels::out(CLWrapper *wrapper) { 146 | try { 147 | if( !wrapper->isOnDevice() ) { 148 | wrapper->createOnDevice(); 149 | } 150 | kernel->out(wrapper); 151 | } catch( runtime_error &e ) { 152 | THError(e.what()); 153 | } 154 | return this; 155 | } 156 | void THClKernels::run(dim3 grid, dim3 block) { 157 | dim3 global_ws; 158 | for( int i = 0; i < 3; i++ ) { 159 | global_ws.vec[i] = grid.vec[i] * block.vec[i]; 160 | } 161 | try { 162 | kernel->run(3, global_ws.as_size_t(), block.as_size_t()); 163 | } catch( runtime_error &e ) { 164 | cout << e.what() << endl; 165 | THError(e.what()); 166 | } 167 | } 168 | // locals ================== 169 | THClKernels *THClKernels::localFloats(int count) { 170 | try { 171 | kernel->localFloats(count); 172 | } catch( runtime_error &e ) { 173 | 
THError(e.what()); 174 | } 175 | return this; 176 | } 177 | 178 | // template instantiations ==================== 179 | #define DECLARE_THCLKERNELS(IndexType) \ 180 | template \ 181 | THClKernels *THClKernels::in(TensorInfo<IndexType> tensorInfo); \ 182 | template \ 183 | THClKernels *THClKernels::inout(TensorInfo<IndexType> tensorInfo); \ 184 | template \ 185 | THClKernels *THClKernels::out(TensorInfo<IndexType> tensorInfo); 186 | 187 | DECLARE_THCLKERNELS(uint32); 188 | DECLARE_THCLKERNELS(uint64); 189 | 190 | template CLKernel *CLKernel::in<>(int N, const TensorInfoCl *data); 191 | template CLKernel *CLKernel::inout<>(int N, const TensorInfoCl *data); 192 | template CLKernel *CLKernel::out<>(int N, const TensorInfoCl *data); 193 | 194 | -------------------------------------------------------------------------------- /src/lib/THClKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <vector> 4 | 5 | //class THClState; 6 | class CLKernel; 7 | //class THClTensor; 8 | class CLWrapper; 9 | 10 | #include "THClGeneral.h" 11 | #include "THClReduceApplyUtils.h" 12 | 13 | // inty types 14 | // ========== 15 | // 16 | // this uses ints for all the long/int type things 17 | // we can create a new version later that uses longs 18 | // for now, you need to make sure that anything inty in the kernel parameters is an int, not a long etc 19 | // 20 | // Passing THClTensors 21 | // =================== 22 | // - when you are passing in a tensor, you need two parameters for each tensor, 23 | // in the kernel, e.g. let's say in the CUDA code there is a kernel parameter 24 | // 25 | // __global__ foo( float *src, ... 26 | // 27 | // This will become, in our kernel: 28 | // 29 | // kernel foo( global float *src_data, int src_offset, ... 30 | // 31 | // that's it :-) now just use an object of this class to pass in the data 32 | // oh, and in the kernel, when you use the src_data object, make sure to 33 | // add the offset. Like: 34 | // 35 | // src[i] 36 | // 37 | // ... in the CUDA becomes: 38 | // 39 | // src_data[src_offset + i] 40 | // 41 | // ... in the OpenCL 42 | // 43 | // Passing THClTensorInfos 44 | // ======================= 45 | // 46 | // On the receiving side, there need to be two global parameters, i.e. if in the CUDA kernel 47 | // we have: 48 | // 49 | // __global__ foo(THClTensorInfo mytensor, ...) 50 | // 51 | // in the OpenCL kernel, we will have: 52 | // 53 | // kernel foo(global THClTensorInfoCl *mytensor_info, global float *mytensor_data, ...) 
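// A minimal illustrative sketch of the host-side call pattern for this
// convention (the kernel name "foo" and the tensor/scalar names here are
// hypothetical, not code from this repo):
//
//   THClKernels k(state, kernel);
//   k.inv2(mytensor);    // appends mytensor_info struct + mytensor_data buffer
//   k.in(n);             // scalar ints/floats follow as plain kernel arguments
//   k.run(grid, block);  // grid/block are cutorch-style dim3 values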
54 | // 55 | // You'll also need to define THClTensorInfoCl struct in your kernel, eg by including 56 | // code from include_THClReduceApplyUtils.cl, see THClApply.h for an example 57 | // 58 | // in, inout, out 59 | // ============== 60 | // Note on difference between 'in', 'out', 'inout': 61 | // - 'inout' and 'out' will mark the CLWrapper gpu buffer as 'dirty', 62 | // needing to be 63 | // copied to host, if we want to work on host-side 64 | // - 'out' will allocate the CLWrapper device-side buffer, if not already 65 | // allocated (in and inout will throw an error, if not allocated on device-side 66 | // already) 67 | class THClKernels { 68 | THClState *state; 69 | CLKernel *kernel; 70 | 71 | std::vector< TensorInfoCl * >tensorInfoCls; 72 | 73 | public: 74 | THClKernels(THClState *state, CLKernel *kernel); 75 | ~THClKernels(); 76 | 77 | THClKernels *in(THClTensor *tensor); 78 | THClKernels *inout(THClTensor *tensor); 79 | THClKernels *out(THClTensor *tensor); 80 | 81 | THClKernels *inv2(THClTensor *tensor); // expects kernel parameters as `global struct THClTensorInfoCl *a_info, global float *a_data` 82 | THClKernels *inoutv2(THClTensor *tensor); 83 | THClKernels *outv2(THClTensor *tensor); 84 | 85 | template< typename IndexType > 86 | THClKernels *in(TensorInfotensorInfo); 87 | template< typename IndexType > 88 | THClKernels *inout(TensorInfotensorInfo); 89 | template< typename IndexType > 90 | THClKernels *out(TensorInfotensorInfo); 91 | 92 | THClKernels *in(CLWrapper *wrapper); 93 | THClKernels *inout(CLWrapper *wrapper); 94 | THClKernels *out(CLWrapper *wrapper); 95 | 96 | THClKernels *in(int value); 97 | THClKernels *in(float value); 98 | 99 | THClKernels *localFloats(int count); 100 | 101 | void run(dim3 grid, dim3 block); // uses cutorch-compatible dimensions 102 | }; 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClOperators.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | 5 | #include 6 | 7 | class OpBase { 8 | public: 9 | virtual std::string getName() const { return "OpBase"; } // just to make this class polymorphic 10 | }; 11 | 12 | class HasScalars : public OpBase { 13 | public: 14 | virtual int getNumScalars() const = 0; 15 | virtual float getScalar(int index) const = 0; 16 | }; 17 | 18 | class HasOperator1 : public OpBase { 19 | public: 20 | virtual std::string operator1() const = 0; 21 | }; 22 | 23 | class HasOperator2 : public OpBase { 24 | public: 25 | virtual std::string operator2() const = 0; 26 | }; 27 | 28 | class HasOperator3 : public OpBase { 29 | public: 30 | virtual std::string operator3() const = 0; 31 | }; 32 | 33 | class HasGlobalTensors { 34 | public: 35 | virtual int getNumGlobalTensors() const = 0; 36 | virtual THClTensor *getTensor(int index) const = 0; 37 | virtual std::string getTensorName(int index) const = 0; 38 | }; 39 | 40 | class HasPointTensors { 41 | public: 42 | virtual int getNumPointTensors() const = 0; 43 | virtual const THClTensor *getPointTensor(int index) const = 0; 44 | }; 45 | 46 | #endif // __cplusplus 47 | 48 | -------------------------------------------------------------------------------- /src/lib/THClReduce.cl: -------------------------------------------------------------------------------- 1 | // Threads per thread block 2 | #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 3 | 4 | static inline float modifyOp(float _in1) { 5 | float _out; 6 | float *in1 = &_in1; 7 | float *out = &_out; 8 | 
{{modify_operation}}; 9 | return _out; 10 | } 11 | 12 | static inline float reduceOp(float _in1, float _in2) { 13 | // I guess the compiler can sort this stuff out :-P 14 | float _out; 15 | float *in1 = &_in1; 16 | float *in2 = &_in2; 17 | float *out = &_out; 18 | {{reduce_operation}}; 19 | return _out; 20 | } 21 | 22 | {{include_THClReduceApplyUtils}} 23 | 24 | static inline {{IndexType}} getReduceNoncontigDimSliceIndex() { 25 | // Each thread handles one slice 26 | return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0); 27 | } 28 | 29 | // Kernel that handles an entire reduction of a slice of a tensor per each thread 30 | kernel void 31 | THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info, 32 | global float *out_data, 33 | global TensorInfoCl *in_info, 34 | global float *in_data, 35 | int reductionStride, 36 | int reductionSize, 37 | int totalSlices, 38 | float init) { 39 | const {{IndexType}} sliceIndex = getReduceNoncontigDimSliceIndex(); 40 | 41 | if ((int)sliceIndex >= totalSlices) { 42 | return; 43 | } 44 | 45 | // Each thread picks a point in `out` and `in` for which it is 46 | // producing the reduction 47 | const {{IndexType}} outOffset = 48 | IndexToOffset_{{1000 + dim1}}_get(sliceIndex, &out_info[0]); 49 | const {{IndexType}} inBaseOffset = 50 | IndexToOffset_{{1000 + dim2}}_get(sliceIndex, &in_info[0]); 51 | 52 | // For each point in reductionSize, reduce into `r` 53 | {{IndexType}} inOffset = inBaseOffset; 54 | float r = init; 55 | 56 | for ({{IndexType}} i = 0; (int)i < reductionSize; ++i) { 57 | r = reduceOp(r, modifyOp(in_data[inOffset])); 58 | inOffset += reductionStride; 59 | } 60 | 61 | // Write out reduced value 62 | out_data[outOffset] = r; 63 | } 64 | 65 | static inline {{IndexType}} getReduceContigDimSliceIndex() { 66 | // Each block handles one slice 67 | return getLinearBlockId(); 68 | } 69 | 70 | // Kernel that handles an entire reduction of a slice of a tensor per 71 | // each block 72 | kernel void 73 | THClTensor_reduceContigDim(global TensorInfoCl *out_info, 74 | global float *out_data, 75 | global TensorInfoCl *in_info, 76 | global float *in_data, 77 | int reductionSize, 78 | int totalSlices, 79 | float init, 80 | local float *smem) { 81 | const {{IndexType}} sliceIndex = getReduceContigDimSliceIndex(); 82 | 83 | if ((int)sliceIndex >= totalSlices) { 84 | return; 85 | } 86 | 87 | // Get the offset in `out` for the reduction 88 | const {{IndexType}} outOffset = 89 | IndexToOffset_{{1000 + dim1}}_get(sliceIndex, &out_info[0]); 90 | 91 | // Get the base offset in `in` for this block's reduction 92 | const {{IndexType}} inBaseOffset = 93 | IndexToOffset_{{1000 + dim2}}_get(sliceIndex, &in_info[0]); 94 | 95 | // Each thread in the block will reduce some subset of elements in 96 | // the slice. The elements are guaranteed contiguous starting at 97 | // `inBaseOffset`. 
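// For instance (sizes here are illustrative assumptions): with
// get_local_size(0) == 128 and reductionSize == 1000, thread 0 visits
// elements 0, 128, 256, ... and thread 1 visits 1, 129, 257, ...; the
// 128 per-thread partials are then combined by reduceBlock() below.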
98 | float r = init; 99 | for ({{IndexType}} i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) { 100 | r = reduceOp(r, modifyOp(in_data[inBaseOffset + i])); 101 | } 102 | 103 | // Reduce within the block 104 | // extern __shared__ float smem[]; 105 | r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init); 106 | 107 | if (/*threadIdx.x*/ get_local_id(0) == 0) { 108 | // Write out reduced value 109 | out_data[outOffset] = r; 110 | } 111 | } 112 | 113 | -------------------------------------------------------------------------------- /src/lib/THClReduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef THCL_REDUCE_INC 4 | #define THCL_REDUCE_INC 5 | 6 | #include 7 | #include 8 | #include 9 | #include "THClReduceApplyUtils.h" 10 | #include "templates/TemplatedKernel.h" 11 | #include "util/easycl_stringhelper.h" 12 | #include "EasyCL.h" 13 | #include "THClTypeParseTraits.h" 14 | #include "THClDeviceUtils.h" 15 | #include "THClKernels.h" 16 | #include "util/StatefulTimer.h" 17 | 18 | 19 | std::string THClReduce_getKernelSource(); 20 | 21 | // 22 | // This file contains dimension reduction operation functions and 23 | // kernels that work on both contiguous and non-contiguous tensor 24 | // arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned 25 | // arguments without copying or temporary storage. 26 | // 27 | 28 | //#define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 29 | bool THClTensor_reduceDim(THClState* state, 30 | THClTensor* out, 31 | THClTensor* in, 32 | float init, 33 | const HasOperator2 *modifyOp, 34 | const HasOperator3 *reduceOp, 35 | int dim); 36 | 37 | #undef THCL_NONCONTIG_REDUCE_BLOCK_SIZE 38 | 39 | #endif // THCL_REDUCE_INC 40 | 41 | -------------------------------------------------------------------------------- /src/lib/THClReduceAll.cl: -------------------------------------------------------------------------------- 1 | {{include_THClDeviceUtils}} 2 | 3 | static inline float modifyOp(float _in1) { 4 | float _out; 5 | float *in1 = &_in1; 6 | float *out = &_out; 7 | {{modify_operation}}; 8 | return _out; 9 | } 10 | 11 | static inline float reduceOp(float _in1, float _in2) { 12 | // I guess the compiler can sort this stuff out :-P 13 | float _out; 14 | float *in1 = &_in1; 15 | float *in2 = &_in2; 16 | float *out = &_out; 17 | {{reduce_operation}}; 18 | return _out; 19 | } 20 | 21 | {{include_THClReduceApplyUtils}} 22 | 23 | // Kernel that handles an entire reduction of a tensor in one pass 24 | kernel void 25 | THClTensor_reduceAll(global TensorInfoCl *in_info, 26 | global float *in_data, 27 | {{IndexType}} totalElements, 28 | float init, 29 | global float* out, 30 | local float *smem) { 31 | // With a block-wide stride, have each thread perform its own reduction. 
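// (Editorial note: {{modify_operation}}, {{reduce_operation}} and
// {{IndexType}} above are Lua-template placeholders, rendered at runtime via
// the TemplatedKernel machinery included from THClReduce.h; for a sum
// reduction one would expect substitutions along the lines of
// "*out = *in1" for the modify step and "*out = *in1 + *in2" for the reduce
// step -- the exact strings come from the HasOperator2/HasOperator3 objects,
// so these two example strings are assumptions, not quoted from the repo.)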
32 | float r = init; 33 | for ({{IndexType}} i = get_local_id(0); i < totalElements; i += get_local_size(0)) { 34 | const {{IndexType}} inOffset = IndexToOffset_{{1000 + dim1}}_get(i, &in_info[0]); 35 | r = reduceOp(r, modifyOp(in_data[inOffset])); 36 | } 37 | 38 | // Reduce within the block 39 | r = reduceBlock(smem, get_local_size(0), r, init); 40 | 41 | if(get_local_id(0) == 0) { 42 | // Write out reduced value 43 | out[0] = r; 44 | } 45 | } 46 | 47 | static inline {{IndexType}} getStartIndex({{IndexType}} totalSize) { 48 | {{IndexType}} sizePerBlock = THClCeilDiv(totalSize, ({{IndexType}}) get_num_groups(0)); 49 | return get_group_id(0) * sizePerBlock; 50 | } 51 | 52 | static inline {{IndexType}} getEndIndex({{IndexType}} totalSize) { 53 | {{IndexType}} sizePerBlock = THClCeilDiv(totalSize, ({{IndexType}}) get_num_groups(0)); 54 | return min(({{IndexType}}) ((get_group_id(0) + 1) * sizePerBlock), totalSize); 55 | } 56 | 57 | // Kernel that handles an entire reduction of a tensor in two passes 58 | kernel void 59 | THClTensor_reduceAllPass1(global TensorInfoCl *in_info, 60 | global float *in_data, 61 | {{IndexType}} totalElements, 62 | float init, 63 | global float* scratchSpace, 64 | local float *smem) { 65 | const {{IndexType}} startIndex = getStartIndex(totalElements); 66 | const {{IndexType}} endIndex = getEndIndex(totalElements); 67 | 68 | // With a block-wide stride, have each thread perform its own reduction. 69 | float r = init; 70 | for ({{IndexType}} i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) { 71 | const {{IndexType}} inOffset = IndexToOffset_{{1000 + dim1}}_get(i, &in_info[0]); 72 | r = reduceOp(r, modifyOp(in_data[inOffset])); 73 | } 74 | 75 | // Reduce within the block 76 | r = reduceBlock(smem, get_local_size(0), r, init); 77 | 78 | if ((int)get_local_id(0) == 0) { 79 | // Write out block-wide reduced value 80 | scratchSpace[get_group_id(0)] = r; 81 | } 82 | } 83 | 84 | kernel void THClTensor_reduceAllPass2(int numPass1Blocks, 85 | float init, 86 | global float* scratchSpace, 87 | global float* out, 88 | local float *smem) { 89 | float r = init; 90 | if ((int)get_local_id(0) < numPass1Blocks) { 91 | r = scratchSpace[get_local_id(0)]; 92 | } 93 | 94 | // Reduce within the block 95 | r = reduceBlock(smem, numPass1Blocks, r, init); 96 | 97 | if((int)get_local_id(0) == 0) { 98 | out[0] = r; 99 | } 100 | } 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClReduceAll.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "THClGeneral.h" 4 | #include "THClTensor.h" 5 | #include "THClOperators.h" 6 | #include "EasyCL.h" 7 | 8 | // 9 | // This file contains dimension reduction operation functions and 10 | // kernels that work on both contiguous and non-contiguous tensor 11 | // arguments of arbitrary (up to MAX_CLTORCH_DIMS) dimensioned 12 | // arguments without copying or temporary storage, for reducing an 13 | // entire tensor to one value. 
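// A worked sketch of the two-pass variant above (the sizes are illustrative
// assumptions, not the values the launcher actually picks): for
// totalElements = 1,000,000 and 64 work-groups in pass 1,
// sizePerBlock = ceil(1,000,000 / 64) = 15,625 elements per group; each
// group reduces its range and writes one partial result to
// scratchSpace[get_group_id(0)]; pass 2 then runs a single work-group that
// combines the 64 partials with reduceBlock() and stores the final value
// in out[0].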
14 | // 15 | 16 | bool THClTensor_reduceAll(THClState* state, 17 | THClTensor* in, 18 | const HasOperator2 *modifyOp, 19 | const HasOperator3 *reduceOp, 20 | float init, 21 | CLWrapper *res); 22 | 23 | -------------------------------------------------------------------------------- /src/lib/THClReduceApplyUtils.cl: -------------------------------------------------------------------------------- 1 | // kernel argument that defines tensor layout 2 | typedef struct TensorInfoCl { 3 | // Extracts size/stride information for the kernel. 4 | // Successive dimensions can be collapsed if the size/strides match 5 | // up and thus there are no holes between the dimensions. This is used 6 | // to reduce the complexity of the problem. 7 | // The optional `reduceDim` indicates a reduction dimension for the 8 | // given tensor, so that the output size for this dimension will be 1. 9 | 10 | {{IndexType}} sizes[{{MAX_CLTORCH_DIMS}}]; 11 | {{IndexType}} strides[{{MAX_CLTORCH_DIMS}}]; 12 | {{IndexType}} offset; 13 | int dims; 14 | } TensorInfoCl; 15 | // Contiguous tensors of more than one dimension are collapsed down 16 | // to one tensor 17 | {% if defiscontiguous==1 then %} 18 | static inline bool TensorInfo_isContiguous( global TensorInfoCl *tensorInfo ) { 19 | return (tensorInfo->dims == 1 && tensorInfo->strides[0] == 1); 20 | } 21 | {% end %} 22 | 23 | // Translate a linear index for the apply to a float* offset; 24 | // specialized on `Dims` to reduce nvcc compilation time 25 | {% for _,dim in ipairs(dims) do %} 26 | static inline {{IndexType}} IndexToOffset_{{1000 + dim}}_get( {{IndexType}} linearId, global TensorInfoCl *info) { 27 | {{IndexType}} offset = info->offset; 28 | 29 | // Use static dims 30 | // for (int i = {{dim}} - 1; i >= 0; --i) { 31 | {{IndexType}} curDimIndex; 32 | {{IndexType}} curDimOffset; 33 | {% for i=dim-1,0,-1 do %} // bake this in.... 
34 | curDimIndex = linearId % info->sizes[{{i}}]; 35 | curDimOffset = curDimIndex * info->strides[{{i}}]; 36 | offset += curDimOffset; 37 | 38 | {% if i > 0 then %} 39 | linearId /= info->sizes[{{i}}]; 40 | {% end %} 41 | {% end %} 42 | // } 43 | 44 | return offset; 45 | } 46 | {% end %} 47 | 48 | static inline {{IndexType}} IndexToOffset_998_get({{IndexType}} linearId, global const TensorInfoCl *info) { 49 | return linearId + info->offset; 50 | } 51 | 52 | static inline {{IndexType}} IndexToOffset_999_get({{IndexType}} linearId, global const TensorInfoCl *info) { 53 | {{IndexType}} offset = info->offset; 54 | 55 | // Use dynamic dims 56 | for (int i = info->dims - 1; i >= 0; --i) { 57 | {{IndexType}} curDimIndex = linearId % info->sizes[i]; 58 | {{IndexType}} curDimOffset = curDimIndex * info->strides[i]; 59 | offset += curDimOffset; 60 | 61 | linearId /= info->sizes[i]; 62 | } 63 | 64 | return offset; 65 | } 66 | 67 | static inline {{IndexType}} getLinearBlockId() { 68 | return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + 69 | get_group_id(1) * get_num_groups(0) + 70 | get_group_id(0); 71 | } 72 | 73 | // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will 74 | // return the reduced value 75 | {% if defreduceblock == 1 then %} 76 | static inline float reduceBlock( local float* smem, 77 | int numVals, 78 | float threadVal, 79 | float init) { 80 | if (numVals == 0) { 81 | return init; 82 | } 83 | 84 | if ((int)get_local_id(0) < numVals) { 85 | smem[ get_local_id(0)] = threadVal; 86 | } 87 | 88 | // First warp will perform reductions across warps 89 | barrier(CLK_LOCAL_MEM_FENCE); 90 | if ((get_local_id(0) / {{WarpSize}}) == 0) { 91 | float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; 92 | 93 | for (int i = {{WarpSize}} + get_local_id(0); i < numVals; i += {{WarpSize}}) { 94 | r = reduceOp(r, smem[i]); 95 | } 96 | 97 | smem[get_local_id(0)] = r; 98 | } 99 | 100 | // First thread will perform reductions across the block 101 | barrier(CLK_LOCAL_MEM_FENCE); 102 | 103 | float r = init; 104 | if (get_local_id(0) == 0) { 105 | r = smem[0]; 106 | 107 | int numLanesParticipating = min(numVals, {{WarpSize}}); 108 | 109 | if (numLanesParticipating == 32) { 110 | // Unroll for {{WarpSize}} == 32 and numVals >= 32 111 | // #pragma unroll 112 | // unrolling by hand, so compiler-independent 113 | {% for i=1,31 do %} 114 | r = reduceOp(r, smem[{{i}}]); 115 | {% end %} 116 | } else { 117 | for (int i = 1; i < numLanesParticipating; ++i) { 118 | r = reduceOp(r, smem[i]); 119 | } 120 | } 121 | } 122 | 123 | return r; 124 | } 125 | {% end %} 126 | 127 | -------------------------------------------------------------------------------- /src/lib/THClScatter.cl: -------------------------------------------------------------------------------- 1 | // probably should put this on its own somewhere, so we 2 | // dont have to either ocpy/paste, or include entire THClReduceApplyUtils 3 | typedef struct TensorInfoCl { 4 | unsigned int sizes[{{MAX_CLTORCH_DIMS}}]; 5 | unsigned int strides[{{MAX_CLTORCH_DIMS}}]; 6 | int offset; 7 | int dims; 8 | } TensorInfoCl; 9 | 10 | {% if scatter then %} 11 | kernel void THClTensor_kernel_scatter( 12 | global TensorInfoCl *dst_info, global float*dst_data, 13 | int dim, 14 | global const TensorInfoCl *idx_info, global float*idx_data, 15 | global const TensorInfoCl *src_info, global float*src_data, 16 | int totalElements 17 | ) 18 | { 19 | for (int _linearId = get_global_id(0); 20 | _linearId < totalElements; 21 | 
_linearId += get_global_size(0)) { 22 | 23 | // plan is: 24 | // based on our linearIndex, this gets us a spot in the index 25 | // tensor 26 | // this is also a spot in the src_data (at least, once we 27 | // convert it into actual coordinates, those are the coordinates 28 | // in the src tensor 29 | // the coordinates in the dest are the same, except that 30 | // we replace that of dimension dim with the value from 31 | // the index tensor 32 | // 33 | // so, everything hinges on us getting the coordinates, I think? 34 | // so, let's do that :-) 35 | int idxOffset = idx_info->offset; 36 | int srcOffset = src_info->offset; 37 | int dstOffset = dst_info->offset; 38 | int linearId = _linearId; // copy it, since we'll modify it 39 | // for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbaked loop for now, to 40 | // get it working 41 | int curDimIndex; 42 | {% for d=dims-1,0,-1 do %} 43 | curDimIndex = linearId % idx_info->sizes[{{d}}]; 44 | idxOffset += curDimIndex * idx_info->strides[{{d}}]; 45 | srcOffset += curDimIndex * src_info->strides[{{d}}]; 46 | if( {{d}} != dim ) { // this only matters for the dest, the others are 47 | // unaffected by which dimension we are on. I think. 48 | dstOffset += curDimIndex * dst_info->strides[{{d}}]; 49 | } 50 | linearId /= idx_info->sizes[{{d}}]; 51 | {% end %} 52 | // } 53 | // now we have the idxOffset. get the value at that location 54 | int idxValue = idx_data[idxOffset] - 1; // subtract 1, because indices are 1-based 55 | // then use this to get the final value for dstOffset 56 | dstOffset += idxValue * dst_info->strides[dim]; 57 | // get the value... 58 | float value = src_data[srcOffset]; 59 | // and save it up... 60 | dst_data[dstOffset] = value; 61 | // that's it? 62 | } 63 | } 64 | {% end %} 65 | 66 | {% if scatterFill then %} 67 | kernel void THClTensor_kernel_scatterFill( 68 | global TensorInfoCl *dst_info, global float*dst_data, 69 | const int dim, 70 | global const TensorInfoCl *idx_info, global float*idx_data, 71 | const float src_val, 72 | const int totalElements 73 | ) 74 | { 75 | for (int _linearId = get_global_id(0); 76 | _linearId < totalElements; 77 | _linearId += get_global_size(0)) { 78 | 79 | // plan is: 80 | // based on our linearIndex, this gets us a spot in the index 81 | // tensor 82 | // the coordinates in the dest are the same, except that 83 | // we replace that of dimension dim with the value from 84 | // the index tensor 85 | // 86 | // so, everything hinges on us getting the coordinates, I think? 87 | // so, let's do that :-) 88 | int idxOffset = idx_info->offset; 89 | int dstOffset = dst_info->offset; 90 | int linearId = _linearId; // copy it, since we'll modify it 91 | // for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbaked loop for now, to 92 | // get it working 93 | int curDimIndex; 94 | {% for d=dims-1,0,-1 do %} 95 | curDimIndex = linearId % idx_info->sizes[{{d}}]; 96 | idxOffset += curDimIndex * idx_info->strides[{{d}}]; 97 | if( {{d}} != dim ) { // this only matters for the dest, the others are 98 | // unaffected by which dimension we are on. I think. 99 | dstOffset += curDimIndex * dst_info->strides[{{d}}]; 100 | } 101 | linearId /= idx_info->sizes[{{d}}]; 102 | {% end %} 103 | // } 104 | // now we have the idxOffset. get the value at that location 105 | int idxValue = idx_data[idxOffset] - 1; // subtract 1, because indices are 1-based 106 | // then use this to get the final value for dstOffset 107 | dstOffset += idxValue * dst_info->strides[dim]; 108 | // and save the value... 
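// Worked example of the decomposition above (illustrative sizes): for a
// 2D index tensor with sizes = {3, 4} and strides = {4, 1}, linearId = 7
// gives curDimIndex = 7 % 4 = 3 in dimension 1, then 7 / 4 = 1 in
// dimension 0, i.e. coordinates (1, 3); idx_data at that offset holds a
// 1-based (Lua-style) index, hence the "- 1" before it is scaled by
// dst_info->strides[dim].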
109 | dst_data[dstOffset] = src_val; 110 | // thats it? 111 | } 112 | } 113 | {% end %} 114 | 115 | -------------------------------------------------------------------------------- /src/lib/THClStorage.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_STORAGE_INC 2 | #define THCL_STORAGE_INC 3 | 4 | #include "THStorage.h" 5 | #include "THClGeneral.h" 6 | 7 | #define TH_STORAGE_REFCOUNTED 1 8 | #define TH_STORAGE_RESIZABLE 2 9 | #define TH_STORAGE_FREEMEM 4 10 | 11 | //extern int THClStorage_traceOn; 12 | 13 | typedef struct THClStorage 14 | { 15 | int device; 16 | float *data; // I know this seems a bit superfluous.... 17 | PTR_EASYCL cl; 18 | PTR_CLWRAPPER wrapper; 19 | long size; 20 | int refcount; 21 | char flag; 22 | THAllocator *allocator; 23 | void *allocatorContext; 24 | struct THClStorage *view; 25 | } THClStorage; 26 | 27 | 28 | THCL_API float* THClStorage_data(THClState *state, const THClStorage*); 29 | THCL_API long THClStorage_size(THClState *state, const THClStorage*); 30 | 31 | /* slow access -- checks everything */ 32 | THCL_API void THClStorage_set(THClState *state, THClStorage*, long, float); 33 | THCL_API float THClStorage_get(THClState *state, const THClStorage*, long); 34 | 35 | THCL_API THClStorage* THClStorage_new(THClState *state) DEPRECATED_POST; 36 | THCL_API THClStorage* THClStorage_newv2(THClState *state, int device); 37 | THCL_API THClStorage* THClStorage_newWithSize(THClState *state, int device, long size); 38 | THCL_API THClStorage* THClStorage_newWithSize1(THClState *state, int device, float); 39 | THCL_API THClStorage* THClStorage_newWithSize2(THClState *state, int device, float, float); 40 | THCL_API THClStorage* THClStorage_newWithSize3(THClState *state, int device, float, float, float); 41 | THCL_API THClStorage* THClStorage_newWithSize4(THClState *state, int device, float, float, float, float); 42 | THCL_API THClStorage* THClStorage_newWithMapping(THClState *state, int device, const char *filename, long size, int shared); 43 | 44 | /* takes ownership of data */ 45 | THCL_API THClStorage* THClStorage_newWithData(THClState *state, int device, float *data, long size); 46 | 47 | THCL_API THClStorage* THClStorage_newWithAllocator(THClState *state, int device, long size, 48 | THAllocator* allocator, 49 | void *allocatorContext); 50 | THCL_API THClStorage* THClStorage_newWithDataAndAllocator( 51 | THClState *state, int device, float* data, long size, THAllocator* allocator, void *allocatorContext); 52 | 53 | THCL_API void THClStorage_setFlag(THClState *state, THClStorage *storage, const char flag); 54 | THCL_API void THClStorage_clearFlag(THClState *state, THClStorage *storage, const char flag); 55 | THCL_API void THClStorage_retain(THClState *state, THClStorage *storage); 56 | 57 | THCL_API void THClStorage_free(THClState *state, THClStorage *storage); 58 | THCL_API void THClStorage_resize(THClState *state, THClStorage *storage, long size); 59 | THCL_API void THClStorage_fill(THClState *state, THClStorage *storage, float value); 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/lib/THClStorageCopy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "THClStorageCopy.h" 4 | #include "THClGeneral.h" 5 | 6 | #include 7 | #include "EasyCL.h" 8 | 9 | using namespace std; 10 | 11 | void THClStorage_rawCopy(THClState *state, THClStorage *self, float *src) 12 | { 13 | THError("not available yet for 
THClStorage"); 14 | // THClCheck(clMemcpyAsync(self->data, src, self->size * sizeof(float), clMemcpyDeviceToDevice, THClState_getCurrentStream(state))); 15 | } 16 | 17 | void THClStorage_copy(THClState *state, THClStorage *self, THClStorage *src) 18 | { 19 | THArgCheck(self->size == src->size, 2, "size does not match"); 20 | if( !self->wrapper->isOnDevice() ) { 21 | self->wrapper->createOnDevice(); 22 | } 23 | src->wrapper->copyTo( self->wrapper ); 24 | if(state->trace) cout << "wrapper->copyTo() size" << self->size << endl; 25 | } 26 | 27 | void THClStorage_copyCl(THClState *state, THClStorage *self, THClStorage *src) 28 | { 29 | THError("not available yet for THClStorage"); 30 | THArgCheck(self->size == src->size, 2, "size does not match"); 31 | // THClCheck(clMemcpyAsync(self->data, src->data, self->size * sizeof(float), clMemcpyDeviceToDevice, THClState_getCurrentStream(state))); 32 | } 33 | 34 | void THClStorage_copyFloat(THClState *state, THClStorage *self, struct THFloatStorage *src) 35 | { 36 | // cout << "THClStorgae_copyFloat()" << endl; 37 | THArgCheck(self->size == src->size, 2, "size does not match"); 38 | for( int i = 0; i < self->size; i++ ) { 39 | self->data[i] = src->data[i]; 40 | } 41 | self->wrapper->copyToDevice(); 42 | if(state->trace) cout << "wrapper->copyToDevice() size" << self->size << endl; 43 | // THClCheck(clMemcpy(self->data, src->data, self->size * sizeof(float), clMemcpyHostToDevice)); 44 | } 45 | 46 | #define TH_CL_STORAGE_IMPLEMENT_COPY(TYPEC) \ 47 | void THClStorage_copy##TYPEC(THClState *state, THClStorage *self, struct TH##TYPEC##Storage *src) \ 48 | { \ 49 | THFloatStorage *buffer; \ 50 | THArgCheck(self->size == src->size, 2, "size does not match"); \ 51 | buffer = THFloatStorage_newWithSize(src->size); \ 52 | THFloatStorage_copy##TYPEC(buffer, src); \ 53 | THClStorage_copyFloat(state, self, buffer); \ 54 | THFloatStorage_free(buffer); \ 55 | } 56 | 57 | TH_CL_STORAGE_IMPLEMENT_COPY(Byte) 58 | TH_CL_STORAGE_IMPLEMENT_COPY(Char) 59 | TH_CL_STORAGE_IMPLEMENT_COPY(Short) 60 | TH_CL_STORAGE_IMPLEMENT_COPY(Int) 61 | TH_CL_STORAGE_IMPLEMENT_COPY(Long) 62 | TH_CL_STORAGE_IMPLEMENT_COPY(Double) 63 | 64 | void THFloatStorage_copyCl(THClState *state, THFloatStorage *self, struct THClStorage *src) 65 | { 66 | // cout << "THfloatStorage_copyCl" << endl; 67 | THArgCheck(self->size == src->size, 2, "size does not match"); 68 | if( src->size == 0 ) { 69 | // dont need to do anything... 
70 | return; 71 | } 72 | if( src->wrapper->isDeviceDirty() ) { 73 | src->wrapper->copyToHost(); 74 | if(state->trace) cout << "wrapper->copyToHost() size" << self->size << endl; 75 | } 76 | for( int i = 0; i < self->size; i++ ) { 77 | self->data[i] = src->data[i]; 78 | } 79 | } 80 | 81 | #define TH_CL_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ 82 | void TH##TYPEC##Storage_copyCl(THClState *state, TH##TYPEC##Storage *self, struct THClStorage *src) \ 83 | { \ 84 | THFloatStorage *buffer; \ 85 | THArgCheck(self->size == src->size, 2, "size does not match"); \ 86 | buffer = THFloatStorage_newWithSize(src->size); \ 87 | THFloatStorage_copyCl(state, buffer, src); \ 88 | TH##TYPEC##Storage_copyFloat(self, buffer); \ 89 | THFloatStorage_free(buffer); \ 90 | } 91 | 92 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Byte) 93 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Char) 94 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Short) 95 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Int) 96 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Long) 97 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Double) 98 | 99 | -------------------------------------------------------------------------------- /src/lib/THClStorageCopy.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_STORAGE_COPY_INC 2 | #define THCL_STORAGE_COPY_INC 3 | 4 | #include "THClStorage.h" 5 | #include "THClGeneral.h" 6 | 7 | /* Support for copy between different Storage types */ 8 | 9 | THCL_API void THClStorage_rawCopy(THClState *state, THClStorage *storage, float *src); 10 | THCL_API void THClStorage_copy(THClState *state, THClStorage *storage, THClStorage *src); 11 | THCL_API void THClStorage_copyByte(THClState *state, THClStorage *storage, struct THByteStorage *src); 12 | THCL_API void THClStorage_copyChar(THClState *state, THClStorage *storage, struct THCharStorage *src); 13 | THCL_API void THClStorage_copyShort(THClState *state, THClStorage *storage, struct THShortStorage *src); 14 | THCL_API void THClStorage_copyInt(THClState *state, THClStorage *storage, struct THIntStorage *src); 15 | THCL_API void THClStorage_copyLong(THClState *state, THClStorage *storage, struct THLongStorage *src); 16 | THCL_API void THClStorage_copyFloat(THClState *state, THClStorage *storage, struct THFloatStorage *src); 17 | THCL_API void THClStorage_copyDouble(THClState *state, THClStorage *storage, struct THDoubleStorage *src); 18 | 19 | THCL_API void THByteStorage_copyCl(THClState *state, THByteStorage *self, struct THClStorage *src); 20 | THCL_API void THCharStorage_copyCl(THClState *state, THCharStorage *self, struct THClStorage *src); 21 | THCL_API void THShortStorage_copyCl(THClState *state, THShortStorage *self, struct THClStorage *src); 22 | THCL_API void THIntStorage_copyCl(THClState *state, THIntStorage *self, struct THClStorage *src); 23 | THCL_API void THLongStorage_copyCl(THClState *state, THLongStorage *self, struct THClStorage *src); 24 | THCL_API void THFloatStorage_copyCl(THClState *state, THFloatStorage *self, struct THClStorage *src); 25 | THCL_API void THDoubleStorage_copyCl(THClState *state, THDoubleStorage *self, struct THClStorage *src); 26 | THCL_API void THClStorage_copyCl(THClState *state, THClStorage *self, THClStorage *src); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/lib/THClStorageGet.cl: -------------------------------------------------------------------------------- 1 | kernel void THClStorageGet(global float *res, global float *data, int index) { 2 | if(get_global_id(0) == 0) { 3 | res[0] = data[index]; 4 | 
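// (Editorial note: this kernel appears to back the "slow access -- checks
// everything" path declared in THClStorage.h: a single work-item copies one
// float into a one-element buffer that the host then reads back, so
// THClStorage_get() is suited to debugging and element-wise inspection, not
// bulk transfer.)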
} 5 | } 6 | 7 | -------------------------------------------------------------------------------- /src/lib/THClStorageSet.cl: -------------------------------------------------------------------------------- 1 | kernel void THClStorageSet(global float *data, int index, float value) { 2 | if(get_global_id(0) == 0) { 3 | // int index2 = index; 4 | // data[index2] = 44; 5 | data[index] = value; 6 | // data[2] = index2; 7 | // data[3] = value; 8 | } 9 | } 10 | 11 | -------------------------------------------------------------------------------- /src/lib/THClTensor.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_TENSOR_INC 2 | #define THCL_TENSOR_INC 3 | 4 | #include 5 | 6 | #include "THTensor.h" 7 | #include "THClStorage.h" 8 | #include "THClGeneral.h" 9 | 10 | #define TH_TENSOR_REFCOUNTED 1 11 | 12 | //struct CLWrapper; 13 | 14 | typedef struct THClTensor 15 | { 16 | long *size; 17 | long *stride; 18 | int nDimension; 19 | 20 | THClStorage *storage; 21 | long storageOffset; 22 | int refcount; 23 | 24 | char flag; 25 | 26 | int device; 27 | } THClTensor; 28 | 29 | 30 | /**** access methods ****/ 31 | THCL_API THClStorage* THClTensor_storage(THClState *state, const THClTensor *self); 32 | THCL_API long THClTensor_storageOffset(THClState *state, const THClTensor *self); 33 | THCL_API int THClTensor_nDimension(THClState *state, const THClTensor *self); 34 | THCL_API long THClTensor_size(THClState *state, const THClTensor *self, int dim); 35 | THCL_API long THClTensor_stride(THClState *state, const THClTensor *self, int dim); 36 | THCL_API THLongStorage *THClTensor_newSizeOf(THClState *state, THClTensor *self); 37 | THCL_API THLongStorage *THClTensor_newStrideOf(THClState *state, THClTensor *self); 38 | THCL_API float *THClTensor_data(THClState *state, const THClTensor *self); 39 | #ifdef __cplusplus 40 | THCL_API class CLWrapper *THClTensor_wrapper(THClState *state, const THClTensor *self); 41 | #endif // __cplusplus 42 | 43 | THCL_API void THClTensor_setFlag(THClState *state, THClTensor *self, const char flag); 44 | THCL_API void THClTensor_clearFlag(THClState *state, THClTensor *self, const char flag); 45 | 46 | 47 | /**** creation methods ****/ 48 | THCL_API THClTensor *THClTensor_new(THClState *state) DEPRECATED_POST; 49 | THCL_API THClTensor *THClTensor_newv2(THClState *state, int device); 50 | THCL_API THClTensor *THClTensor_newWithTensor(THClState *state, THClTensor *tensor); 51 | /* stride might be NULL */ 52 | THCL_API THClTensor *THClTensor_newWithStorage(THClState *state, int device, THClStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_); 53 | THCL_API THClTensor *THClTensor_newWithStorage1d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 54 | long size0_, long stride0_); 55 | THCL_API THClTensor *THClTensor_newWithStorage2d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 56 | long size0_, long stride0_, 57 | long size1_, long stride1_); 58 | THCL_API THClTensor *THClTensor_newWithStorage3d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 59 | long size0_, long stride0_, 60 | long size1_, long stride1_, 61 | long size2_, long stride2_); 62 | THCL_API THClTensor *THClTensor_newWithStorage4d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 63 | long size0_, long stride0_, 64 | long size1_, long stride1_, 65 | long size2_, long stride2_, 66 | long size3_, long stride3_); 67 | 68 | /* stride might be NULL 
*/ 69 | THCL_API THClTensor *THClTensor_newWithSize(THClState *state, int device, THLongStorage *size_, THLongStorage *stride_); 70 | THCL_API THClTensor *THClTensor_newWithSize1d(THClState *state, int device, long size0_); 71 | THCL_API THClTensor *THClTensor_newWithSize2d(THClState *state, int device, long size0_, long size1_); 72 | THCL_API THClTensor *THClTensor_newWithSize3d(THClState *state, int device, long size0_, long size1_, long size2_); 73 | THCL_API THClTensor *THClTensor_newWithSize4d(THClState *state, int device, long size0_, long size1_, long size2_, long size3_); 74 | 75 | THCL_API THClTensor *THClTensor_newClone(THClState *state, THClTensor *self); 76 | THCL_API THClTensor *THClTensor_newContiguous(THClState *state, THClTensor *tensor); 77 | THCL_API THClTensor *THClTensor_newSelect(THClState *state, THClTensor *tensor, int dimension_, long sliceIndex_); 78 | THCL_API THClTensor *THClTensor_newNarrow(THClState *state, THClTensor *tensor, int dimension_, long firstIndex_, long size_); 79 | THCL_API THClTensor *THClTensor_newTranspose(THClState *state, THClTensor *tensor, int dimension1_, int dimension2_); 80 | THCL_API THClTensor *THClTensor_newUnfold(THClState *state, THClTensor *tensor, int dimension_, long size_, long step_); 81 | 82 | THCL_API void THClTensor_resize(THClState *state, THClTensor *tensor, THLongStorage *size, THLongStorage *stride); 83 | THCL_API void THClTensor_resizeAs(THClState *state, THClTensor *tensor, THClTensor *src); 84 | THCL_API void THClTensor_resize0d(THClState *state, THClTensor *tensor); 85 | THCL_API void THClTensor_resize1d(THClState *state, THClTensor *tensor, long size0_); 86 | THCL_API void THClTensor_resize2d(THClState *state, THClTensor *tensor, long size0_, long size1_); 87 | THCL_API void THClTensor_resize3d(THClState *state, THClTensor *tensor, long size0_, long size1_, long size2_); 88 | THCL_API void THClTensor_resize4d(THClState *state, THClTensor *tensor, long size0_, long size1_, long size2_, long size3_); 89 | THCL_API void THClTensor_resize5d(THClState *state, THClTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_); 90 | 91 | THCL_API void THClTensor_set(THClState *state, THClTensor *self, THClTensor *src); 92 | THCL_API void THClTensor_setStorage(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_); 93 | THCL_API void THClTensor_setStorage1d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 94 | long size0_, long stride0_); 95 | THCL_API void THClTensor_setStorage2d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 96 | long size0_, long stride0_, 97 | long size1_, long stride1_); 98 | THCL_API void THClTensor_setStorage3d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 99 | long size0_, long stride0_, 100 | long size1_, long stride1_, 101 | long size2_, long stride2_); 102 | THCL_API void THClTensor_setStorage4d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 103 | long size0_, long stride0_, 104 | long size1_, long stride1_, 105 | long size2_, long stride2_, 106 | long size3_, long stride3_); 107 | 108 | THCL_API void THClTensor_narrow(THClState *state, THClTensor *self, THClTensor *src, int dimension_, long firstIndex_, long size_); 109 | THCL_API void THClTensor_select(THClState *state, THClTensor *self, THClTensor *src, int dimension_, long sliceIndex_); 110 | THCL_API void 
THClTensor_transpose(THClState *state, THClTensor *self, THClTensor *src, int dimension1_, int dimension2_); 111 | THCL_API void THClTensor_unfold(THClState *state, THClTensor *self, THClTensor *src, int dimension_, long size_, long step_); 112 | 113 | THCL_API void THClTensor_squeeze(THClState *state, THClTensor *self, THClTensor *src); 114 | THCL_API void THClTensor_squeeze1d(THClState *state, THClTensor *self, THClTensor *src, int dimension_); 115 | 116 | THCL_API int THClTensor_isContiguous(THClState *state, const THClTensor *self); 117 | THCL_API int THClTensor_isSameSizeAs(THClState *state, const THClTensor *self, const THClTensor *src); 118 | THCL_API long THClTensor_nElement(THClState *state, const THClTensor *self); 119 | 120 | THCL_API void THClTensor_retain(THClState *state, THClTensor *self); 121 | THCL_API void THClTensor_free(THClState *state, THClTensor *self); 122 | THCL_API void THClTensor_freeCopyTo(THClState *state, THClTensor *self, THClTensor *dst); 123 | 124 | /* Slow access methods [check everything] */ 125 | THCL_API void THClTensor_set1d(THClState *state, THClTensor *tensor, long x0, float value); 126 | THCL_API void THClTensor_set2d(THClState *state, THClTensor *tensor, long x0, long x1, float value); 127 | THCL_API void THClTensor_set3d(THClState *state, THClTensor *tensor, long x0, long x1, long x2, float value); 128 | THCL_API void THClTensor_set4d(THClState *state, THClTensor *tensor, long x0, long x1, long x2, long x3, float value); 129 | 130 | THCL_API float THClTensor_get1d(THClState *state, const THClTensor *tensor, long x0); 131 | THCL_API float THClTensor_get2d(THClState *state, const THClTensor *tensor, long x0, long x1); 132 | THCL_API float THClTensor_get3d(THClState *state, const THClTensor *tensor, long x0, long x1, long x2); 133 | THCL_API float THClTensor_get4d(THClState *state, const THClTensor *tensor, long x0, long x1, long x2, long x3); 134 | 135 | /* GPU-specific functions */ 136 | //THCL_API cudaTextureObject_t THClTensor_getTextureObject(THClState *state, THClTensor *self); 137 | THCL_API int THClTensor_getDevice(THClState *state, const THClTensor *self); 138 | THCL_API int THClTensor_checkGPU(THClState *state, unsigned int nTensors, ...); 139 | 140 | // new 141 | #ifdef __cplusplus 142 | THCL_API_CPP std::string THClTensor_toString(THClState *state, const THClTensor *tensor); 143 | THCL_API EasyCL *THClTensor_getCl(THClState *state, const THClTensor *tensor); 144 | #endif // __cplusplus 145 | THCL_API int THClTensor_getDevice(THClState *state, const THClTensor *tensor); 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src/lib/THClTensorCopy.h: -------------------------------------------------------------------------------- 1 | #ifndef TH_CL_TENSOR_COPY_INC 2 | #define TH_CL_TENSOR_COPY_INC 3 | 4 | #include "THClTensor.h" 5 | #include "THClGeneral.h" 6 | 7 | THCL_API void THClTensor_copy(THClState *state, THClTensor *self, THClTensor *src); 8 | THCL_API void THClTensor_copyByte(THClState *state, THClTensor *self, THByteTensor *src); 9 | THCL_API void THClTensor_copyChar(THClState *state, THClTensor *self, THCharTensor *src); 10 | THCL_API void THClTensor_copyShort(THClState *state, THClTensor *self, THShortTensor *src); 11 | THCL_API void THClTensor_copyInt(THClState *state, THClTensor *self, THIntTensor *src); 12 | THCL_API void THClTensor_copyLong(THClState *state, THClTensor *self, THLongTensor *src); 13 | THCL_API void THClTensor_copyFloat(THClState *state, THClTensor *self, 
THFloatTensor *src); 14 | THCL_API void THClTensor_copyDouble(THClState *state, THClTensor *self, THDoubleTensor *src); 15 | 16 | THCL_API void THByteTensor_copyCl(THClState *state, THByteTensor *self, THClTensor *src); 17 | THCL_API void THCharTensor_copyCl(THClState *state, THCharTensor *self, THClTensor *src); 18 | THCL_API void THShortTensor_copyCl(THClState *state, THShortTensor *self, THClTensor *src); 19 | THCL_API void THIntTensor_copyCl(THClState *state, THIntTensor *self, THClTensor *src); 20 | THCL_API void THLongTensor_copyCl(THClState *state, THLongTensor *self, THClTensor *src); 21 | THCL_API void THFloatTensor_copyCl(THClState *state, THFloatTensor *self, THClTensor *src); 22 | THCL_API void THDoubleTensor_copyCl(THClState *state, THDoubleTensor *self, THClTensor *src); 23 | THCL_API void THClTensor_copyCl(THClState *state, THClTensor *self, THClTensor *src); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/lib/THClTensorIndex.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorIndex.cu: 2 | 3 | kernel void THClTensor_kernel_indexFill( 4 | global float *tensor_data, int tensor_offset, 5 | global int* stride, 6 | global float *index_data, int index_offset, 7 | int src_nDim, 8 | int dim, int idx_size, int tensor_size, int size_dim, float val 9 | ) 10 | { 11 | int thread_idx = get_group_id(0) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0); 12 | 13 | long flat_size = tensor_size / idx_size; 14 | 15 | if (thread_idx < flat_size) 16 | { 17 | long coeff = 0; 18 | for (int i=0; i dim) 31 | { 32 | coeff = leftover / stride[d]; 33 | leftover -= coeff * stride[d]; 34 | srcIdx += coeff * stride[d]; 35 | } 36 | } 37 | tensor_data[tensor_offset + srcIdx + (int)((index_data[index_offset + i])-1)*stride[dim] ] = val; 38 | } 39 | } 40 | } 41 | 42 | kernel void THClTensor_kernel_indexCopy( 43 | global float *res_data, int res_offset, 44 | global float *src_data, int src_offset, 45 | global int* res_stride, global float *index_data, int index_offset, 46 | int res_nDim, int dim, int idx_size, int src_size, int size_dim 47 | ) 48 | { 49 | int thread_idx = get_group_id(0) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0); 50 | 51 | long flat_size = src_size / idx_size; 52 | 53 | if (thread_idx < flat_size) 54 | { 55 | long coeff = 0; 56 | for (int i=0; i dim) 72 | { 73 | coeff = leftover / res_stride[d]; 74 | leftover -= coeff * res_stride[d]; 75 | targetIdx += coeff * res_stride[d]; 76 | resIdx += coeff * res_stride[d]; 77 | } 78 | } 79 | res_data[res_offset + resIdx + ((int)(index_data[index_offset + i])-1)*res_stride[dim] ] = src_data[src_offset + targetIdx + i*res_stride[dim] ]; 80 | } 81 | } 82 | } 83 | 84 | kernel void THClTensor_kernel_indexSelect( 85 | global float *tensor_data, int tensor_offset, global float *src_data, int src_offset, 86 | global int* src_stride, global float *index_data, int index_offset, 87 | int src_nDim, int dim, int idx_size, int tensor_size, int size_dim 88 | ) 89 | { 90 | int thread_idx = get_group_id(0) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0); 91 | 92 | long flat_size = tensor_size / idx_size; 93 | 94 | if (thread_idx < flat_size) 95 | { 96 | long coeff = 0; 97 | for (int i=0; i dim) 113 | { 114 | coeff = leftover / src_stride[d]; 115 | leftover -= coeff * src_stride[d]; 116 | targetIdx += coeff 
* src_stride[d]; 117 | srcIdx += coeff * src_stride[d]; 118 | } 119 | } 120 | tensor_data[tensor_offset + targetIdx + i*src_stride[dim] ] = src_data[src_offset + srcIdx + ((int)(index_data[index_offset + i])-1)*src_stride[dim] ]; 121 | } 122 | } 123 | } 124 | 125 | -------------------------------------------------------------------------------- /src/lib/THClTensorInfoCl.cl: -------------------------------------------------------------------------------- 1 | typedef struct THClTensorInfoCl { 2 | unsigned int sizes[{{MAX_CLTORCH_DIMS}}]; 3 | unsigned int strides[{{MAX_CLTORCH_DIMS}}]; 4 | int offset; 5 | int dims; 6 | } TensorInfoCl; 7 | 8 | -------------------------------------------------------------------------------- /src/lib/THClTensorMasked.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMasked.cu: 2 | 3 | struct TensorMaskedFillOp { 4 | TensorMaskedFillOp(float v) : value(v) {} 5 | /*__device__*/ /*__forceline__*/ void operator()(float* t, float* mask) { 6 | // Really mask should be `0` or `1` but we can't propagate errors here. 7 | if (*mask != 0.0f) { 8 | *t = value; 9 | } 10 | } 11 | 12 | float value; 13 | }; 14 | 15 | struct TensorMaskedCopyOp { 16 | TensorMaskedCopyOp(float* s, float* bm, float* ps) 17 | : src(s), 18 | baseMask(bm), 19 | maskPrefixSum(ps) { 20 | } 21 | 22 | /*__device__*/ /*__forceline__*/ void operator()(float* out, float* mask) { 23 | // Really mask should be `0` or `1` but we can't propagate errors here. 24 | if (*mask != 0.0f) { 25 | // We've already checked that this offset is <= 2^24, so this is ok. 26 | int srcOffset = (int) (mask - baseMask); 27 | *out = src[(int) maskPrefixSum[srcOffset]]; 28 | } 29 | } 30 | 31 | // Where we are copying from 32 | float* src; 33 | 34 | // The base address of mask so we can calculate offset 35 | float* baseMask; 36 | 37 | // The index we are copying from 38 | float* maskPrefixSum; 39 | }; 40 | 41 | struct TensorMaskedSelectOp { 42 | TensorMaskedSelectOp(float* t) : out(t) {} 43 | /*__device__*/ /*__forceline__*/ void operator()(float* mask, float* maskPrefixSum, float* in) { 44 | // Really mask should be `0` or `1` but we can't propagate errors here. 45 | if (*mask != 0.0f) { 46 | out[(int) *maskPrefixSum] = *in; 47 | } 48 | } 49 | 50 | float* out; 51 | }; 52 | 53 | -------------------------------------------------------------------------------- /src/lib/THClTensorMath2.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMath2.cu: 2 | 3 | // Given the sum of values and the sum of squares, compute the variance or standard deviation. 4 | template 5 | /*__forceline__*/ /*__device__*/ float THClTensor_computeVar(float sum, float sum2, unsigned row_size) { 6 | if (flag) { 7 | sum /= row_size; 8 | sum2 /= row_size; 9 | sum2 -= sum * sum; 10 | sum2 = (sum2 < 0 ? 0 : sum2); 11 | } 12 | else { 13 | sum /= row_size; 14 | sum2 /= row_size - 1; 15 | sum2 -= ((float)row_size) / ((float)(row_size - 1)) * sum * sum; 16 | sum2 = (sum2 < 0 ? 0 : sum2); 17 | } 18 | if (apply_sqrt) 19 | return sqrt(sum2); 20 | else 21 | return sum2; 22 | } 23 | 24 | /* Compute the variance (or standard deviation) along an outer dimension of a tensor. 
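 * (Editorial note on THClTensor_computeVar above: with flag set it returns
 * the biased estimate sum2/n - (sum/n)^2, i.e. E[x^2] - E[x]^2, clamped at
 * zero against floating-point rounding; without flag it applies Bessel's
 * correction, sum2/(n-1) - (n/(n-1)) * (sum/n)^2, the unbiased sample
 * variance; apply_sqrt turns either result into a standard deviation.)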
25 | * 26 | * - num_orows is the size of the flattened outer dimensions; 27 | * - num_irows is the size of the flattened inner dimensions; 28 | * - row_size is the size of the dimension along which to compute the variance; 29 | * - if flag is set, normalize by `row_size` instead of `row_size - 1` 30 | * - if apply_sqrt is set, compute the standard deviation instead of variance 31 | * 32 | * The dimensions to the outside and inside of the specified dimension are considered as flattened. 33 | * Thread blocks with the same get_group_id(1) process an "outer row" (i.e. an element of the flattened 34 | * outer dimensions, which contains several "inner rows"). 35 | * Each thread processes a single inner row at a time. 36 | */ 37 | template 38 | kernel void THClTensor_kernel_varOuterDim(float *tgt, float *src_, unsigned num_orows, unsigned num_irows, unsigned row_size) 39 | { 40 | for (unsigned orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { 41 | for (unsigned irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { 42 | float *src = src_ + orow * row_size * num_irows + irow; 43 | float sum = 0, sum2 = 0; 44 | 45 | for (unsigned col = 0; col < row_size; ++col) { 46 | float val = *src; 47 | sum += val; 48 | sum2 += val * val; 49 | 50 | src += num_irows; 51 | } 52 | 53 | tgt[orow * num_irows + irow] = THClTensor_computeVar(sum, sum2, row_size); 54 | } 55 | } 56 | } 57 | 58 | /* Compute the variance (or standard deviation) of the innermost dimension of a tensor. 59 | * 60 | * - num_rows is the size of the flattened outer dimensions; 61 | * - row_size is the size of the innermost dimension; 62 | * - if flag is set, normalize by `row_size` instead of `row_size - 1` 63 | * - if apply_sqrt is set, compute the standard deviation instead of variance 64 | * 65 | * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is 66 | * considered as having 'num_rows' rows of size 'row_size'. 67 | * Each thread block processes one or more sets of contiguous rows (processing multiple rows 68 | * per thread block is quicker than processing a single row, especially for short rows). 69 | */ 70 | template 71 | kernel void THClTensor_kernel_varInnermostDim(float *tgt, float *src_, unsigned num_rows, unsigned row_size) 72 | { 73 | local float ssum[32][16]; 74 | local float ssum2[32][16]; 75 | 76 | for (unsigned block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { 77 | unsigned row = block_row + get_local_id(1); 78 | float sum = 0, sum2 = 0; 79 | if (row < num_rows) { 80 | float *src = src_ + row * row_size; 81 | // Sequential reduction within a thread. 82 | for (unsigned col = get_local_id(0); col < row_size; col += get_local_size(0)) { 83 | float val = src[col]; 84 | sum += val; 85 | sum2 += val * val; 86 | } 87 | } 88 | ssum[get_local_id(1)][get_local_id(0)] = sum; 89 | ssum2[get_local_id(1)][get_local_id(0)] = sum2; 90 | barrier(CLK_LOCAL_MEM_FENCE); 91 | 92 | // Reduce intermediate values to single value. 
93 | for (unsigned s = 8; s > 1; s >>= 1) { 94 | if (row < num_rows && get_local_id(0) < s) { 95 | ssum[get_local_id(1)][get_local_id(0)] += ssum[get_local_id(1)][get_local_id(0) + s]; 96 | ssum2[get_local_id(1)][get_local_id(0)] += ssum2[get_local_id(1)][get_local_id(0) + s]; 97 | } 98 | barrier(CLK_LOCAL_MEM_FENCE); 99 | } 100 | 101 | if (row < num_rows && get_local_id(0) == 0) { 102 | sum = ssum[get_local_id(1)][0] + ssum[get_local_id(1)][1]; 103 | sum2 = ssum2[get_local_id(1)][0] + ssum2[get_local_id(1)][1]; 104 | tgt[row] = THClTensor_computeVar(sum, sum2, row_size); 105 | } 106 | barrier(CLK_LOCAL_MEM_FENCE); 107 | } 108 | } 109 | 110 | kernel void THClTensor_kernel_renorm(float *data, const float value, const long size, const float maxnorm) 111 | { 112 | local float buffer[32]; 113 | long tx = get_local_id(0); 114 | long bx = get_group_id(0); 115 | long step = get_local_size(0); 116 | float *row = data + size*bx; 117 | 118 | buffer[tx] = 0; 119 | 120 | // get norm of axis 121 | for (long i=tx; i<size; i+=step) 122 | { 123 | buffer[tx] += pow(fabs(row[i]), value); 124 | } 125 | // add (reduce) 126 | for (long stride = step >> 1; stride > 0; stride >>= 1) 127 | { 128 | barrier(CLK_LOCAL_MEM_FENCE); 129 | if (tx < stride) 130 | buffer[tx] += buffer[tx+stride]; 131 | } 132 | // clip norms 133 | barrier(CLK_LOCAL_MEM_FENCE); 134 | float norm = pow(buffer[0], 1/value); 135 | if (norm > maxnorm) 136 | { 137 | norm = maxnorm / (norm + 1e-7); 138 | // renormalize 139 | for (long i=tx; i<size; i+=step) 140 | { 141 | row[i] *= norm; 142 | } 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathCompare.cpp: -------------------------------------------------------------------------------- 1 | #include <string> 2 | 3 | #include "THClTensorMath.h" 4 | #include "THClGeneral.h" 5 | //#include "THClBlas.h" 6 | #include "THClTensorCopy.h" 7 | //#include "THCTensorRandom.h" 8 | #include "THClApply.h" 9 | #include "THClTensorMathCompare.h" 10 | 11 | using namespace std; 12 | 13 | #ifndef DIVUP 14 | #define DIVUP(x, y) (((x) + (y) - 1) / (y)) 15 | #endif 16 | 17 | void THClTensor_logicalValue(THClState *state, THClTensor *self_, THClTensor *src, HasOperator2 *op) 18 | { 19 | THClTensor_resizeAs(state, self_, src); 20 | 21 | if (!THClTensor_pointwiseApply2(state, self_, src, op)) { 22 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 23 | } 24 | } 25 | 26 | class TensorGenCompareValueOp : public HasOperator2, public HasScalars { 27 | public: 28 | int getNumScalars() const { return 1; } 29 | float getScalar( int index ) const { return val; } 30 | TensorGenCompareValueOp(std::string op, float v) : 31 | val(v), 32 | op(op) {} 33 | string operator2() const { 34 | return "*out = (*in1 " + op + " val1)"; 35 | } 36 | const float val; 37 | std::string op; 38 | }; 39 | 40 | #define GENERATE_THClTensor_LogValue(NAME, OP) \ 41 | void THClTensor_##NAME##Value(THClState *state, THClTensor *self_, THClTensor *src, float value) \ 42 | { \ 43 | THAssert(THClTensor_checkGPU(state, 2, self_, src)); \ 44 | TensorGenCompareValueOp op(#OP, value); \ 45 | THClTensor_logicalValue(state, self_, src, &op); \ 46 | } 47 | 48 | GENERATE_THClTensor_LogValue(ge, >=) 49 | GENERATE_THClTensor_LogValue(ne, !=) 50 | GENERATE_THClTensor_LogValue(eq, ==) 51 | GENERATE_THClTensor_LogValue(le, <=) 52 | GENERATE_THClTensor_LogValue(lt, <) 53 | GENERATE_THClTensor_LogValue(gt, >) 54 | 55 | class TensorGenComparePointTensorOp : public HasOperator2, public HasPointTensors { 56 | public: 57 | int getNumPointTensors() const { return 1; } 58 | const THClTensor *getPointTensor( int index ) const { return val; } 59 | TensorGenComparePointTensorOp(std::string op, THClTensor *v) : 60 | val(v), 61 | op(op) {} 62 | string operator2() const { 63 | return "*out = (*in1 " + op + " *pointTensor1)"; 64 | } 65 | const THClTensor *val; 66 | std::string op; 67 | }; 68 | 69 | #define
GENERATE_THClTensor_LogPointTensor(NAME, OP) \ 70 | void THClTensor_##NAME##PointTensor(THClState *state, THClTensor *self_, THClTensor *src, THClTensor *value) \ 71 | { \ 72 | THAssert(THClTensor_checkGPU(state, 3, self_, src, value)); \ 73 | TensorGenComparePointTensorOp op(#OP, value); \ 74 | THClTensor_logicalValue(state, self_, src, &op); \ 75 | } 76 | 77 | GENERATE_THClTensor_LogPointTensor(ge, >=) 78 | GENERATE_THClTensor_LogPointTensor(ne, !=) 79 | GENERATE_THClTensor_LogPointTensor(eq, ==) 80 | GENERATE_THClTensor_LogPointTensor(le, <=) 81 | GENERATE_THClTensor_LogPointTensor(lt, <) 82 | GENERATE_THClTensor_LogPointTensor(gt, >) 83 | 84 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathCompare.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | void THClTensor_logicalValue(THClState *state, THClTensor *self_, THClTensor *src, HasOperator2 *op); 4 | void THClTensor_logicalTensor(THClState *state, THClTensor *self_, THClTensor *src1, THClTensor *src2, HasOperator3 *op); 5 | 6 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathCompareT.cpp: -------------------------------------------------------------------------------- 1 | #include <string> 2 | 3 | #include "THClTensorMath.h" 4 | #include "THClGeneral.h" 5 | //#include "THCBlas.h" 6 | #include "THClTensorCopy.h" 7 | //#include "THCTensorRandom.h" 8 | #include "THClApply.h" 9 | //#include "THCReduce.cuh" 10 | #include "THClTensorMathCompare.h" 11 | 12 | using namespace std; 13 | 14 | #ifndef DIVUP 15 | #define DIVUP(x, y) (((x) + (y) - 1) / (y)) 16 | #endif 17 | 18 | void THClTensor_logicalTensor(THClState *state, THClTensor *self_, THClTensor *src1, THClTensor *src2, HasOperator3 *op) 19 | { 20 | THClTensor_resizeAs(state, self_, src1); 21 | THArgCheck(THClTensor_nElement(state, src1) == THClTensor_nElement(state, src2), 3, "sizes do not match"); 22 | 23 | if (!THClTensor_pointwiseApply3(state, self_, src1, src2, op)) { 24 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 25 | } 26 | } 27 | 28 | class TensorGenLogOp : public HasOperator3 { 29 | public: 30 | string logop; 31 | TensorGenLogOp(string logop) { 32 | this->logop = logop; 33 | } 34 | string operator3() const { 35 | return "*out = (float) (*in1 " + logop + " *in2)"; 36 | } 37 | }; 38 | 39 | #define GENERATE_THClTensor_LogOpTensor(NAME, LOGOP) \ 40 | void THClTensor_##NAME##Tensor(THClState *state, THClTensor *self_, THClTensor *src1, THClTensor *src2) \ 41 | { \ 42 | if( src2->nDimension == 0 ) { \ 43 | THClTensor_##NAME##PointTensor(state, self_, src1, src2); \ 44 | return; \ 45 | } \ 46 | THAssert(THClTensor_checkGPU(state, 3, self_, src1, src2)); \ 47 | TensorGenLogOp op(#LOGOP); \ 48 | THClTensor_logicalTensor(state, self_, src1, src2, &op); \ 49 | } 50 | 51 | GENERATE_THClTensor_LogOpTensor(lt, <) 52 | GENERATE_THClTensor_LogOpTensor(gt, >) 53 | GENERATE_THClTensor_LogOpTensor(le, <=) 54 | GENERATE_THClTensor_LogOpTensor(ge, >=) 55 | GENERATE_THClTensor_LogOpTensor(ne, !=) 56 | GENERATE_THClTensor_LogOpTensor(eq, ==) 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathPairwise.cpp: -------------------------------------------------------------------------------- 1 | #include "THClTensorMath.h" 2 | #include "THClGeneral.h" 3 | //#include "THCBlas.h" 4 | #include "THClTensorCopy.h" 5 | //#include "THCTensorRandom.h" 6 | #include "THClApply.h" 7 |
//#include "THCReduce.cuh" 8 | 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | #ifndef DIVUP 15 | #define DIVUP(x, y) (((x) + (y) - 1) / (y)) 16 | #endif 17 | 18 | class TensorAddConstantOp : public HasOperator1, public HasOperator2, public HasScalars { 19 | public: 20 | int getNumScalars() const { return 1; } 21 | float getScalar( int index ) const { return val; } 22 | TensorAddConstantOp(float v) : val(v) {} 23 | string operator2() const { 24 | return "*out = *in1 + val1"; 25 | } 26 | string operator1() const { 27 | return "*out += val1"; 28 | } 29 | const float val; 30 | }; 31 | 32 | class TensorAddPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 33 | public: 34 | int getNumPointTensors() const { return 1; } 35 | const THClTensor *getPointTensor( int index ) const { return val; } 36 | TensorAddPointTensorOp(THClTensor *v) : val(v) {} 37 | string operator2() const { 38 | return "*out = *in1 + *pointTensor1"; 39 | } 40 | string operator1() const { 41 | return "*out += *pointTensor1"; 42 | } 43 | const THClTensor *val; 44 | }; 45 | 46 | class TensorSubPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 47 | public: 48 | int getNumPointTensors() const { return 1; } 49 | const THClTensor *getPointTensor( int index ) const { return val; } 50 | TensorSubPointTensorOp(THClTensor *v) : val(v) {} 51 | string operator2() const { 52 | return "*out = *in1 - *pointTensor1"; 53 | } 54 | string operator1() const { 55 | return "*out -= *pointTensor1"; 56 | } 57 | const THClTensor *val; 58 | }; 59 | 60 | void THClTensor_add(THClState *state, THClTensor *self_, THClTensor *src_, float value) 61 | { 62 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 63 | if (self_ == src_) { 64 | TensorAddConstantOp op(value); 65 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 66 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 67 | } 68 | } else { 69 | THClTensor_resizeAs(state, self_, src_); 70 | 71 | TensorAddConstantOp op(value); 72 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 73 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 74 | } 75 | } 76 | } 77 | 78 | void THClTensor_sub(THClState *state, THClTensor *self_, THClTensor *src_, float value) 79 | { 80 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 81 | TensorAddConstantOp op(-value); 82 | if (self_ == src_) { 83 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 84 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 85 | } 86 | } else { 87 | THClTensor_resizeAs(state, self_, src_); 88 | 89 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 90 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 91 | } 92 | } 93 | } 94 | 95 | class TensorMulConstantOp : public HasOperator2, public HasOperator1, public HasScalars { 96 | public: 97 | int getNumScalars() const { return 1; } 98 | float getScalar( int index ) const { return val; } 99 | TensorMulConstantOp(float v) : val(v) {} 100 | string operator2() const { 101 | return "*out = *in1 * val1"; 102 | } 103 | string operator1() const { 104 | return "*out *= val1"; 105 | } 106 | const float val; 107 | }; 108 | 109 | class TensorMulPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 110 | public: 111 | int getNumPointTensors() const { return 1; } 112 | const THClTensor *getPointTensor( int index ) const { return val; } 113 | TensorMulPointTensorOp(THClTensor *v) : val(v) {} 114 | string operator2() const { 115 | return "*out = *in1 * *pointTensor1"; 116 | } 117 | string 
operator1() const { 118 | return "*out *= *pointTensor1"; 119 | } 120 | const THClTensor *val; 121 | }; 122 | 123 | class TensorDivPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 124 | public: 125 | int getNumPointTensors() const { return 1; } 126 | const THClTensor *getPointTensor( int index ) const { return val; } 127 | TensorDivPointTensorOp(THClTensor *v) : val(v) {} 128 | string operator2() const { 129 | return "*out = *in1 / *pointTensor1"; 130 | } 131 | string operator1() const { 132 | return "*out /= *pointTensor1"; 133 | } 134 | const THClTensor *val; 135 | }; 136 | 137 | void THClTensor_mul(THClState *state, THClTensor *self_, THClTensor *src_, float value) 138 | { 139 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 140 | if (self_ == src_) { 141 | TensorMulConstantOp op(value); 142 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 143 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 144 | } 145 | } else { 146 | THClTensor_resizeAs(state, self_, src_); 147 | 148 | TensorMulConstantOp op(value); 149 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 150 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 151 | } 152 | } 153 | } 154 | 155 | void THClTensor_add_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 156 | { 157 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 158 | TensorAddPointTensorOp op(value_); 159 | if (self_ == src_) { 160 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 161 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 162 | } 163 | } else { 164 | THClTensor_resizeAs(state, self_, src_); 165 | 166 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 167 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 168 | } 169 | } 170 | } 171 | 172 | void THClTensor_sub_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 173 | { 174 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 175 | TensorSubPointTensorOp op(value_); 176 | if (self_ == src_) { 177 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 178 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 179 | } 180 | } else { 181 | THClTensor_resizeAs(state, self_, src_); 182 | 183 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 184 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 185 | } 186 | } 187 | } 188 | 189 | void THClTensor_mul_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 190 | { 191 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 192 | TensorMulPointTensorOp op(value_); 193 | if (self_ == src_) { 194 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 195 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 196 | } 197 | } else { 198 | THClTensor_resizeAs(state, self_, src_); 199 | 200 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 201 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 202 | } 203 | } 204 | } 205 | 206 | void THClTensor_div_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 207 | { 208 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 209 | TensorDivPointTensorOp op(value_); 210 | if (self_ == src_) { 211 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 212 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 213 | } 214 | } else { 215 | THClTensor_resizeAs(state, self_, src_); 216 | 217 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 218 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 219 | } 220 | } 221 | } 222 | 223 | void 
THClTensor_div(THClState* state, THClTensor *self_, THClTensor *src_, float value) 224 | { 225 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 226 | THArgCheck(value != 0.0f, 3, "divide by zero"); 227 | 228 | if (self_ == src_) { 229 | TensorMulConstantOp op(1.0f / value); 230 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 231 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 232 | } 233 | } else { 234 | THClTensor_resizeAs(state, self_, src_); 235 | 236 | TensorMulConstantOp op(1.0f / value); 237 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 238 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 239 | } 240 | } 241 | } 242 | 243 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathPointwise.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | class TensorSigmoidOp : public HasOperator1, public HasOperator2 { 4 | public: 5 | TensorSigmoidOp() { 6 | } 7 | std::string operator1() const { 8 | return "*out = 1.0f / (1 + exp(- (*out)))"; 9 | } 10 | std::string operator2() const { 11 | return "*out = 1.0f / (1 + exp(- (*in1)))"; 12 | } 13 | }; 14 | 15 | class TensorGenOp : public HasOperator1, public HasOperator2 { 16 | public: 17 | std::string cfun; 18 | TensorGenOp( std::string cfun ) { 19 | this->cfun = cfun; 20 | } 21 | std::string operator1() const { 22 | return "*out =" + cfun + "( *out )"; 23 | } 24 | std::string operator2() const { 25 | return "*out = " + cfun + "( *in1 )"; 26 | } 27 | }; 28 | 29 | class TensorGenOpFullInline1 : public HasOperator1 { 30 | public: 31 | std::string cfun; 32 | TensorGenOpFullInline1( std::string cfun ) { 33 | this->cfun = cfun; 34 | } 35 | std::string operator1() const { 36 | return cfun; 37 | } 38 | }; 39 | 40 | class TensorGenOpFullInline2 : public HasOperator2 { 41 | public: 42 | std::string cfun; 43 | TensorGenOpFullInline2( std::string cfun ) { 44 | this->cfun = cfun; 45 | } 46 | std::string operator2() const { 47 | return cfun; 48 | } 49 | }; 50 | 51 | class TensorGenOpFullInline3 : public HasOperator3 { 52 | public: 53 | std::string cfun; 54 | TensorGenOpFullInline3( std::string cfun ) { 55 | this->cfun = cfun; 56 | } 57 | std::string operator3() const { 58 | return cfun; 59 | } 60 | }; 61 | 62 | // used for maxall etc 63 | class MaxOp : public HasOperator2, public HasOperator3 { 64 | public: 65 | std::string operator2() const { 66 | return "*out = fmax(*out, *in1)"; 67 | } 68 | std::string operator3() const { 69 | return "*out = fmax(*in1, *in2)"; 70 | } 71 | }; 72 | 73 | // used for minall etc 74 | class MinOp : public HasOperator2, public HasOperator3 { 75 | public: 76 | std::string operator2() const { 77 | return "*out = fmin(*out, *in1)"; 78 | } 79 | std::string operator3() const { 80 | return "*out = fmin(*in1, *in2)"; 81 | } 82 | }; 83 | 84 | class TensorAddOp : public HasOperator2, public HasOperator3 { 85 | public: 86 | std::string operator2() const { 87 | return "*out += *in1"; 88 | } 89 | std::string operator3() const { 90 | return "*out = *in1 + *in2"; 91 | } 92 | }; 93 | 94 | class TensorCAddOp : public HasOperator2, public HasOperator3, public HasScalars { 95 | public: 96 | int getNumScalars() const { return 1; } 97 | float getScalar(int index) const { return val; } 98 | TensorCAddOp(float v) : val(v) {} 99 | std::string operator2() const { 100 | return "*out += val1 * *in1"; 101 | } 102 | std::string operator3() const { 103 | return "*out = *in1 + val1 * *in2"; 104 | } 105 | float val; 106 | }; 107 | 108 | 
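// How these Op classes fit together (a sketch; the kernel assembly itself lives in
// THClApply): each Op contributes an expression string, and the HasScalars /
// HasPointTensors interfaces tell the apply machinery which extra arguments to bind
// as val1 / pointTensor1 in the generated OpenCL source. For example, assuming r, a,
// b are already-created THClTensor pointers:
//   TensorCAddOp op(0.5f);                            // binds val1 = 0.5
//   THClTensor_pointwiseApply3(state, r, a, b, &op);
// splices op.operator3(), i.e. "*out = *in1 + val1 * *in2", into the kernel, giving
// r = a + 0.5 * b elementwise.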
class TensorSubOp : public HasOperator2, public HasOperator3 { 109 | public: 110 | std::string operator2() const { 111 | return "*out -= *in1"; 112 | } 113 | std::string operator3() const { 114 | return "*out = *in1 - *in2"; 115 | } 116 | }; 117 | 118 | class TensorCSubOp : public HasOperator2, public HasOperator3, public HasScalars { 119 | public: 120 | int getNumScalars() const { return 1; } 121 | float getScalar(int index) const { return val; } 122 | TensorCSubOp(float v) : val(v) {} 123 | std::string operator2() const { 124 | return "*out -= val1 * *in1"; 125 | } 126 | std::string operator3() const { 127 | return "*out = *in1 - val1 * *in2"; 128 | } 129 | float val; 130 | }; 131 | 132 | class TensorMulOp : public HasOperator2, public HasOperator3 { 133 | public: 134 | std::string operator2() const { 135 | return "*out *= *in1"; 136 | } 137 | std::string operator3() const { 138 | return "*out = (*in1) * (*in2)"; 139 | } 140 | }; 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathScan.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMathScan.cu: 2 | 3 | /* Perform an inclusive scan along an outer dimension of a tensor. 4 | * 5 | * - num_orows is the size of the flattened outer dimensions; 6 | * - num_irows is the size of the flattened inner dimensions; 7 | * - row_size is the size of the dimension along which to scan; 8 | * 9 | * The dimensions to the outside and inside of the specified dimension are considered as flattened. 10 | * Thread blocks with the same get_group_id(1) process an "outer row" (i.e. an element of the flattened 11 | * outer dimensions, which contains several "inner rows"). 12 | * Each thread processes a single inner row at a time. 13 | */ 14 | 15 | static inline float binary_op(float _in1, float _in2) { 16 | // hope the compiler can handle this :-P 17 | float _out; 18 | float *out = &_out; 19 | float *in1 = &_in1; 20 | float *in2 = &_in2; 21 | *out = 10 * (*in2) * (*in1); 22 | {{operator3}}; 23 | // *out = (*in1) * (*in2); 24 | return _out; 25 | } 26 | 27 | kernel void THClTensor_kernel_scanOuterDim( 28 | global float *tgt_data, int tgt_offset, 29 | global float *src_data, int src_offset, 30 | int num_orows, int num_irows, int row_size, 31 | float init) 32 | { 33 | for (unsigned orow = get_group_id(0); (int)orow < num_orows; orow += get_num_groups(0)) { 34 | for (unsigned irow = get_group_id(1) * get_local_size(0) + get_local_id(0); (int)irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { 35 | global float *src = src_data + src_offset + orow * row_size * num_irows + irow; 36 | global float *tgt = tgt_data + tgt_offset + orow * row_size * num_irows + irow; 37 | float acc = init; 38 | 39 | for (unsigned col = 0; (int)col < row_size; ++col) { 40 | acc = binary_op(acc, *src); 41 | // binary_op(&acc, &acc, src); 42 | *tgt = acc; 43 | 44 | src += num_irows; 45 | tgt += num_irows; 46 | } 47 | } 48 | } 49 | } 50 | 51 | /* Perform an inclusive scan along the innermost dimension of a tensor. 52 | * 53 | * - num_rows is the size of the flattened outer dimensions; 54 | * - row_size is the size of the innermost dimension; 55 | * 56 | * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is 57 | * considered as having 'num_rows' rows of size 'row_size'.
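 *
 * The scan operator itself is spliced into binary_op (near the top of this file) through
 * the {{operator3}} template slot, with 'init' as its identity element: presumably
 * "*out = *in1 + *in2" with init 0 for cumsum, and multiplication with init 1 for
 * cumprod. As a concrete trace, an inclusive sum-scan of the row {3, 1, 4, 1} writes
 * {3, 4, 8, 9}; each output is binary_op of the running accumulator and the next input.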
58 | * Each thread block processes one or more sets of contiguous rows (processing multiple rows 59 | * per thread block is quicker than processing a single row, especially for short rows). 60 | */ 61 | //template 62 | kernel void THClTensor_kernel_scanInnermostDim( 63 | global float *tgt_data, int tgt_offset, 64 | global float *src_data, int src_offset, 65 | int num_rows, int row_size, 66 | float init) 67 | { 68 | local float sbuf[{{num_threads_y}}][2 * {{num_threads_x}}]; 69 | 70 | local float* row_buf = sbuf[get_local_id(1)]; 71 | 72 | for (int block_row = get_group_id(0) * get_local_size(1); 73 | block_row < num_rows; 74 | block_row += get_local_size(1) * get_num_groups(0)) { 75 | int row = block_row + get_local_id(1); 76 | float block_total = init; 77 | 78 | global float *row_src = src_data + src_offset + row * row_size; 79 | global float *row_tgt = tgt_data + tgt_offset + row * row_size; 80 | 81 | // Perform scan on one block at a time, keeping track of the total value of 82 | // all blocks processed so far. 83 | for (int block_col = 0; block_col < (int)row_size; block_col += 2 * {{num_threads_x}}) { 84 | // Load data into shared memory (two values per thread). 85 | int col1 = block_col + get_local_id(0); 86 | int col2 = block_col + {{num_threads_x}} + get_local_id(0); 87 | if (row < num_rows) { 88 | if (col1 < row_size) { 89 | row_buf[get_local_id(0)] = row_src[col1]; 90 | } else { 91 | row_buf[get_local_id(0)] = init; 92 | } 93 | 94 | if (col2 < row_size) { 95 | row_buf[{{num_threads_x}} + get_local_id(0)] = row_src[col2]; 96 | } else { 97 | row_buf[{{num_threads_x}} + get_local_id(0)] = init; 98 | } 99 | 100 | // Add the total value of all previous blocks to the first value of this block. 101 | if (get_local_id(0) == 0) { 102 | row_buf[0] = binary_op(row_buf[0], block_total); 103 | // binary_op(row_buf, row_buf, &block_total); 104 | } 105 | } 106 | barrier(CLK_LOCAL_MEM_FENCE); 107 | 108 | // Parallel reduction (up-sweep). 109 | for (int s = {{num_threads_x}}, d = 1; s >= 1; s >>= 1, d <<= 1) { 110 | if (row < num_rows && (int)get_local_id(0) < s) { 111 | int offset = (2 * get_local_id(0) + 1) * d - 1; 112 | row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); 113 | // binary_op(row_bufer + offset + d, row_buf + offset, row_buf + offset + d); 114 | } 115 | barrier(CLK_LOCAL_MEM_FENCE); 116 | } 117 | 118 | // Down-sweep. 119 | for (int s = 2, d = {{num_threads_x}} / 2; d >= 1; s <<= 1, d >>= 1) { 120 | if (row < num_rows && (int)get_local_id(0) < s - 1) { 121 | int offset = 2 * (get_local_id(0) + 1) * d - 1; 122 | row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); 123 | // binary_op(row_buff + offset + d, row_buf + offset, row_buf + offset + d); 124 | } 125 | barrier(CLK_LOCAL_MEM_FENCE); 126 | } 127 | 128 | 129 | // Write back to output. 
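// At this point row_buf[0 .. 2*T-1] (T = {{num_threads_x}}) holds the inclusive prefix
// scan of the chunk just loaded: the up-sweep builds partial sums at stride boundaries
// and the down-sweep fills in the remaining positions, in the style of a work-efficient
// Brent-Kung scan. block_total then carries row_buf's last element into the next chunk.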
130 | if (row < num_rows) { 131 | if (col1 < row_size) row_tgt[col1] = row_buf[get_local_id(0)]; 132 | if (col2 < row_size) row_tgt[col2] = row_buf[{{num_threads_x}} + get_local_id(0)]; 133 | } 134 | block_total = row_buf[2 * {{num_threads_x}} - 1]; 135 | barrier(CLK_LOCAL_MEM_FENCE); 136 | 137 | } 138 | } 139 | } 140 | 141 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathTransformReduce.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMathTransformReduce.cu: 2 | 3 | typedef struct Pair { 4 | float first; 5 | float second; 6 | } Pair; 7 | 8 | static Pair binary_op( Pair a, Pair b ) { 9 | {{pair_operator2}}; 10 | } 11 | 12 | /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). 13 | These are useful when you not only have to do a reduction, but you might have 14 | to preserve the location of contention (for example min/max operations). 15 | The structure of the kernels follows the structure of the reduction kernels. 16 | */ 17 | kernel void THClTensor_kernel_transformReduceOuterDimIndex( 18 | global float *tgt1_data, int tgt1_offset, 19 | global float *tgt2_data, int tgt2_offset, 20 | global float *src__data, int src__offset, 21 | int num_orows, int num_irows, int row_size 22 | ) { 23 | global float *tgt1 = tgt1_data + tgt1_offset; 24 | global float *tgt2 = tgt2_data + tgt2_offset; 25 | global float *src_ = src__data + src__offset; 26 | 27 | for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { 28 | for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { 29 | global float *src = src_ + orow * row_size * num_irows + irow; 30 | Pair acc = {.first={{init}}, .second=-1}; 31 | for (int col = 0; col < row_size; ++col) { 32 | Pair lhs = {*src, col+1}; 33 | acc = binary_op( lhs, acc); 34 | // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing 35 | src += num_irows; 36 | } 37 | tgt1[orow * num_irows + irow] = acc.first; 38 | tgt2[orow * num_irows + irow] = acc.second; 39 | } 40 | } 41 | } 42 | 43 | /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) 44 | * 45 | * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: 46 | * 47 | * - block.x is the innermost dimension, i.e. dimension 0; 48 | * - block.y and grid.y make up dimension 1; and 49 | * - grid.x and grid.z are the remaining two outer dimensions (if any) 50 | * 51 | * Reduction along other dimensions is handled in a separate kernel.
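 *
 * {{init}} and {{pair_operator2}} are template slots filled in at kernel-build time. For
 * a max-with-index reduction, one plausible expansion (an illustration, not taken from
 * this repo) is init = -FLT_MAX with the pair operator
 *   if( a.first >= b.first ) { return a; } else { return b; }
 * so that .first carries the running maximum and .second its 1-based source index
 * (the col+1 written above).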
52 | */ 53 | kernel void THClTensor_kernel_transformReduceInnermostDimIndex( 54 | global float *tgt1_data, int tgt1_offset, 55 | global float *tgt2_data, int tgt2_offset, 56 | global float *src__data, int src__offset, 57 | int num_rows, int row_size 58 | ) { 59 | global float *tgt1 = tgt1_data + tgt1_offset; 60 | global float *tgt2 = tgt2_data + tgt2_offset; 61 | global float *src_ = src__data + src__offset; 62 | 63 | local float sbuf[{{y_threads}}][{{x_threads}}]; 64 | local float ibuf[{{y_threads}}][{{x_threads}}]; 65 | 66 | for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { 67 | int row = block_row + get_local_id(1); 68 | // thrust::pair<float, float> acc = init; 69 | Pair acc = { .first={{init}}, .second=-1 }; 70 | if (row < num_rows) { 71 | global float *src = src_ + row * row_size; 72 | // Sequential reduction within a thread. 73 | for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) { 74 | Pair lhs = {src[col], col+1}; 75 | acc = binary_op(lhs, acc); 76 | } 77 | } 78 | 79 | sbuf[get_local_id(1)][get_local_id(0)] = acc.first; 80 | ibuf[get_local_id(1)][get_local_id(0)] = acc.second; 81 | 82 | // Reduce intermediate values to single value. 83 | local float* sline = &sbuf[get_local_id(1)][0]; 84 | local float* iline = &ibuf[get_local_id(1)][0]; 85 | for (int s = 8; s > 0; s >>= 1) { 86 | if (row < num_rows && (int)get_local_id(0) < s) { 87 | Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]}; 88 | Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]}; 89 | Pair res = binary_op(arg1, arg2); 90 | sline[get_local_id(0)] = res.first; 91 | iline[get_local_id(0)] = res.second; 92 | } 93 | barrier(CLK_LOCAL_MEM_FENCE); 94 | } 95 | 96 | if (row < num_rows && get_local_id(0) == 0) { 97 | tgt1[row] = sline[0]; 98 | tgt2[row] = iline[0]; 99 | } 100 | barrier(CLK_LOCAL_MEM_FENCE); 101 | } 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClTypeParseTraits.cpp: -------------------------------------------------------------------------------- 1 | #include "THClTypeParseTraits.h" 2 | 3 | #define REGISTER_PARSE_TYPE(X) template <> struct TypeParseTraits<X> \ 4 | { static const char* name; } ; const char* TypeParseTraits<X>::name = #X 5 | 6 | #define REGISTER_PARSE_TYPE_DEFINITION(X) \ 7 | const char* TypeParseTraits<X>::name = #X 8 | 9 | 10 | REGISTER_PARSE_TYPE_DEFINITION(unsigned int); 11 | REGISTER_PARSE_TYPE_DEFINITION(unsigned long long); 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/lib/THClTypeParseTraits.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // adapted from http://stackoverflow.com/questions/1055452/c-get-name-of-type-in-template 4 | template <typename T> 5 | struct TypeParseTraits; 6 | 7 | #define REGISTER_PARSE_TYPE_DECLARATION(X) template <> struct TypeParseTraits<X> \ 8 | { static const char* name; } ; 9 | 10 | 11 | REGISTER_PARSE_TYPE_DECLARATION(unsigned int); 12 | REGISTER_PARSE_TYPE_DECLARATION(unsigned long); 13 | REGISTER_PARSE_TYPE_DECLARATION(unsigned long long); 14 | 15 | -------------------------------------------------------------------------------- /src/test/run-test-device.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: should be called from root directory 4 | 5 | source ~/torch/activate || exit 1 6 |
luarocks make rocks/cltorch-scm-1.rockspec || exit 1 7 | 8 | if [[ ! -v LUAEXE ]]; then { 9 | LUAEXE=luajit 10 | } fi 11 | echo using luaexe: ${LUAEXE} 12 | 13 | if [[ x${RUNGDB} == x1 ]]; then { 14 | rungdb.sh ${LUAEXE} test/test-device.lua 15 | } else { 16 | ${LUAEXE} test/test-device.lua 17 | } fi 18 | 19 | -------------------------------------------------------------------------------- /src/test/run-test-perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: should be called from root directory 4 | 5 | source ~/torch/activate || exit 1 6 | luarocks make rocks/cltorch-scm-1.rockspec || exit 1 7 | 8 | if [[ ! -v LUAEXE ]]; then { 9 | LUAEXE=luajit 10 | } fi 11 | echo using luaexe: ${LUAEXE} 12 | 13 | if [[ x${RUNGDB} == x1 ]]; then { 14 | rungdb.sh ${LUAEXE} test/test-perf.lua 15 | } else { 16 | ${LUAEXE} test/test-perf.lua 17 | } fi 18 | 19 | -------------------------------------------------------------------------------- /src/test/run-test-tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: should be called from root directory 4 | 5 | source ~/torch/activate || exit 1 6 | luarocks make rocks/cltorch-scm-1.rockspec || exit 1 7 | 8 | if [[ ! -v LUAEXE ]]; then { 9 | LUAEXE=luajit 10 | } fi 11 | echo using luaexe: ${LUAEXE} 12 | 13 | if [[ x${RUNGDB} == x1 ]]; then { 14 | rungdb.sh ${LUAEXE} test/test-tensor.lua 15 | } else { 16 | ${LUAEXE} test/test-tensor.lua 17 | } fi 18 | 19 | -------------------------------------------------------------------------------- /src/test/test-device.lua: -------------------------------------------------------------------------------- 1 | print("running require cltorch...") 2 | require 'cltorch' 3 | print("... 
require cltorch done") 4 | 5 | numDevices = cltorch.getDeviceCount() 6 | print('num devices:', numDevices) 7 | 8 | for device=1,numDevices do 9 | props = cltorch.getDeviceProperties(device) 10 | print('device properties, device', device) 11 | for k,v in pairs(props) do 12 | print(' ', k, v) 13 | end 14 | end 15 | 16 | for device=1,numDevices do 17 | cltorch.setDevice(device) 18 | c = torch.ClTensor{7,-4,5} 19 | print('c1\n', c) 20 | print(c:abs()) 21 | end 22 | 23 | --c = torch.ClTensor{7,4,5} 24 | --print('c1\n', c) 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/test/test-perf.lua: -------------------------------------------------------------------------------- 1 | require 'cltorch' 2 | require 'sys' 3 | 4 | function test_apply1(its) 5 | a = torch.ClTensor(50, 500) 6 | a:uniform() 7 | a:add(1) 8 | cltorch.dumpProfiling() 9 | cltorch.dumpTimings() 10 | for it=1,its do 11 | a:add(it) 12 | end 13 | cltorch.dumpTimings() 14 | end 15 | 16 | function test_apply2(its) 17 | a = torch.ClTensor(50, 500) 18 | a:uniform() 19 | b = torch.ClTensor(50, 500) 20 | b:uniform() 21 | a:add(b) 22 | cltorch.dumpProfiling() 23 | cltorch.dumpTimings() 24 | for it=1,its do 25 | a:add(b) 26 | end 27 | cltorch.dumpTimings() 28 | end 29 | 30 | function test_scatterFill(its) 31 | idx = torch.multinomial(torch.range(1,10):reshape(10,1):expand(10,10):t(),10):t():cl() 32 | a = torch.Tensor(10,10):uniform():mul(100):int():cl() 33 | c = a:scatter(1,idx,3) 34 | cltorch.dumpProfiling() 35 | cltorch.dumpTimings() 36 | for it=1,its do 37 | a:scatter(1,idx,it) 38 | end 39 | cltorch.dumpProfiling() 40 | cltorch.dumpTimings() 41 | end 42 | 43 | function test_apply3(its, size) 44 | its = its or 900 45 | size = size or 6400 46 | a = torch.ClTensor(size) 47 | a:uniform() 48 | b = torch.ClTensor(size) 49 | b:uniform() 50 | c = torch.ClTensor(size) 51 | c:uniform() 52 | a:cmul(b,c) 53 | cltorch.finish() 54 | sys.tic() 55 | cltorch.dumpProfiling() 56 | cltorch.dumpTimings() 57 | for it=1,its do 58 | a:cmul(b,c) 59 | end 60 | cltorch.finish() 61 | print(sys.toc() * 1000) 62 | print('after mul', its, size) 63 | cltorch.dumpTimings() 64 | cltorch.dumpProfiling() 65 | end 66 | 67 | function test_apply3b(its, size) 68 | its = its or 900 69 | size = size or 6400 70 | a = torch.ClTensor(size) 71 | a:uniform() 72 | b = torch.ClTensor(size) 73 | b:uniform() 74 | c = torch.ClTensor(size) 75 | c:uniform() 76 | d = torch.ClTensor(size) 77 | d:uniform() 78 | a:cmul(b,c) 79 | a:add(b,c) 80 | e = torch.ClTensor(1000,1000) 81 | cltorch.finish() 82 | cltorch.dumpProfiling() 83 | cltorch.dumpTimings() 84 | sys.tic() 85 | for it=1,its do 86 | --d:cmul(b,c) 87 | f = e:sum() 88 | c:add(a,d) 89 | end 90 | cltorch.finish() 91 | print('sys.toc', sys.toc() * 1000) 92 | print('after mul', its, size) 93 | cltorch.dumpTimings() 94 | cltorch.dumpProfiling() 95 | end 96 | 97 | cltorch.setAddFinish(1) 98 | cltorch.setDevice(1) 99 | --cltorch.setProfiling(1) 100 | cltorch.setTiming(1) 101 | --test_apply1(500) 102 | --test_apply2(500) 103 | -- test_scatterFill(10000) 104 | --test_apply3(900, 6400) 105 | test_apply3b(900, 6400) 106 | test_apply3(900, 64000) 107 | --test_apply3(900, 64000) 108 | --test_apply3(900, 640000) 109 | --cltorch.dumpProfiling() 110 | 111 | 112 | -------------------------------------------------------------------------------- /src/test/test-zsh.zsh: -------------------------------------------------------------------------------- 1 | 2 | ps 3 | source ~/torch/install/bin/torch-activate 4 
| env | grep LD 5 | env | grep PATH 6 | env | grep LUA 7 | luajit -e 'print("hello")' 8 | luajit -l torch -e 'print(torch.Tensor(3,2):uniform())' 9 | luajit -l cltorch -e 'cltorch.setAllowNonGpus(1); print(torch.ClTensor(3,2):uniform())' 10 | 11 | -------------------------------------------------------------------------------- /src/test/test_userkernel.lua: -------------------------------------------------------------------------------- 1 | require 'cltorch' 2 | 3 | k = torch.ClKernel({input={nElements='int', input='torch.ClTensor'},output={output='torch.ClTensor'},src=[[ 4 | int linearId = get_global_id(0); 5 | if(linearId < nElements) { 6 | output_data[linearId] = input_data[linearId] + 3.0f; 7 | } 8 | ]]}) 9 | print('k', k) 10 | k:print() 11 | 12 | x = torch.ClTensor({3,5,2}) 13 | y = torch.ClTensor({6,4,2}) 14 | print('x before\n', x) 15 | print('y before\n', y) 16 | 17 | k:run({nElements=3, input=x, output=y}, {numWorkgroups=10, workgroupSize=32}) 18 | 19 | print('y after\n', y) 20 | 21 | -------------------------------------------------------------------------------- /src/test/unit_storage.lua: -------------------------------------------------------------------------------- 1 | require 'string' 2 | 3 | local runtests = false 4 | if not cltorch then 5 | print('requiring cltorch') 6 | require 'cltorch' 7 | runtests = true 8 | end 9 | 10 | if not cltorch.tests then 11 | cltorch.tests = {} 12 | end 13 | 14 | cltorch.tests.storage = {} 15 | 16 | local function assertStrContains(target, value ) 17 | local res = string.find(target, value) 18 | if res == nil then 19 | print('assertStrContains fail: [' .. string.gsub(target, '\n', '\\n\n') .. '] not contains [' .. string.gsub(value, '\n', '\\n\n') .. ']') 20 | tester:assert(string.find(target, value) ~= nil) 21 | end 22 | end 23 | 24 | function cltorch.tests.storage.test_basic() 25 | tester:asserteq('[torch.ClStorage of size 0]\n', tostring(torch.ClStorage()), '') 26 | assertStrContains(tostring(torch.ClStorage(3)), '%[torch.ClStorage of size 3%]\n') 27 | tester:asserteq(tostring(torch.ClStorage{4,9,2}), ' 4\n 9\n 2\n[torch.ClStorage of size 3]\n') 28 | tester:asserteq(tostring(torch.ClStorage{1.5,2.4,5.3}), ' 1.5000\n 2.4000\n 5.3000\n[torch.ClStorage of size 3]\n') 29 | 30 | c = torch.ClStorage{4,9,2} 31 | c:fill(7) 32 | tester:asserteq(tostring(c), ' 7\n 7\n 7\n[torch.ClStorage of size 3]\n', '') 33 | 34 | c = torch.ClStorage{4,9,2} 35 | c:copy(torch.Storage{1.5,2.4,5.3}) 36 | tester:asserteq(tostring(c), ' 1.5000\n 2.4000\n 5.3000\n[torch.ClStorage of size 3]\n') 37 | 38 | a = torch.Storage(3) 39 | c = torch.ClStorage{4,9,2} 40 | a:copy(c) 41 | tester:asserteq(tostring(a), ' 4\n 9\n 2\n[torch.DoubleStorage of size 3]\n') 42 | 43 | -- removed, since copies whole buffer :-( 44 | -- c = torch.ClStorage{4,9,2} 45 | -- c[2] = 21 46 | -- tester:asserteq(tostring(c), '\n 4\n 21\n 2\n[torch.ClStorage of size 3]\n') 47 | 48 | c = torch.ClStorage{4,9,2} 49 | d = torch.ClStorage(3) 50 | d:copy(c) 51 | tester:asserteq(tostring(d), ' 4\n 9\n 2\n[torch.ClStorage of size 3]\n') 52 | tester:asserteq(3, #d) 53 | tester:asserteq(3, d:size()) 54 | 55 | c:resize(5) 56 | tester:asserteq(5, #c) 57 | c:fill(1) 58 | tester:asserteq(tostring(c), ' 1\n 1\n 1\n 1\n 1\n[torch.ClStorage of size 5]\n') 59 | end 60 | 61 | function cltorch.tests.storage.test_get() 62 | -- we probably should support this. 
specifically, without this, lbfgs doesnt work :-P 63 | -- a = torch.Storage(10000) 64 | acl = torch.ClStorage(10000) 65 | tester:asserteq('torch.ClStorage', torch.type(acl)) 66 | acl[2] = 72 67 | acl[500] = 104 68 | acl[7500] = 1040 69 | -- acl:copy(a) 70 | tester:asserteq(72, acl[2]) 71 | tester:asserteq(104, acl[500]) 72 | tester:asserteq(1040, acl[7500]) 73 | end 74 | 75 | local function setUp() 76 | -- cltorch.setDevice(1) 77 | print('') 78 | end 79 | 80 | local test = torch.TestSuite() 81 | for k,v in pairs(cltorch.tests.storage) do 82 | test[k] = function() 83 | setUp() 84 | v() 85 | end 86 | end 87 | 88 | function cltorch.tests.storage.test() 89 | tester = torch.Tester() 90 | tester:add(test) 91 | tester:run(tests) 92 | print('#tester.errors', #tester.errors) 93 | return #tester.errors == 0 94 | end 95 | 96 | if runtests then 97 | return cltorch.tests.storage.test() 98 | end 99 | 100 | -------------------------------------------------------------------------------- /src/torch/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | THLongStorage* cltorch_checklongargs(lua_State *L, int index) 4 | { 5 | THLongStorage *storage; 6 | int i; 7 | int narg = lua_gettop(L)-index+1; 8 | 9 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) 10 | { 11 | THLongStorage *storagesrc = luaT_toudata(L, index, "torch.LongStorage"); 12 | storage = THLongStorage_newWithSize(storagesrc->size); 13 | THLongStorage_copy(storage, storagesrc); 14 | } 15 | else 16 | { 17 | storage = THLongStorage_newWithSize(narg); 18 | for(i = index; i < index+narg; i++) 19 | { 20 | if(!lua_isnumber(L, i)) 21 | { 22 | THLongStorage_free(storage); 23 | luaL_argerror(L, i, "number expected"); 24 | } 25 | THLongStorage_set(storage, i-index, lua_tonumber(L, i)); 26 | } 27 | } 28 | return storage; 29 | } 30 | 31 | int cltorch_islongargs(lua_State *L, int index) 32 | { 33 | int narg = lua_gettop(L)-index+1; 34 | 35 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) 36 | { 37 | return 1; 38 | } 39 | else 40 | { 41 | int i; 42 | 43 | for(i = index; i < index+narg; i++) 44 | { 45 | if(!lua_isnumber(L, i)) 46 | return 0; 47 | } 48 | return 1; 49 | } 50 | return 0; 51 | } 52 | 53 | struct THClState* cltorch_getstate(lua_State* L) 54 | { 55 | lua_getglobal(L, "cltorch"); 56 | lua_getfield(L, -1, "_state"); 57 | struct THClState *state = lua_touserdata(L, -1); 58 | lua_pop(L, 2); 59 | return state; 60 | } 61 | -------------------------------------------------------------------------------- /src/torch/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef CLTORCH_UTILS_INC 2 | #define CLTORCH_UTILS_INC 3 | 4 | #include "luaT.h" 5 | #include "TH.h" 6 | 7 | #ifdef __cplusplus 8 | # define TORCH_EXTERNC extern "C" 9 | #else 10 | # define TORCH_EXTERNC extern 11 | #endif 12 | 13 | #ifdef WIN32 14 | # ifdef torch_EXPORTS 15 | # define TORCH_API TORCH_EXTERNC __declspec(dllexport) 16 | # else 17 | # define TORCH_API TORCH_EXTERNC __declspec(dllimport) 18 | # endif 19 | #else 20 | # define TORCH_API TORCH_EXTERNC 21 | #endif 22 | 23 | #if LUA_VERSION_NUM == 501 24 | /* 25 | ** Adapted from Lua 5.2.0 26 | */ 27 | static void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { 28 | luaL_checkstack(L, nup+1, "too many upvalues"); 29 | for (; l->name != NULL; l++) { /* fill the table with given functions */ 30 | int i; 31 | lua_pushstring(L, l->name); 32 | for (i = 0; i < nup; i++) /* copy upvalues to the top 
*/ 33 | lua_pushvalue(L, -(nup+1)); 34 | lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ 35 | lua_settable(L, -(nup + 3)); 36 | } 37 | lua_pop(L, nup); /* remove upvalues */ 38 | } 39 | #endif 40 | 41 | 42 | TORCH_API THLongStorage* cltorch_checklongargs(lua_State *L, int index); 43 | TORCH_API int cltorch_islongargs(lua_State *L, int index); 44 | 45 | struct THClState; 46 | TORCH_API struct THClState* cltorch_getstate(lua_State* L); 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/travis/install-torch.sh: -------------------------------------------------------------------------------- 1 | if false; then { 2 | git clone https://github.com/torch/distro.git ~/torch 3 | cd ~/torch 4 | 'for pkg in cudnn cunn cunnx cutorch qttorch trepl graph optim sdl2 threads submodule graphicsmagick audio fftw3 signal nnx qtlua gnuplot dok iTorch argcheck image xlua; do { sed -i -e "s/\(.*$pkg.*\)/echo skipping $pkg # \1/" install.sh; } done' 5 | 'awk ''NR==2{print "set -x"}1'' install.sh > ~install.sh' 6 | mv ~install.sh install.sh 7 | chmod +x install.sh 8 | cat install.sh 9 | for pkg in exe/luajit-rocks extra/nn pkg/cwrap pkg/paths pkg/sundown pkg/sys pkg/torch pkg/paths extra/lua-cjson extra/luaffifb extra/luafilesystem extra/penlight; do { git submodule update --quiet --init $pkg; } done 10 | sed -i -e 's/$(MAKE)/$(MAKE) -j 4/' pkg/torch/rocks/torch-scm-1.rockspec 11 | ./install.sh -b >/dev/null 12 | } else { 13 | mkdir -p ~/torch 14 | cd ~/torch 15 | wget https://s3.amazonaws.com/hughperkinstravis/hughperkins/distro/3/3.1/torch-install.tar.bz2 16 | tar -xf torch-install.tar.bz2 17 | } fi 18 | 19 | sed -i -e 's/^export LD_LIBRARY_PATH/# export LD_LIBRARY_PATH/' ~/torch/install/bin/torch-activate 20 | sed -i -e 's/^export DYLD_LIBRARY_PATH/# export LD_LIBRARY_PATH/' ~/torch/install/bin/torch-activate 21 | source ~/torch/install/bin/torch-activate 22 | luajit -l torch -e 'print(torch.Tensor(3,2):uniform())' 23 | 24 | -------------------------------------------------------------------------------- /src/util/port.py: -------------------------------------------------------------------------------- 1 | """ 2 | This does a first cut port from `../cutorch-goodies2` directory, 3 | into the `port` subdirectory. 4 | I've never actually used it for porting whole files yet, but 5 | it does make using `meld` against newer cutorch branches, such as 6 | `goodies2` much more possible. 
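Usage sketch (inferred from the paths below, not a documented interface):
run `python port.py` from a working directory that has a cutorch checkout
at ../cutorch (src_dir below); ported files then land in the port/
subdirectory, via the process_dir calls at the bottom of this file.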
7 | 8 | Possible future enhancements: 9 | - make it automatically move kernels and device functions into a '.cl' 10 | file (plausibly anything with host goes both into the .h/.cpp, and also 11 | into the .cl) 12 | """ 13 | 14 | from __future__ import print_function 15 | import sys 16 | import os 17 | from os.path import join as jp 18 | from os import path 19 | 20 | src_dir = '../cutorch' # directory to port from 21 | 22 | def process_block(block): 23 | if block.find(' operator()') >= 0: 24 | # its an Op struct, we are not writing these as kernels 25 | # but using Apply instead, and passing in the appropriate code 26 | # as strings into the kernel templates 27 | return (block, False) 28 | if block.find('__global__') >= 0 or block.find('__device__') >= 0: 29 | # kernel method, probably 30 | block = block.replace('gridDim.x', 'get_num_groups(0)') 31 | block = block.replace('gridDim.y', 'get_num_groups(1)') 32 | block = block.replace('blockDim.x', 'get_local_size(0)') 33 | block = block.replace('blockDim.y', 'get_local_size(1)') 34 | block = block.replace('blockIdx.x', 'get_group_id(0)') 35 | block = block.replace('blockIdx.y', 'get_group_id(1)') 36 | block = block.replace('threadIdx.x', 'get_local_id(0)') 37 | block = block.replace('threadIdx.y', 'get_local_id(1)') 38 | block = block.replace('__global__', 'kernel') 39 | block = block.replace('__shared__', 'local') 40 | block = block.replace('__syncthreads()', 'barrier(CLK_LOCAL_MEM_FENCE)') 41 | block = block.replace('warpSize', '{{WarpSize}}') 42 | block = block.replace('IndexType', '{{IndexType}}') 43 | block = block.replace('__device__', '/*__device__*/') 44 | block = block.replace('__forceinline__', '/*__forceline__*/') 45 | return (block, True) 46 | return (block, False) 47 | 48 | def process_dir(cutorch_dir, port_dir, rel_dir): 49 | cutorch_src = jp(cutorch_dir, rel_dir) 50 | cltorch_dst = jp(port_dir, rel_dir).replace('THC', 'THCl') 51 | if not path.isdir(cltorch_dst): 52 | os.makedirs(cltorch_dst) 53 | for filename in os.listdir(cltorch_dst): 54 | filepath = jp(cltorch_dst, filename) 55 | if path.isfile(filepath): 56 | os.remove(filepath) 57 | out_filenames = [] 58 | for filename in os.listdir(cutorch_src): 59 | original_filename = filename 60 | print('filename', filename) 61 | original_filepath = jp(cutorch_src, filename) 62 | if not path.isfile(original_filepath): 63 | continue 64 | f = open(jp(cutorch_src, filename), 'r') 65 | contents = f.read() 66 | f.close() 67 | base_name = filename.split('.')[0].replace('THC', 'THCl') 68 | suffix = '.' 
+ filename.split('.')[1] 69 | if suffix == '.cuh': 70 | suffix = '.h' 71 | if suffix == '.cu': 72 | suffix = '.cpp' 73 | if suffix == '.c': 74 | suffix = '.cpp' 75 | filename = '{base}{suffix}'.format( 76 | base=base_name, 77 | suffix=suffix) 78 | if filename in out_filenames: 79 | print('warning: filename conflict: {filename}'.format( 80 | filename=filename)) 81 | contents = contents.replace('CUDA', 'CL') 82 | contents = contents.replace('Cuda', 'Cl') 83 | contents = contents.replace('#include "THC', '#include "THCl') 84 | contents = contents.replace('THC_', 'THCL_') 85 | contents = contents.replace('THCState', 'THClState') 86 | contents = contents.replace('CUTORCH', 'CLTORCH') 87 | contents = contents.replace('THCBlasState', 'THClBlasState') 88 | contents = contents.replace('cublasOperation_t', 'clblasTranspose') 89 | contents = contents.replace('cublas', 'clblas') 90 | contents = contents.replace('cutorch', 'cltorch') 91 | 92 | # line by line: 93 | new_contents = '' 94 | new_cl = '' 95 | scope_dead = False 96 | depth = 0 97 | block = '' 98 | for line in contents.split('\n'): 99 | if line.startswith('#include <thrust') or \ 100 | line.find('thrust::') >= 0: 101 | # (reconstructed: the original condition was lost in extraction; matching thrust usage is an assumption) 102 | line = '// ' + line 103 | scope_dead = True 104 | if line.find('{') >= 0: 105 | depth += 1 106 | if line.find('#include <cu') >= 0:  # (reconstructed; matching CUDA system headers is an assumption) 107 | line = '' 108 | if line.strip() == 'THClCheck(cudaGetLastError());': 109 | line = '' 110 | if scope_dead and line.find('return') >= 0: 111 | line = (' THError("Not implemented");\n' + 112 | ' return 0;\n // ' + 113 | line) 114 | scope_dead = False 115 | if line.find('}') >= 0: 116 | if scope_dead: 117 | line = (' THError("Not implemented");\n' + 118 | line) 119 | scope_dead = False 120 | depth -= 1 121 | block += line + '\n' 122 | if line.strip() == '' and depth == 0: 123 | block, is_cl = process_block(block) 124 | if is_cl: 125 | new_cl += block 126 | else: 127 | new_contents += block 128 | block = '' 129 | block, is_cl = process_block(block) 130 | if is_cl: 131 | new_cl += block 132 | else: 133 | new_contents += block 134 | block = '' 135 | if new_contents.strip() != "": 136 | f = open(jp(cltorch_dst, filename), 'a') 137 | f.write('// from lib/THC/{filename}:\n\n'.format( 138 | filename=original_filename)) 139 | f.write(new_contents) 140 | f.close() 141 | out_filenames.append(filename) 142 | if new_cl.strip() != '': 143 | clfilename = original_filename.replace('.cuh', '.cl') 144 | clfilename = clfilename.replace('.cu', '.cl') 145 | clfilename = clfilename.replace('THC', 'THCl') 146 | clfilepath = jp(cltorch_dst, clfilename) 147 | f = open(clfilepath, 'a') 148 | f.write('// from {rel_dir}/{filename}:\n\n'.format( 149 | rel_dir=rel_dir, 150 | filename=original_filename)) 151 | f.write(new_cl) 152 | f.close() 153 | 154 | process_dir(src_dir, 'port', 'lib/THC') 155 | process_dir(src_dir, 'port', 'torch') 156 | process_dir(src_dir, 'port', 'torch/generic') 157 | # cutorch_dir = '../cutorch-goodies2' 158 | 159 | # cutorch_src = '{cutorch_dir}/lib/THC'.format( 160 | # cutorch_dir=cutorch_dir) 161 | 162 | # port_dir = 'port' 163 | 164 | --------------------------------------------------------------------------------
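As a worked illustration of the mapping process_block applies (the input fragment is hypothetical; the substitutions are the ones listed in the script above), a CUDA kernel such as

  __global__ void add_one(float *data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float buf[32];
    data[i] += 1.0f;
    __syncthreads();
  }

would come out of the port as

  kernel void add_one(float *data) {
    int i = get_group_id(0) * get_local_size(0) + get_local_id(0);
    local float buf[32];
    data[i] += 1.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
  }

and, since the block contains __global__, process_block flags it is_cl, so it is routed into the generated .cl file rather than the .cpp.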