├── .gitignore ├── .gitmodules ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── doc ├── ContributorGuidelines.md ├── ImplementedDetails.md ├── MigrateStatusByFile.md ├── OlderChanges.md ├── img │ ├── gpupipelinemultiple.png │ ├── reduceall_pipelinestall.png │ └── singlegpuoperation.png └── torch │ ├── cuda-to-opencl.md │ ├── cutorch-apply.md │ └── torch.md ├── rocks └── cltorch-scm-1.rockspec └── src ├── CMakeLists.txt ├── FFI.lua ├── Random.lua ├── Storage.cpp ├── Tensor.cpp ├── Tensor.lua ├── TensorMath.lua ├── TensorOperator.c ├── Test.lua ├── UserKernel.cpp ├── UserKernel.h ├── cmake ├── build_EasyCL.cmake └── build_clBLAS.cmake ├── init.cpp ├── init.lua ├── lib ├── CMakeLists.txt ├── THCl.h ├── THClApply.cl ├── THClApply.cpp ├── THClApply.h ├── THClBlas.cpp ├── THClBlas.h ├── THClDeviceUtils.cl ├── THClDeviceUtils.cpp ├── THClDeviceUtils.h ├── THClGather.cl ├── THClGather.cpp ├── THClGeneral.cpp ├── THClGeneral.h ├── THClKernels.cpp ├── THClKernels.h ├── THClOperators.h ├── THClReduce.cl ├── THClReduce.cpp ├── THClReduce.h ├── THClReduceAll.cl ├── THClReduceAll.cpp ├── THClReduceAll.h ├── THClReduceApplyUtils.cl ├── THClReduceApplyUtils.cpp ├── THClReduceApplyUtils.h ├── THClScatter.cl ├── THClScatter.cpp ├── THClStorage.cpp ├── THClStorage.h ├── THClStorageCopy.cpp ├── THClStorageCopy.h ├── THClStorageGet.cl ├── THClStorageSet.cl ├── THClTensor.cpp ├── THClTensor.h ├── THClTensorCopy.cpp ├── THClTensorCopy.h ├── THClTensorIndex.cl ├── THClTensorIndex.cpp ├── THClTensorInfoCl.cl ├── THClTensorMasked.cl ├── THClTensorMasked.cpp ├── THClTensorMath.cpp ├── THClTensorMath.h ├── THClTensorMath2.cl ├── THClTensorMath2.cpp ├── THClTensorMathBlas.cpp ├── THClTensorMathCompare.cpp ├── THClTensorMathCompare.h ├── THClTensorMathCompareT.cpp ├── THClTensorMathPairwise.cpp ├── THClTensorMathPointwise.cpp ├── THClTensorMathPointwise.h ├── THClTensorMathScan.cl ├── THClTensorMathScan.cpp ├── THClTensorMathTransformReduce.cl ├── THClTensorMathTransformReduce.cpp ├── THClTypeParseTraits.cpp └── THClTypeParseTraits.h ├── test ├── run-test-device.sh ├── run-test-perf.sh ├── run-test-tensor.sh ├── test-device.lua ├── test-perf.lua ├── test-tensor.lua ├── test-zsh.zsh ├── test_userkernel.lua ├── unit_storage.lua └── unit_tensor.lua ├── torch ├── generic │ ├── Storage.cpp │ └── Tensor.cpp ├── utils.c └── utils.h ├── travis └── install-torch.sh └── util └── port.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | *_generated.* 7 | 8 | port/ 9 | /lua 10 | /luaT 11 | /TH 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Compiled Dynamic libraries 18 | *.so 19 | *.dylib 20 | *.dll 21 | 22 | # Fortran module files 23 | *.mod 24 | 25 | # Compiled Static libraries 26 | *.lai 27 | *.la 28 | *.a 29 | *.lib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | 36 | build/ 37 | share/ 38 | /share 39 | /back 40 | *~ 41 | 42 | generated/ 43 | non-templated/ 44 | templated-auto/ 45 | templates.manual/ 46 | 47 | build-*/ 48 | 49 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/EasyCL"] 2 | path = src/EasyCL 3 | url = https://github.com/hughperkins/EasyCL.git 4 | [submodule "src/clMathLibraries/clBLAS"] 5 | path = src/clMathLibraries/clBLAS 6 | url = https://github.com/hughperkins/clBLAS 7 | [submodule 
"src/boost-headers-lite"] 8 | path = src/boost-headers-lite 9 | url = https://github.com/hughperkins/boost-headers-lite 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: objective-c 2 | matrix: 3 | include: 4 | - env: OSX=10.11 5 | os: osx 6 | osx_image: osx10.11 7 | rvm: system 8 | 9 | before_install: 10 | - if [ -f ".git/shallow" ]; then travis_retry git fetch --unshallow; fi 11 | - whoami 12 | - pwd 13 | - bash src/travis/install-torch.sh 14 | - source ~/torch/install/bin/torch-activate 15 | - luajit -l torch -e 'print(torch.Tensor(3,2):uniform())' 16 | - mkdir ~/git 17 | - cd ~/git 18 | - cd ~/build/hughperkins/cltorch 19 | 20 | script: 21 | - cat ~/torch/install/bin/torch-activate 22 | - source ~/torch/install/bin/torch-activate 23 | - luarocks make rocks/cltorch-scm-1.rockspec 24 | - otool -L ~/torch/install/lib/lua/5.1/libcltorch.so 25 | - otool -l ~/torch/install/lib/lua/5.1/libcltorch.so | grep RPATH -A2 26 | - luajit -e 'require "cltorch"' 27 | - luajit -l cltorch -e "cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())" 28 | - 'luajit -e "require ''cltorch''; cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())"' 29 | - luajit -l cltorch -e "cltorch.setAllowNonGpus(1); props = cltorch.getDeviceProperties(1); for k,v in pairs(props) do print(k,v) end" 30 | - export TEST_EXCLUDES=test_blas,test_cumprod,test_cumsum,test_equals,test_indexcopy,test_indexfill,test_matrixwide,test_max2,test_mean,test_meanall,test_min2,test_norm,test_prod,test_prodall,test_sum_t,test_sumallt,test_reduceAll,test_sum,test_sum_t_offset,test_sumall 31 | - 'luajit -l cltorch -e "cltorch.setAllowNonGpus(1); cltorch.test()"' 32 | - zsh src/test/test-zsh.zsh 33 | - #zsh 34 | - #ps 35 | - #source ~/torch/install/bin/torch-activate 36 | - #luajit -e 'require "cltorch"' 37 | - #luajit -l cltorch -e "cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())" 38 | - #'luajit -e "require ''cltorch''; cltorch.setAllowNonGpus(1); print(cltorch.getDeviceCount())"' 39 | - #luajit -l cltorch -e "cltorch.setAllowNonGpus(1); props = cltorch.getDeviceProperties(1); for k,v in pairs(props) do print(k,v) end" 40 | - #export TEST_EXCLUDES=test_blas,test_cumprod,test_cumsum,test_equals,test_indexcopy,test_indexfill,test_matrixwide,test_max2,test_mean,test_meanall,test_min2,test_norm,test_prod,test_prodall,test_sum_t,test_sumallt,test_reduceAll,test_sum,test_sum_t_offset,test_sumall 41 | 42 | notifications: 43 | email: 44 | on_success: never 45 | on_failure: never 46 | 47 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | OPTION(DEV_RUN_COG "cltorch maintainers only, otherwise set to 'OFF'." 
OFF)

if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux")
  set(ON_LINUX 1)
endif()
if("${CMAKE_SYSTEM_NAME}" STREQUAL "Windows")
  set(ON_WINDOWS 1)
endif()

# https://cmake.org/Wiki/CMake_RPATH_handling
SET(CMAKE_INSTALL_RPATH "${Torch_INSTALL_LUA_CPATH_SUBDIR}/../..")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
SET(CMAKE_MACOSX_RPATH TRUE)

#SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")

FIND_PACKAGE(Torch REQUIRED)

SET(CMAKE_C_FLAGS "-std=c99 -Werror=implicit-function-declaration")
SET(CMAKE_CXX_FLAGS "-std=c++0x -Wall")

SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/src/cmake")

if(UNIX)
  execute_process(COMMAND git --version RESULT_VARIABLE git_test ERROR_QUIET)
  if(${git_test} EQUAL 0)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git")
      message("using git setting USE_GIT 1")
      SET(USE_GIT 1)
    endif()
  endif()
endif()

if(UNIX)
  execute_process(COMMAND bash -c "${CMAKE_PREFIX_PATH}/bin/luarocks list distrocheck | grep distrocheck"
    RESULT_VARIABLE DISTROCHECK)
  if(${DISTROCHECK} EQUAL "1")
    message("CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}")
    execute_process(COMMAND bash -c "${CMAKE_PREFIX_PATH}/bin/luarocks list")
    message(FATAL_ERROR "
Please install cltorch from https://github.com/hughperkins/distro-cl")
  endif()
else() # windows?
  execute_process(COMMAND "luarocks list distrocheck | find /i \"distrocheck\""
    RESULT_VARIABLE DISTROCHECK)
  if(${DISTROCHECK} EQUAL "1")
    message("CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH}")
    execute_process(COMMAND "luarocks list")
    message(FATAL_ERROR "
Please install cltorch from https://github.com/hughperkins/distro-cl")
  endif()
endif()

if(USE_GIT)
  message("using git")
  execute_process(COMMAND bash -c "echo $PWD"
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
  execute_process(COMMAND git status
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}")
  execute_process(
    COMMAND git submodule update --init --force --recursive
    WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}"
  )
endif()

INCLUDE("${CMAKE_MODULE_PATH}/build_clBLAS.cmake")
INCLUDE_DIRECTORIES(${clBLAS_INCLUDE_DIRS})
#LINK_DIRECTORIES(${CLBLAS_LIBRARY_DIR})

INCLUDE("${CMAKE_MODULE_PATH}/build_EasyCL.cmake")
INCLUDE_DIRECTORIES(${EasyCL_INCLUDE_DIRS})
#LINK_DIRECTORIES(${EASYCL_LIBRARY_DIR})

ADD_SUBDIRECTORY(src)

INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src")
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src/lib")
INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/src/torch")

SET(src src/UserKernel.cpp src/init.cpp src/torch/utils.c src/Storage.cpp src/Tensor.cpp TensorMath.c
  src/TensorOperator.c)
SET(luasrc src/init.lua src/FFI.lua src/Tensor.lua src/Random.lua src/Test.lua src/test/unit_storage.lua src/test/unit_tensor.lua)

ADD_TORCH_WRAP(cltorchtensormathwrap src/TensorMath.lua)

ADD_TORCH_PACKAGE(cltorch "${src}" "${luasrc}")
#ADD_DEPENDENCIES(cltorch clBLAS)
#ADD_DEPENDENCIES(cltorch EasyCL)
ADD_DEPENDENCIES(cltorch EasyCL-external)
ADD_DEPENDENCIES(cltorchtensormathwrap EasyCL)
#add_custom_target(

execute_process(COMMAND git log -n 1 --pretty=%h OUTPUT_VARIABLE git_commit OUTPUT_STRIP_TRAILING_WHITESPACE)
#execute_process(COMMAND echo string
commit="${git_commit}" > ${CMAKE_CURRENT_SOURCE_DIR}/commit.h) 97 | file(GENERATE OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/cltorch_commit_generated.h 98 | CONTENT "const char *cltorch_commit=\"${git_commit}\";\n" 99 | ) 100 | 101 | if(DEV_RUN_COG) 102 | add_custom_target( 103 | cog_cltorch 104 | python ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/cogapp/cog.py -q -I ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/cog-batteries -r ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/*.h 105 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 106 | ) 107 | add_dependencies(cltorch cog_cltorch) 108 | endif(DEV_RUN_COG) 109 | 110 | 111 | if(USE_GIT) 112 | add_custom_target( 113 | git_cltorch_rec 114 | git submodule update --init --force --recursive 115 | # COMMAND sed -i -e "s/-pedantic//" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 116 | # COMMAND sed -i -e "s/-Wall -Wextra/-w/" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 117 | # COMMAND sed -i -e "s/-Wstrict-prototypes/-w/" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 118 | # COMMAND sed -i -e "s/${CMAKE_CXX_FLAGS}/${CMAKE_CXX_FLAGS} -w /" ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src/CMakeLists.txt 119 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 120 | ) 121 | add_dependencies(cltorch git_cltorch_rec) 122 | add_dependencies(clBLAS-external git_cltorch_rec) 123 | add_dependencies(EasyCL-external git_cltorch_rec) 124 | endif() 125 | 126 | TARGET_LINK_LIBRARIES(cltorch luaT THCl) 127 | TARGET_LINK_LIBRARIES(cltorch EasyCL) 128 | TARGET_LINK_LIBRARIES(cltorch clBLAS) 129 | 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Hugh Perkins (Hugh Perkins) 2 | Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) 3 | Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) 4 | Copyright (c) 2011-2013 NYU (Clement Farabet) 5 | Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) 6 | Copyright (c) 2006 Idiap Research Institute (Samy Bengio) 7 | Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) 8 | 9 | All rights reserved. 10 | 11 | Redistribution and use in source and binary forms, with or without 12 | modification, are permitted provided that the following conditions are met: 13 | 14 | 1. Redistributions of source code must retain the above copyright 15 | notice, this list of conditions and the following disclaimer. 16 | 17 | 2. Redistributions in binary form must reproduce the above copyright 18 | notice, this list of conditions and the following disclaimer in the 19 | documentation and/or other materials provided with the distribution. 20 | 21 | 3. Neither the names of NEC Laboratories American and IDIAP Research 22 | Institute nor the names of its contributors may be used to endorse or 23 | promote products derived from this software without specific prior 24 | written permission. 25 | 26 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 27 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/doc/ContributorGuidelines.md:
--------------------------------------------------------------------------------
# Contributor Guidelines

This doc describes some of the things I use as guidelines when writing cltorch.

Cutorch is an awesome, excellent high-performance implementation. Whenever a piece of functionality is available in cutorch, porting the cutorch implementation to cltorch tends to have several advantages:
* can integrate future improvements from cutorch
* development will tend to be much faster: probably at least 4-10 times faster to port something existing than to write it from scratch
* correctness will likely be high, without too many logical bugs
* performance will likely be reasonable

Some things are not directly portable. Notable examples are:
* Thrust
* CUDA templated kernels

Thrust is a CUDA-specific library. There is something similar for OpenCL, which is VexCL. For now, I prefer not to use VexCL, because it uses Boost, which, rightly or wrongly, I somehow feel is a bit of a sledgehammer to crack a nut, and really hard to build on Windows. My last experience with Boost was probably 11 years ago, so it might have changed :-P

So, generally, for Thrust, there are a few options I've used up till now:
* for the reduceall method, it turned out there was a Thrust-free implementation in the `goodies` branch of cutorch, so I ported that across, and it seems to work great :-)
* for the MSECriterion implementation in `clnn`, I simply wrote the operations on the Lua side, using the cltorch `pow` and similar implementations (see the sketch just below this list). I'm pretty sure performance will be ok
* Thrust is used all over the place, so some creativity will be required. Please get in touch if you come across a new situation, so we can discuss it together. I mean, you don't have to, but it could be a good idea :-)
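To make the `clnn` bullet concrete, here is a minimal sketch of the idea. This is an illustration, not the actual clnn code, so the function name and details are mine:

```
-- MSE forward computed purely from stock cltorch tensor ops, so no custom
-- kernel (and hence no Thrust port) is needed; each op runs as its own GPU kernel
local function mseForward(input, target)
   local diff = input - target                    -- elementwise subtract, on the GPU
   return diff:pow(2):sum() / input:nElement()    -- square in place, then reduce
end

local input = torch.Tensor(4, 3):uniform():cl()   -- FloatTensor -> ClTensor
local target = torch.Tensor(4, 3):uniform():cl()
print(mseForward(input, target))
```

The price is a few extra kernel launches and temporaries compared to a fused kernel, which for criterion-sized tensors is usually acceptable.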
CUDA C++ templates don't exist in OpenCL, at least not in the OpenCL 1.1 implementation I'm targeting. Interestingly, OpenCL kernels are compiled at runtime, which some might see as a disadvantage, but it actually gives us a lot of flexibility. And you can see that cltorch loads really quickly, whereas cutorch spends a while caching every possible kernel when loaded. This has good and bad points either way, really.

So... for the kernel templates, I quite like the compile-at-runtime approach. I'm using a Lua engine to instantiate the kernel templates for cltorch at runtime, at the point where they are first needed. They are reused across multiple calls. NVIDIA caches these compiled kernels to disk, where they can be reused almost immediately, even after restarting the process.

# Porting utilities

There is a Python script in `src/util` called 'port.py'. It can help do a first-cut port of files, or facilitate a meld of existing files. Run it as follows:
* in the parent directory of 'cltorch', clone the 'cutorch' repository
* currently, it expects the cutorch 'goodies2' branch to be cloned into a `cutorch-goodies2` directory, but obviously you can hack port.py a bit to change the exact directory
* from the cltorch directory, run `python src/util/port.py`
* A first-cut port of the files in ../cutorch-goodies2 will pop out in the `port` directory
  * .cuh files will become .h files
  * .cu files will become .cpp files
  * any kernels and kernel functions will plausibly be split into .cl files (with the same basename as the original .cu or .cuh file)

# Adding original functionality, not present in cutorch

For now, I haven't really come across this situation :-P The only brush I had with this was considering adding ClByteTensors, but for now, I've shelved the idea of implementing those initially in cltorch.

I think that, on the whole, for now, cutorch is the 'reference' implementation, and will probably remain so for a while. There is a whole team of incredibly talented, hard-working, motivated individuals maintaining and improving cutorch. For the foreseeable future, I think cltorch will be following cutorch, though you never know :-)

Therefore, on the whole, if I don't have to implement the original functionality myself, my recommendation would be: first add it to the cutorch side, then port it across to cltorch.

On the other hand, in fairness, if it was me, I'd probably do it on the cltorch side, and plausibly in a way totally unlikely to encourage a back-port into cutorch :-P So, anyway, if you want to implement something original in cltorch, perhaps you can discuss it with me, and on the torch7 newsgroup?

# Operator classes

On the subject of original functionality, or at least original implementations: in cutorch, operators, i.e. AddOp etc, are structs, which are injected directly into the nvcc compiler. In OpenCL, we don't have C++ in the kernels; it should be C99. Well, that doesn't mean we couldn't use structs, but we can't just take a struct from our .cpp/.h file and inject it into a kernel; we need to provide it as text. Thinking this through as I write, there's no particular reason why we can't provide structs to the OpenCL kernels.

Anyway... rightly or wrongly :-P what I've done for now is to change the operator structs into C++ classes, which derive from HasOperator1, HasOperator2, HasOperator3 and/or HasScalars. These are interfaces. HasOperator2 has a function called 'operator2()', which returns a string. The string will be injected into our OpenCL kernel templates. A sketch of the pattern follows below.

I think it works quite nicely, and it's easy to convert the structs into classes, and vice versa, though it is admittedly a slight deviation from the cutorch design.
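For illustration, here is a minimal sketch of that pattern. This is my own simplified rendering, not verbatim cltorch code, so treat the exact interface and signature as assumptions:

```
#include <string>

// Hedged sketch: an 'operator' carries its body as OpenCL source text,
// instead of as a __device__ operator() like the cutorch structs.
class HasOperator2 {
public:
  virtual ~HasOperator2() {}
  // returns a C99 snippet, not a computation; the string gets pasted
  // into the apply kernel template when the kernel is built at runtime
  virtual std::string operator2() const = 0;
};

class TensorAddOp : public HasOperator2 {
public:
  std::string operator2() const {
    return "*out += *in1";
  }
};
```

The apply machinery can then substitute that string into the .cl template, compile the result with the OpenCL driver, and cache the built kernel for subsequent calls.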
# cogapp

Oh yes, by the way, I'm using [cogapp](https://bitbucket.org/ned/cog) to help do some of the templating. It needs a Python environment. By default, it doesn't run, but if you want to modify any of the .cl files, you'll need to rerun stringify. To get this to work:
* make sure you have python available
* cd into the `build` directory, and do `ccmake ..`
* set option `DEV_RUN_COG` to `ON`
* and do `configure` then `generate`
* => from now on, cogapp will run automatically when you build, and reimport the .cl files into the corresponding .cpp files

--------------------------------------------------------------------------------
/doc/MigrateStatusByFile.md:
--------------------------------------------------------------------------------
# Migration status by file

Porting status by file, compared with the original cutorch files. Note that `.cpp` here could have been ported from `.c`, `.cpp`, or `.cu`.

| File | Migration status |
|---|---|
| THClTensorMathCompare.cpp | Done |
| THClTensorMathCompareT.cpp | Done |
| THClTensorMathPairwise.cpp | Done |
| THClTensor.h | Done |
| THClTensorCopy.h | Done |
| THClTensorMath.h | Done |
| THClTensor.cpp | 90% |
| THClTensorCopy.cpp | 50% |
| THClTensorMath.cpp | 50% |
| THClTensorIndex.cpp | 0% |
| THClTensorMath2.cpp | 20% |
| THClTensorMathBlas.cpp | 30% |
| THClBlas.cpp | 50% |
| THClReduce.* | 90% |
| THClReduceAll.* | 70% |
| THClGeneral.* | 30% |
| THClTensorMathTransformReduce.* | 0% |

--------------------------------------------------------------------------------
/doc/OlderChanges.md:
--------------------------------------------------------------------------------
# Older changes

This page contains older changes that have been moved from the [Recent Changes](https://github.com/hughperkins/cltorch#recent-changes) section on the main page.

For the most recent changes, please see [Recent Changes](https://github.com/hughperkins/cltorch#recent-changes)

* 23rd July:
  * Fixed memory leak on Intel HD Graphics
* 22nd July:
  * Performance improvement:
    * All per-element operations are around 2-5 times faster on NVIDIA and AMD now
    * Specifically, this means that times for Karpathy's [char-rnn](http://github.com/karpathy/char-rnn) are around 2-3 times faster on NVIDIA and AMD cards, compared to before
  * [colesbury](https://github.com/colesbury)'s pull request [#176](https://github.com/torch/cutorch/pull/176) ported to cltorch, 'Allow CudaTensors as indices'
  * [andresy](https://github.com/andresy)'s pull request [#203](https://github.com/torch/cutorch/pull/203) ported to cltorch, 'expose retain and free for CudaStorage/CudaTensor'
* 19th July:
  * Upgrade EasyCL version
  * Need to explicitly enable timing now (just in case it impacts performance)
  * DumpTimings now shows the count of the number of calls, as well as timings
* 18th July:
  * Added custom user kernels
* 16th July:
  * Did some cleaning:
    * source code now all in the `src` directory, to keep the front page on github clean
    * moved a bunch of stuff from this page to other pages, i.e. older changes, and the list of what works
  * 20x speed boost for the Apply kernel, and char-rnn, on an Intel HD5500 GPU
* 15th July:
  * can pass a point ClTensor now also to `:lt()`, `:gt()`, `:le()`, `:ge()`, `:eq()`, `:ne()`
  * added profiling:
    * `cltorch.setProfiling(1)` to enable (has a performance hit obviously, whilst enabled)
    * `cltorch.dumpProfiling()` to dump timings since the last dump
    * timings are cumulative over each kernel filename/kernelname combination
* 14th July:
  * created point tensors:
    * `:sum()` can return a point tensor, which stays on the GPU, eliminating a gpu pipeline stall; see the presentation above
    * `add()`, `csub()`, `mul` and `div` can all accept a point tensor in place of their scalar argument
    * `:prod()` can return a point tensor too now, as can `:max()`, `:min()`, `:all()`, and `:any()`
    * can pass a point ClTensor also to `:fill()` now
* 13th July:
  * possible to use tensors without calling `:setDevice()` to switch to their device first. Tested with `:sum()`, `:sum(1)`, and `:sum(2)` for now
* 12th July:
  * add `cltorch.about()`, to provide build information
* 10th July:
  * added cmin, cmax, for tensors and scalars (as per https://github.com/torch/cutorch/pull/198/files )
* 5th July:
  * fixed some Mac build/load issues, so it builds/loads on Mac now (thank you to mlajtos, szagouyko, centime, luo123n, and pdhvip for their enormous help with fixing this :-) )
  * getDeviceProperties and so on now only show GPU and APU devices, and ignore pure CPU devices (pure CPU devices are not supported by cltorch at this time)
  * added `cltorch.test()`, which runs unit tests
* 4th July:
  * `torch.save` and `torch.load` implemented
* 27th June:
  * fixed more bugs involving Tensor copy. Hopefully should be fixed permanently now :-P
  * added `cltorch.dumpTimings()`, which will dump cumulative timings for various parts of the engine. It's mostly for usage by maintainers / optimizers.
  * massive optimization for anything involving apply, reduce, reduceall, index etc => this makes the lstm script at [karpathy/char-rnn](https://github.com/karpathy/char-rnn) run significantly faster when using OpenCL now :-)
* 26th June:
  * add addcmul, and unit test
  * add addcdiv, and unit test
  * added `apply2` and `apply3` as synonyms for `map` and `map2`
  * can use `x`, `y`, `z` instead of `*out`, `*in1` and `*in2`, in `apply`, `map`, etc
  * fix a buffer copy bug (note: implies updating EasyCL, and rebuilding EasyCL, see notes on updating above)
* 25th June:
  * added bernoulli (generates on host-side for now, but I guess this is fast enough for many things?)
* 24th June:
  * added tests for `gather`, and removed some spam
  * added `scatter` (for both tensor and float source)
* 23rd June:
  * Fixed a bug where operations such as apply and map on tensors with non-zero offset didn't work correctly (ie, `fill` etc after `narrow` or similar)
  * Added `gather`
* 22nd June:
  * Under the hood:
    * Moved marking a buffer dirty, ie modified on the GPU, from [THClTensorMathBlas.cpp](https://github.com/hughperkins/cltorch/blob/9133fb4f0a23a86c48dcb5dc9cc7d44f44850a3f/lib/THCl/THClTensorMathBlas.cpp#L202) to [THClBlas.cpp](https://github.com/hughperkins/cltorch/blob/9133fb4f0a23a86c48dcb5dc9cc7d44f44850a3f/lib/THCl/THClBlas.cpp#L424)
    * This fixes a bug in [clnn](https://github.com/hughperkins/clnn), where the results of a convolutional layer were not being written back to the output tensor
  * tests pass now on an AMD gpu (actually I managed to scrounge access to a W9100 :-D )
* 21st June:
  * Under the hood:
    * Upgraded the new THClKernels class to handle `THClTensorInfo`
    * migrated Reduce, ReduceAll, etc to use THClKernels
    * upgraded EasyCL to handle `uint`, `long`, `ulong`
  * added `cltorch.finish()` and `cltorch.synchronize()`; both do the same thing, which is a `clFinish()`, on the current device
  * made it possible to require both cutorch and cltorch, as long as one requires cutorch followed by cltorch, in that order
* 20th June:
  * renamed the new `sub` method to `csub` so it doesn't collide with the existing `sub`
  * added `cltorch.setTrace(1|0)`, which prints out every allocate or copy of gpu buffers (named 'wrapper's)
  * removed the `set` and `get` methods, because they cause repeated gpu buffer copies (actually, get is not too bad, but does copy the whole buffer; set copies the whole buffer, repeatedly :-P )
  * modified `ClStorage.__string__` to first copy the whole storage to a FloatStorage, once, then convert this to a string, rather than using the now non-existent `get`
  * `torch.ClTensor{3,5,2}` will now first create this as a `FloatTensor` then call `copy` on this, to convert the whole Tensor/Storage to a `ClTensor` (avoids repeated `set` calls)
  * added `normall`, ie can do `torch.norm(c)`, `torch.norm(c, exponent)`
  * added `prod`, `prod(1)`, `prod(2)`
  * `max(1)` and `min(1)` now return the indices too, as well as the max. Ditto for dimension 2.
  * added `:all()` and `:any()`
  * added `:indexFill()`
  * added `:indexCopy()`
  * added `:indexSelect()`
  * added `torch.cumsum(x,2)` and `torch.cumsum(x,1)`
  * added `torch.cumprod(x,2)` and `torch.cumprod(x,1)`
  * Under the hood:
    * created the new THClKernels class:
      * handles THClTensor kernel input
      * provides a `run` method that takes a dim3 `grid` and `block` input, as for cutorch kernel launches
    * migrated TensorIndexed to use THClKernels
* 19th June:
  * fixed a compile bug in EasyCL, when lua5.2/5.3 header files are present (not tested yet)
  * added an `a:sub(b)` method, which does element-wise subtraction of b from a, and puts the results in a
  * migrated to a new version of EasyCL, with one fewer waitforevents, to try to boost perf a bit
  * added `apply`, `map`, `map2` :-) (which run on the GPU, at full speed)
  * added 2-pass reduceall, ie can do reduceall on much larger tensors now
* 18th June:
  * fixed a bug in clBLAS sger that meant that sger crashed on even tiny 5x5 matrices on nvidia, using either rowmajor or columnmajor :-) https://github.com/clMathLibraries/clBLAS/pull/109
    * note that you will need to `git submodule update`, and `rm -Rf build/clBLAS`, in order to pick up the new version of clBLAS
  * moved clBLAS initialization code out of inner loops => huge speed boost
  * added the `:neg()` operator, which negates the tensor (like `-` but without reallocation, I think)
* 15th-17th June:
  * pow(x,y) no longer returns undefined values for x containing, or being, negative
  * pow(x,y) now uses `pown` when y is an exact integer scalar (ie where (float)((int)y) == y)
  * when no OpenCL-enabled devices are enabled, we now raise a THError, with a clear error message, rather than throwing a C++ exception, with no error message output
  * under the hood: added cltorch.getState()
  * renamed libTHCL.so to libTHCl.so
  * added the THCl include files to the `install` section
  * masked fill works now
  * torch.addr works now
* 15th June:
  * C:t() working
* 14th June:
  * ReduceAll working :-) For now this means: sometensor:sum() works
  * sometensor:sum(1) and sometensor:sum(2) working too now :-)
  * A:min(), A:max() added
  * created unit tests, in the [test](test) directory, [cltorch-unit-tensor.lua](test/cltorch-unit-tensor.lua), which pass
* 13th June:
  * added `cltorch.setDevice`/`cltorch.getDevice`, see [test-device.lua](test/test-device.lua) for an example
  * added the EasyCL includes to the EasyCL install section, to remove build errors with "EasyCL.h" not found, etc

--------------------------------------------------------------------------------
/doc/img/gpupipelinemultiple.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hughperkins/cltorch/3b47f53f36e9c752620672213203fec22d86bc20/doc/img/gpupipelinemultiple.png
--------------------------------------------------------------------------------
/doc/img/reduceall_pipelinestall.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hughperkins/cltorch/3b47f53f36e9c752620672213203fec22d86bc20/doc/img/reduceall_pipelinestall.png
--------------------------------------------------------------------------------
/doc/img/singlegpuoperation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hughperkins/cltorch/3b47f53f36e9c752620672213203fec22d86bc20/doc/img/singlegpuoperation.png
--------------------------------------------------------------------------------
/doc/torch/cuda-to-opencl.md:
--------------------------------------------------------------------------------
# CUDA to OpenCL

Useful CUDA intro/info:
- http://www.nvidia.com/docs/IO/116711/sc11-cuda-c-basics.pdf
- http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#kernels
- http://developer.amd.com/tools-and-sdks/opencl-zone/opencl-resources/programming-in-opencl/porting-cuda-applications-to-opencl/

notes:
- `__global__` marks a kernel, equivalent to OpenCL `kernel`
- `mykernel<<<num_workgroups, workgroup_size>>>(param1, param2, ...)`, with the triple angle brackets, is a *kernel launch* (equivalent to OpenCL `run(dims, num_workgroups * workgroup_size, workgroup_size)` (ish...))
- `num_workgroups` and `workgroup_size` can be integers, or `dim3`
- where there are 4 launch parameters, the fourth is the stream, i.e. `<<<num_workgroups, workgroup_size, 0, stream>>>`
- `__shared__` means local memory, i.e. `__local` in OpenCL
- `__syncthreads()` is like `barrier(CLK_LOCAL_MEM_FENCE)` in OpenCL
- `cudaDeviceSynchronize()` is like `clFinish()`
- `__device__` means a function that can be called from a kernel
- `__host__` means a function that can be called from the host, i.e. from the C/C++ main program
- possible to add both `__device__` and `__host__`, just to be really confusing :-P

## Indexing

| CUDA | OpenCL |
|---|---|
| gridDim | get_num_groups() |
| blockDim | get_local_size() |
| blockIdx | get_group_id() |
| threadIdx | get_local_id() |
| | get_global_id() |
| | get_global_size() |
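To make the table concrete, here is the same trivial elementwise add written both ways. This pair is my own illustration (not from the cltorch sources), using the mappings above:

```
// CUDA: one thread per element
__global__ void add(float *out, const float *a, const float *b, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = a[i] + b[i];
}
// launch: add<<<(n + 255) / 256, 256>>>(out, a, b, n);
```

```
// OpenCL equivalent: get_global_id(0) replaces the blockIdx/blockDim/threadIdx arithmetic
kernel void add(global float *out, global const float *a, global const float *b, int n) {
  int i = get_global_id(0);
  if (i < n) out[i] = a[i] + b[i];
}
// enqueued with a global size rounded up to a multiple of the workgroup size
```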
--------------------------------------------------------------------------------
/doc/torch/cutorch-apply.md:
--------------------------------------------------------------------------------
# cutorch-apply

## THCApply

```
typedef struct THCudaTensor
{
  long *size;
  long *stride;
  int nDimension;

  THCudaStorage *storage;
  long storageOffset;
  int refcount;

  char flag;

} THCudaTensor;
```

From THCReduceApplyUtils.h:
```
#define MAX_CUTORCH_DIMS 25
```

From THCReduceApplyUtils.h:
```
// CUDA kernel argument that defines tensor layout
template <typename IndexType>
struct TensorInfo {
  // Extracts size/stride information for the kernel.
  // Successive dimensions can be collapsed if the size/strides match
  // up and thus there are no holes between the dimensions. This is used
  // to reduce the complexity of the problem.
  // The optional `reduceDim` indicates a reduction dimension for the
  // given tensor, so that the output size for this dimension will be 1.
  TensorInfo(THCState* state, THCudaTensor* t, int reduceDim = -1);

  // Contiguous tensors of more than one dimension are collapsed down
  // to one tensor
  // note: since both __host__ and __device__, this is available from both main
  // c++ code, and from kernels
  __host__ __device__ inline bool isContiguous() const {
    return (dims == 1 && strides[0] == 1);
  }

  float* data;
  IndexType sizes[MAX_CUTORCH_DIMS];
  IndexType strides[MAX_CUTORCH_DIMS];
  int dims;
};
```

```
// Translate a linear index for the apply to a float* offset;
// specialized on `Dims` to reduce nvcc compilation time
template <typename IndexType, int Dims>
struct IndexToOffset {
  static __host__ __device__ IndexType get(
    IndexType linearId,
    const TensorInfo<IndexType>& info) {
    IndexType offset = 0;

    // Use static dims
    for (int i = Dims - 1; i >= 0; --i) {
      IndexType curDimIndex = linearId % info.sizes[i];
      IndexType curDimOffset = curDimIndex * info.strides[i];
      offset += curDimOffset;

      if (i > 0) {
        linearId /= info.sizes[i];
      }
    }

    return offset;
  }
};
```

```
// This is the kernel entry point, since it is marked with `__global__`
template <typename Op, typename IndexType, int ADims, int BDims, int CDims>
__global__ void
THCudaTensor_pointwiseApply3(TensorInfo<IndexType> a,
                             TensorInfo<IndexType> b,
                             TensorInfo<IndexType> c,
                             IndexType totalElements,
                             Op op)
```

```
// This is a normal C++ host-side method, not a kernel or anything
// It happens to launch the kernel though, ie launches
// THCudaTensor_pointwiseApply3, above
template <typename Op>
bool THCudaTensor_pointwiseApply3(THCState* state,
                                  THCudaTensor* a,
                                  THCudaTensor* b,
                                  THCudaTensor* c,
                                  const Op& op,
                                  TensorArgType aType = ReadWrite,
                                  TensorArgType bType = ReadOnly,
                                  TensorArgType cType = ReadOnly) {
  ...
  // triple angle brackets, so this is a kernel *launch*
  THCudaTensor_pointwiseApply3<Op, TYPE, A, B, C>
    <<<grid, block, 0, THCState_getCurrentStream(state)>>>(
      aInfo, bInfo, cInfo, (TYPE) totalElements, op);
  ...
}
```
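The `IndexToOffset` loop is just mixed-radix arithmetic: peel off one coordinate per dimension with `%` and `/`, and weight each coordinate by its stride. A hedged, host-only sketch (my illustration, not repo code) that traces it for a transposed 2x3 view:

```
#include <cstdio>

// Same logic as IndexToOffset::get, minus the CUDA qualifiers and templates.
unsigned indexToOffset(unsigned linearId, int dims,
                       const unsigned *sizes, const unsigned *strides) {
  unsigned offset = 0;
  for (int i = dims - 1; i >= 0; --i) {
    offset += (linearId % sizes[i]) * strides[i];  // coordinate i, weighted by its stride
    linearId /= sizes[i];
  }
  return offset;
}

int main() {
  // a 2x3 transposed view of a contiguous 3x2 buffer: sizes {2,3}, strides {1,2}
  unsigned sizes[] = {2, 3}, strides[] = {1, 2};
  for (unsigned id = 0; id < 6; ++id)
    printf("linearId %u -> offset %u\n", id, indexToOffset(id, 2, sizes, strides));
  // prints offsets 0,2,4,1,3,5: a row-major walk of the transposed view
  return 0;
}
```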
--------------------------------------------------------------------------------
/doc/torch/torch.md:
--------------------------------------------------------------------------------
# torch

## root /

* init.c
  * contains luaopen_libtorch
  * creates `torch` table
  * calls torch_(type)(Storage|Tensor)_init
* init.lua
  * requires libtorch
  * includes (Tensor|File|FFI|...).lua
  * defines torch.(type|class|include|...)
* Storage.c
  * torch_Storage_(NAME) => torch_(Type)Storage_(NAME)
  * torch_Storage => torch.(Type)Storage
  * includes generic/Storage.c and THGenerateAllTypes.h
* Tensor.c
  * includes generic/Tensor.c and THGenerateAllTypes.h
* Tensor.lua
  * A few Tensor utility methods, like print, expand, totable
  * Tensor.(typeAs|byte|char|short|int|long|float|double) methods

## /generic

* Storage.c
  * handles the lua types/interfaces, then calls THStorage methods
  * for char, byte, int, double, float, ...:
    * torch_(Type)Storage_new
      * calls TH(Type)Storage_new, with the appropriate size
      * calls TH(Type)Storage_set, with values
    * torch_(Type)Storage_free
      * calls TH(Type)Storage_free
    * torch_(Type)Storage_resize
      * calls TH(Type)Storage_resize
      * (does little else)
    * torch_(Type)Storage_copy
      * if statement, based on the source type, passed in as an argument
      * calls the appropriate THStorage_copy, based on the source type
    * torch_(Type)Storage_fill
      * calls TH(Type)Storage_fill
    * torch_(Type)Storage_newindx
      * calls TH(Type)Storage_set
    * torch_(Type)Storage_index
      * calls TH(Type)Storage_get
    * torch_(Type)Storage_factory
      * calls TH(Type)Storage_new
    * torch_(type)Storage_init
      * registers the methods above
* Tensor.c
  * for char, byte, int, double, float, ...:
    * torch_(Type)Tensor_new
      * creates a THLongStorage to read the size of each dimension
      * creates a new TH(Type)Storage, and resizes it
      * calls TH(Type)Storage_set on each item

## lib/TH

* TH.h
  * includes lib/TH/TH*.h
* THGeneral.c/h
  * THAlloc
  * THFree
  * THRealloc
  * THSetArgErrorHandler
  * (a few others)
* THStorage.h
  * THStorage => TH(Type)Storage
  * THStorage_(Name) => TH(Type)Storage_(Name)
  * includes lib/TH/generic/THStorage.h and THGenerateAllTypes.h
  * includes lib/TH/generic/THStorageCopy.h and THGenerateAllTypes.h
* THStorage.c
  * includes lib/TH/generic/THStorage.c and all types
  * includes lib/TH/generic/THStorageCopy.c and all types
* THTensor.c/h
  * generates all types for lib/TH/generic/THTensor*.h/c

## lib/TH/generic

* THStorage.c/h
  * TH(Type)Storage_(new,newWithSize,newWithAllocator,free,newWithData,
    resize, fill, set, get)
* THTensor.c/h
  * TH(Type)Tensor_(storage,storageOffset,size,stride,data,rawInit,
    new,newWithTensor,newWithStorage,newWithSize,newClone, resize, ...)

# cutorch

## root /

* init.lua
  * require libcutorch
  * include Tensor.lua, FFI.lua, test.lua
* init.c
  * defines and registers global functions, eg synchronize, getNumStreams, setDevice
  * calls cutorch_Cuda(Storage|Tensor|TensorMath|TensorOperator)_init(L)
  * initializes THCState, and stores it as _state
* Storage.c
  * calls generic/Storage.c for Real=Cuda
  * defines cutorch_CudaStorage_copy for each src type (Cuda|Byte|...)
  * defines cutorch_(Type)Storage_copy for all src types
  * registers the copy methods as the 'copy' method of torch.ByteStorage etc
  * seems like, since generic/Storage.c just calls the appropriate THCuda method, generic/Storage.c doesn't need much modification?
* Tensor.c
  * as for Storage.c: include generic/Tensor.c for every type, just overwrite the `copy` methods
* Tensor.lua
  * injects a `cuda()` method into each of the other Tensor types
  * adds 'double()', 'float()' etc methods to the torch.CudaTensor type
* FFI.lua
  * almost empty
  * contains the structs, ie:
    * THCState
    * THCudaStorage
    * THCudaTensor
  * adds `cdata` and `data` methods (?) to Storage and Tensor

## torch/generic

* Storage.c and Tensor.c from torch/generic, mostly unchanged, but modified in places... eg THCState instead of THState, and cutorch_getState instead of checkudata; and that's not the only difference :-(

## lib/THC

* THC.h
  * includes lib/THC/TH*.h
* THCGeneral.h/c
  * includes cuda.h etc
  * defines THAssert
  * defines THC_API, THC_EXTERNC
  * defines the THCState struct
  * implementation of global functions, like:
    * THCudaInit
    * THCudaBlas_init
    * THCState_getNumDevices
* THCStorage.c/h/cu
  * defines the THCudaStorage struct, containing allocator, refcount, ..
  * defines THCudaStorage_(new,set,get,free,fill,resize,data)
  * `fill` and `resize` are in the .cu, presumably because these need kernels (seems like the .cu is just more definitions of what is in the .h file though, some in the .c, some in the .cu)
  * other methods just use cudaMalloc, cudaFree, cudaMemcpy, etc
* THCTensor.c/h/cu
  * various methods like retain, free, set1d/2d/..., get1d/2d/...,
    squeeze, storage, new, data, lots of `new` methods, `resize` methods
  * meld shows there's basically no difference between the .c file and the original torch one, in lib/TH/generic/THTensor.c
  * the .cu has two functions:
    * THCudaTensor_getDevice
    * THCudaTensor_getTextureObject
* (No lib/THC/generic)
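The `torch_Storage_(NAME) => torch_(Type)Storage_(NAME)` expansion near the top is the trick that lets one generic file compile once per type. A minimal standalone sketch of the mechanism (my illustration; TH's real macros live in THGeneral.h and differ in detail):

```
#include <stdio.h>

/* two-step concat so that Real gets macro-expanded before token pasting */
#define TH_CONCAT_4_EXPAND(a, b, c, d) a##b##c##d
#define TH_CONCAT_4(a, b, c, d) TH_CONCAT_4_EXPAND(a, b, c, d)
#define torch_Storage_(NAME) TH_CONCAT_4(torch_, Real, Storage_, NAME)

/* simulate one inclusion of the "generic" file, with Real/real bound to Float/float */
#define Real Float
#define real float
real torch_Storage_(get)(const real *data, long i) { /* becomes torch_FloatStorage_get */
  return data[i];
}
#undef real
#undef Real

int main(void) {
  float data[] = {1.5f, 2.5f};
  printf("%f\n", torch_FloatStorage_get(data, 1)); /* prints 2.500000 */
  return 0;
}
```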
--------------------------------------------------------------------------------
/rocks/cltorch-scm-1.rockspec:
--------------------------------------------------------------------------------
package = "cltorch"
version = "scm-1"

source = {
   url = "git://github.com/hughperkins/cltorch.git",
}

description = {
   summary = "OpenCL backend for Torch",
   detailed = [[
   ]],
   homepage = "https://github.com/hughperkins/cltorch",
   license = "BSD"
}

dependencies = {
   "torch >= 7.0",
}

build = {
   type = "command",
   build_command = [[
cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.." -DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) -j$(getconf _NPROCESSORS_ONLN) install
   ]],
   platforms = {
      windows = {
         build_command = [[
cmake -E make_directory build && cd build && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_PREFIX_PATH="$(LUA_BINDIR)/.."
-DCMAKE_INSTALL_PREFIX="$(PREFIX)" && $(MAKE) install 29 | ]] 30 | } 31 | }, 32 | install_command = "cd build" 33 | } 34 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | ADD_SUBDIRECTORY(lib) 4 | 5 | -------------------------------------------------------------------------------- /src/FFI.lua: -------------------------------------------------------------------------------- 1 | local ok, ffi = pcall(require, 'ffi') 2 | if ok then 3 | 4 | local cdefs = [[ 5 | 6 | typedef struct THClStorage 7 | { 8 | int device; 9 | float *data; 10 | void *cl; 11 | void *wrapper; 12 | long size; 13 | int refcount; 14 | char flag; 15 | void *allocator; 16 | void *allocatorContext; 17 | struct THClStorage *view; 18 | } THClStorage; 19 | 20 | typedef struct THClTensor 21 | { 22 | long *size; 23 | long *stride; 24 | int nDimension; 25 | 26 | THClStorage *storage; 27 | long storageOffset; 28 | int refcount; 29 | 30 | char flag; 31 | 32 | int device; 33 | } THClTensor; 34 | 35 | ]] 36 | ffi.cdef(cdefs) 37 | 38 | local Storage = torch.getmetatable('torch.ClStorage') 39 | local Storage_tt = ffi.typeof('THClStorage**') 40 | 41 | rawset(Storage, "cdata", function(self) return Storage_tt(self)[0] end) 42 | rawset(Storage, "data", function(self) return Storage_tt(self)[0].data end) 43 | -- Tensor 44 | local Tensor = torch.getmetatable('torch.ClTensor') 45 | local Tensor_tt = ffi.typeof('THClTensor**') 46 | 47 | rawset(Tensor, "cdata", function(self) return Tensor_tt(self)[0] end) 48 | 49 | rawset(Tensor, "data", 50 | function(self) 51 | self = Tensor_tt(self)[0] 52 | return self.storage ~= nil and self.storage.data + self.storageOffset or nil 53 | end 54 | ) 55 | 56 | end 57 | -------------------------------------------------------------------------------- /src/Random.lua: -------------------------------------------------------------------------------- 1 | -- I reckon that generating on host side and copying to gpu 2 | -- will work just fine for many scenarios, eg dropout 3 | -- I cite as evidence the answer by Klerik at 4 | -- http://stackoverflow.com/questions/9912143/how-to-get-a-random-number-in-opencl 5 | 6 | function torch.ClTensor:bernoulli(p) 7 | if p ~= nil then 8 | self:copy(torch.Tensor(self:size()):bernoulli(p)) 9 | else 10 | self:copy(torch.Tensor(self:size()):bernoulli()) 11 | end 12 | return self 13 | end 14 | 15 | function torch.ClTensor:uniform(a, b) 16 | if a == nil then 17 | a = 0 18 | end 19 | if b == nil then 20 | b = 1 21 | end 22 | self:copy(torch.Tensor(self:size()):uniform(a, b)) 23 | return self 24 | end 25 | 26 | -------------------------------------------------------------------------------- /src/Storage.cpp: -------------------------------------------------------------------------------- 1 | #include "torch/utils.h" 2 | #include "THCl.h" 3 | //#include "THFile.h" 4 | #include "luaT.h" 5 | 6 | extern "C" { 7 | void cltorch_ClStorage_init(lua_State* L); 8 | } 9 | 10 | #define EXCEPT_TO_THERROR(method) \ 11 | try { \ 12 | method; \ 13 | } catch(exception &e) { \ 14 | THError("Something went wrong: %s", e.what()); \ 15 | } 16 | 17 | /* everything is as the generic Storage.c, except few things (see below) */ 18 | 19 | #define real float 20 | #define Real Cl 21 | #define TH_GENERIC_FILE "generic/Storage.c" 22 | 23 | #define torch_Storage_(NAME) TH_CONCAT_4(torch_,Real,Storage_,NAME) 24 | 25 | #define 
THFile_readRealRaw(file, data, size) \ 26 | { \ 27 | float *fdata = (float*)THAlloc(sizeof(float)*size); \ 28 | THError("Not implemented"); \ 29 | THFile_readFloatRaw(file, fdata, size); \ 30 | /* THClCheck(clMemcpy(data, fdata, size * sizeof(float), clMemcpyHostToDevice));*/ \ 31 | THFree(fdata); \ 32 | } 33 | 34 | #define THFile_writeRealRaw(file, data, size) \ 35 | { \ 36 | float *fdata = (float*)THAlloc(sizeof(float)*size); \ 37 | THError("Not implemented"); \ 38 | /* THClCheck(clMemcpy(fdata, data, size * sizeof(float), clMemcpyDeviceToHost));*/ \ 39 | THFile_writeFloatRaw(file, fdata, size); \ 40 | THFree(fdata); \ 41 | } 42 | 43 | #define torch_Storage TH_CONCAT_STRING_3(torch.,Real,Storage) 44 | 45 | #include "generic/Storage.cpp" 46 | 47 | #undef real 48 | #undef Real 49 | #undef TH_GENERIC_FILE 50 | 51 | /* now we overwrite some methods specific to ClStorage */ 52 | 53 | static int cltorch_ClStorage_copy(lua_State *L) 54 | { 55 | THClState *state = cltorch_getstate(L); 56 | THClStorage *storage = static_cast(luaT_checkudata(L, 1, "torch.ClStorage")); 57 | void *src; 58 | if( (src = luaT_toudata(L, 2, "torch.ClStorage")) ) { 59 | EXCEPT_TO_THERROR(THClStorage_copy(state, storage, static_cast(src))); 60 | } else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) ) { 61 | EXCEPT_TO_THERROR(THClStorage_copyByte(state, storage, static_cast(src))); 62 | } else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) ) { 63 | EXCEPT_TO_THERROR(THClStorage_copyChar(state, storage, static_cast(src))); 64 | } else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) ) { 65 | EXCEPT_TO_THERROR(THClStorage_copyShort(state, storage, static_cast(src))); 66 | } else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) ) { 67 | EXCEPT_TO_THERROR(THClStorage_copyInt(state, storage, static_cast(src))); 68 | } else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) ) { 69 | EXCEPT_TO_THERROR(THClStorage_copyLong(state, storage, static_cast(src))); 70 | } else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) ) { 71 | EXCEPT_TO_THERROR(THClStorage_copyFloat(state, storage, static_cast(src))); 72 | } else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) { 73 | EXCEPT_TO_THERROR(THClStorage_copyDouble(state, storage, static_cast(src))); 74 | } else if( (src = luaT_toudata(L, 2, "torch.ClStorage")) ) { 75 | EXCEPT_TO_THERROR(THClStorage_copyCl(state, storage, static_cast(src))); 76 | } else 77 | luaL_typerror(L, 2, "torch.*Storage"); 78 | 79 | lua_settop(L, 1); 80 | return 1; 81 | } 82 | 83 | #define CL_IMPLEMENT_STORAGE_COPY(TYPEC) \ 84 | static int cltorch_##TYPEC##Storage_copy(lua_State *L) \ 85 | { \ 86 | TH##TYPEC##Storage *storage = static_cast(luaT_checkudata(L, 1, "torch." #TYPEC "Storage")); \ 87 | void *src; \ 88 | if( (src = luaT_toudata(L, 2, "torch." 
#TYPEC "Storage")) ) \ 89 | TH##TYPEC##Storage_copy(storage, static_cast(src)); \ 90 | else if( (src = luaT_toudata(L, 2, "torch.ByteStorage")) ) \ 91 | TH##TYPEC##Storage_copyByte(storage, static_cast(src)); \ 92 | else if( (src = luaT_toudata(L, 2, "torch.CharStorage")) ) \ 93 | TH##TYPEC##Storage_copyChar(storage, static_cast(src)); \ 94 | else if( (src = luaT_toudata(L, 2, "torch.ShortStorage")) ) \ 95 | TH##TYPEC##Storage_copyShort(storage, static_cast(src)); \ 96 | else if( (src = luaT_toudata(L, 2, "torch.IntStorage")) ) \ 97 | TH##TYPEC##Storage_copyInt(storage, static_cast(src)); \ 98 | else if( (src = luaT_toudata(L, 2, "torch.LongStorage")) ) \ 99 | TH##TYPEC##Storage_copyLong(storage, static_cast(src)); \ 100 | else if( (src = luaT_toudata(L, 2, "torch.FloatStorage")) ) \ 101 | TH##TYPEC##Storage_copyFloat(storage, static_cast(src)); \ 102 | else if( (src = luaT_toudata(L, 2, "torch.DoubleStorage")) ) \ 103 | TH##TYPEC##Storage_copyDouble(storage, static_cast(src)); \ 104 | else if( (src = luaT_toudata(L, 2, "torch.ClStorage")) ) \ 105 | TH##TYPEC##Storage_copyCl(cltorch_getstate(L), storage, static_cast(src)); \ 106 | else \ 107 | luaL_typerror(L, 2, "torch.*Storage"); \ 108 | \ 109 | lua_settop(L, 1); \ 110 | return 1; \ 111 | } 112 | 113 | CL_IMPLEMENT_STORAGE_COPY(Byte) 114 | CL_IMPLEMENT_STORAGE_COPY(Char) 115 | CL_IMPLEMENT_STORAGE_COPY(Short) 116 | CL_IMPLEMENT_STORAGE_COPY(Int) 117 | CL_IMPLEMENT_STORAGE_COPY(Long) 118 | CL_IMPLEMENT_STORAGE_COPY(Float) 119 | CL_IMPLEMENT_STORAGE_COPY(Double) 120 | 121 | void cltorch_ClStorage_init(lua_State* L) 122 | { 123 | /* the standard stuff */ 124 | torch_ClStorage_init(L); 125 | 126 | /* the copy methods */ 127 | { 128 | int i; 129 | 130 | const char* tnames[8] = {"torch.ByteStorage", 131 | "torch.CharStorage", 132 | "torch.ShortStorage", 133 | "torch.IntStorage", 134 | "torch.LongStorage", 135 | "torch.FloatStorage", 136 | "torch.DoubleStorage", 137 | "torch.ClStorage"}; 138 | 139 | static int (*funcs[8])(lua_State*) = {cltorch_ByteStorage_copy, 140 | cltorch_CharStorage_copy, 141 | cltorch_ShortStorage_copy, 142 | cltorch_IntStorage_copy, 143 | cltorch_LongStorage_copy, 144 | cltorch_FloatStorage_copy, 145 | cltorch_DoubleStorage_copy, 146 | cltorch_ClStorage_copy}; 147 | 148 | for(i = 0; i < 8; i++) 149 | { 150 | luaT_pushmetatable(L, tnames[i]); 151 | lua_pushcfunction(L, funcs[i]); 152 | lua_setfield(L, -2, "copy"); 153 | lua_pop(L, 1); 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/Tensor.lua: -------------------------------------------------------------------------------- 1 | -- this is misleading, since it copies onto cpu, and does it on cpu 2 | function torch.ClTensor.apply(self, func) 3 | local x = torch.FloatTensor(self:size()):copy(self) 4 | x:apply(func) 5 | self:copy(x) 6 | return self 7 | end 8 | 9 | local function Tensor__type(self,type) 10 | local current = torch.typename(self) 11 | if not type then return current end 12 | if type ~= current then 13 | local new = torch.getmetatable(type).new() 14 | if self:nElement() > 0 then 15 | new:resize(self:size()):copy(self) 16 | end 17 | return new 18 | else 19 | return self 20 | end 21 | end 22 | local function Tensor__typeAs(self,tensor) 23 | return self:type(tensor:type()) 24 | end 25 | local function Tensor__cl(self) 26 | return self:type('torch.ClTensor') 27 | end 28 | local function Tensor__double(self) 29 | return self:type('torch.DoubleTensor') 30 | end 31 | local function Tensor__float(self) 32 
| return self:type('torch.FloatTensor') 33 | end 34 | 35 | local function Tensor__byte(self) 36 | return self:type('torch.ByteTensor') 37 | end 38 | 39 | local function Tensor__char(self) 40 | return self:type('torch.CharTensor') 41 | end 42 | 43 | local function Tensor__int(self) 44 | return self:type('torch.IntTensor') 45 | end 46 | 47 | local function Tensor__short(self) 48 | return self:type('torch.ShortTensor') 49 | end 50 | 51 | local function Tensor__long(self) 52 | return self:type('torch.LongTensor') 53 | end 54 | 55 | rawset(torch.getmetatable('torch.DoubleTensor'), 'cl', Tensor__cl) 56 | rawset(torch.getmetatable('torch.FloatTensor'), 'cl', Tensor__cl) 57 | rawset(torch.getmetatable('torch.ByteTensor'), 'cl', Tensor__cl) 58 | rawset(torch.getmetatable('torch.CharTensor'), 'cl', Tensor__cl) 59 | rawset(torch.getmetatable('torch.IntTensor'), 'cl', Tensor__cl) 60 | rawset(torch.getmetatable('torch.ShortTensor'), 'cl', Tensor__cl) 61 | rawset(torch.getmetatable('torch.LongTensor'), 'cl', Tensor__cl) 62 | rawset(torch.getmetatable('torch.ClTensor'), 'cl', Tensor__cl) 63 | 64 | rawset(torch.getmetatable('torch.ClTensor'), 'type', Tensor__type) 65 | rawset(torch.getmetatable('torch.ClTensor'), 'typeAs', Tensor__typeAs) 66 | rawset(torch.getmetatable('torch.ClTensor'), 'double', Tensor__double) 67 | rawset(torch.getmetatable('torch.ClTensor'), 'float', Tensor__float) 68 | rawset(torch.getmetatable('torch.ClTensor'), 'byte', Tensor__byte) 69 | rawset(torch.getmetatable('torch.ClTensor'), 'char', Tensor__char) 70 | rawset(torch.getmetatable('torch.ClTensor'), 'int', Tensor__int) 71 | rawset(torch.getmetatable('torch.ClTensor'), 'short', Tensor__short) 72 | rawset(torch.getmetatable('torch.ClTensor'), 'long', Tensor__long) 73 | 74 | do 75 | local metatable = torch.getmetatable('torch.ClTensor') 76 | -- hmmm, maybe these are running on cpu? 
:-P 77 | for _,func in pairs{'expand', 'expandAs', 'view', 'viewAs', 'repeatTensor', 78 | 'permute', 'split', 'chunk'} do 79 | rawset(metatable, func, torch[func]) 80 | end 81 | end 82 | 83 | -------------------------------------------------------------------------------- /src/TensorOperator.c: -------------------------------------------------------------------------------- 1 | #include "torch/utils.h" 2 | #include "luaT.h" 3 | #include "THCl.h" 4 | 5 | static int cltorch_ClTensorOperator___add__(lua_State *L) 6 | { 7 | THClTensor *tensor1 = luaT_toudata(L, 1, "torch.ClTensor"); 8 | THClTensor *tensor2 = luaT_toudata(L, 2, "torch.ClTensor"); 9 | THClTensor *r; 10 | THClState *state = cltorch_getstate(L); 11 | THAssert(THClTensor_checkGPU(state, 2, tensor1, tensor2)); 12 | 13 | if(!tensor1 && !tensor2) 14 | luaL_error(L, "expecting two Tensors or one Tensor and one number"); 15 | else 16 | { 17 | int device = -1; 18 | if(tensor1) { 19 | device = tensor1->storage->device; 20 | } else { 21 | device = tensor2->storage->device; 22 | } 23 | r = THClTensor_newv2(state, device); 24 | luaT_pushudata(L, r, "torch.ClTensor"); 25 | 26 | if(!tensor1 && tensor2) 27 | { 28 | THClTensor_resizeAs(state, r, tensor2); 29 | THClTensor_copy(state, r, tensor2); 30 | THClTensor_add(state, r, r, luaL_checknumber(L, 1)); 31 | } 32 | else if(tensor1 && !tensor2) 33 | { 34 | THClTensor_resizeAs(state, r, tensor1); 35 | THClTensor_copy(state, r, tensor1); 36 | THClTensor_add(state, r, r, luaL_checknumber(L, 2)); 37 | } 38 | else 39 | { 40 | THClTensor_resizeAs(state, r, tensor1); 41 | THClTensor_copy(state, r, tensor1); 42 | THClTensor_cadd(state, r, r, 1, tensor2); 43 | } 44 | } 45 | return 1; 46 | } 47 | 48 | static int cltorch_ClTensorOperator___sub__(lua_State *L) 49 | { 50 | THClTensor *tensor1 = luaT_toudata(L, 1, "torch.ClTensor"); 51 | THClTensor *tensor2 = luaT_toudata(L, 2, "torch.ClTensor"); 52 | THClTensor *r; 53 | THClState *state = cltorch_getstate(L); 54 | THAssert(THClTensor_checkGPU(state, 2, tensor1, tensor2)); 55 | 56 | if(!tensor1 && !tensor2) 57 | luaL_error(L, "expecting two Tensors or one Tensor and one number"); 58 | else 59 | { 60 | int device = -1; 61 | if(tensor1) { 62 | device = tensor1->storage->device; 63 | } else { 64 | device = tensor2->storage->device; 65 | } 66 | r = THClTensor_newv2(state, device); 67 | luaT_pushudata(L, r, "torch.ClTensor"); 68 | 69 | if(!tensor1 && tensor2) 70 | { 71 | THClTensor_resizeAs(state, r, tensor2); 72 | THClTensor_fill(state, r, luaL_checknumber(L, 1)); 73 | THClTensor_cadd(state, r, r, -1, tensor2); 74 | } 75 | else if(tensor1 && !tensor2) 76 | { 77 | THClTensor_resizeAs(state, r, tensor1); 78 | THClTensor_copy(state, r, tensor1); 79 | THClTensor_add(state, r, r, -luaL_checknumber(L, 2)); 80 | } 81 | else 82 | { 83 | THClTensor_resizeAs(state, r, tensor1); 84 | THClTensor_copy(state, r, tensor1); 85 | THClTensor_cadd(state, r, r, -1, tensor2); 86 | } 87 | } 88 | return 1; 89 | } 90 | 91 | static int cltorch_ClTensorOperator___unm__(lua_State *L) 92 | { 93 | THClTensor *tensor = luaT_checkudata(L, 1, "torch.ClTensor"); 94 | THClTensor *r; 95 | THClState *state = cltorch_getstate(L); 96 | THAssert(THClTensor_checkGPU(state, 1, tensor)); 97 | 98 | r = THClTensor_newv2(state, tensor->storage->device); 99 | luaT_pushudata(L, r, "torch.ClTensor"); 100 | THClTensor_resizeAs(state, r, tensor); 101 | THClTensor_copy(state, r, tensor); 102 | THClTensor_mul(state, r, r, -1); 103 | 104 | return 1; 105 | } 106 | 107 | static int 
cltorch_ClTensorOperator___mul__(lua_State *L)
108 | {
109 | THClTensor *tensor1 = luaT_toudata(L, 1, "torch.ClTensor");
110 | THClTensor *tensor2 = luaT_toudata(L, 2, "torch.ClTensor");
111 | THClTensor *r;
112 | THClState *state = cltorch_getstate(L);
113 | THAssert(THClTensor_checkGPU(state, 2, tensor1, tensor2));
114 |
115 | if(!tensor1 && !tensor2)
116 | luaL_error(L, "expecting two Tensors or one Tensor and one number");
117 | else
118 | {
119 | int device = -1;
120 | if(tensor1) {
121 | device = tensor1->storage->device;
122 | } else {
123 | device = tensor2->storage->device;
124 | }
125 | r = THClTensor_newv2(state, device);
126 | luaT_pushudata(L, r, "torch.ClTensor");
127 |
128 | if(!tensor1 && tensor2)
129 | {
130 | THClTensor_resizeAs(state, r, tensor2);
131 | THClTensor_copy(state, r, tensor2);
132 | THClTensor_mul(state, r, r, luaL_checknumber(L, 1));
133 | }
134 | else if(tensor1 && !tensor2)
135 | {
136 | THClTensor_resizeAs(state, r, tensor1);
137 | THClTensor_copy(state, r, tensor1);
138 | THClTensor_mul(state, r, r, luaL_checknumber(L, 2));
139 | }
140 | else
141 | {
142 | int dimt = tensor1->nDimension;
143 | int dims = tensor2->nDimension;
144 |
145 | if(dimt == 1 && dims == 1)
146 | lua_pushnumber(L, THClTensor_dot(state, tensor1, tensor2)); /* ok, we wasted r, but who cares */
147 | else if(dimt == 2 && dims == 1)
148 | {
149 | THClTensor_resize1d(state, r, tensor1->size[0]);
150 | THClTensor_zero(state, r);
151 | THClTensor_addmv(state, r, 1, r, 1, tensor1, tensor2);
152 | }
153 | else if(dimt == 2 && dims == 2)
154 | {
155 | THClTensor_resize2d(state, r, tensor1->size[0], tensor2->size[1]);
156 | THClTensor_zero(state, r);
157 | THClTensor_addmm(state, r, 1, r, 1, tensor1, tensor2);
158 | }
159 | else
160 | luaL_error(L, "multiplication between %dD and %dD tensors not yet supported", tensor1->nDimension, tensor2->nDimension);
161 | }
162 | }
163 | return 1;
164 | }
165 |
166 | static int cltorch_ClTensorOperator___div__(lua_State *L)
167 | {
168 | THClTensor *tensor = luaT_checkudata(L, 1, "torch.ClTensor");
169 | THClTensor *r;
170 | THClState *state = cltorch_getstate(L);
171 | THAssert(THClTensor_checkGPU(state, 1, tensor));
172 |
173 | luaL_argcheck(L, lua_isnumber(L,2), 2, "number expected");
174 |
175 | r = THClTensor_newv2(state, tensor->storage->device);
176 | luaT_pushudata(L, r, "torch.ClTensor");
177 |
178 | THClTensor_resizeAs(state, r, tensor);
179 | THClTensor_copy(state, r, tensor);
180 | THClTensor_mul(state, r, r, 1/lua_tonumber(L, 2));
181 |
182 | return 1;
183 | }
184 |
185 | static const struct luaL_Reg cltorch_ClTensorOperator__ [] = {
186 | {"__add__", cltorch_ClTensorOperator___add__},
187 | {"__sub__", cltorch_ClTensorOperator___sub__},
188 | {"__unm__", cltorch_ClTensorOperator___unm__},
189 | {"__mul__", cltorch_ClTensorOperator___mul__},
190 | {"__div__", cltorch_ClTensorOperator___div__},
191 | {NULL, NULL}
192 | };
193 |
194 | void cltorch_ClTensorOperator_init(lua_State *L)
195 | {
196 | luaT_pushmetatable(L, "torch.ClTensor");
197 | luaL_setfuncs(L, cltorch_ClTensorOperator__, 0);
198 | lua_pop(L, 1);
199 | }
200 |
-------------------------------------------------------------------------------- /src/Test.lua: --------------------------------------------------------------------------------
1 | function cltorch.test()
2 | print('running tests...')
3 | -- luaunit = require('luaunit')
4 |
5 | require('cltorch.unit_storage')
6 | print('after requiring cltorch.unit_storage')
7 | -- test_basic()
8 | local res = cltorch.tests.storage.test()
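-- The metamethods registered in TensorOperator.c above give ClTensor the
-- usual arithmetic sugar. A minimal illustration (a sketch only; it assumes
-- at least one OpenCL device, and the values are made up):
--
--   local a = torch.Tensor{{1, 2}, {3, 4}}:cl()
--   local b = torch.Tensor{{5, 6}, {7, 8}}:cl()
--   print(a + b)   -- elementwise, via __add__
--   print(a - 3)   -- tensor and number, via __sub__
--   print(-a)      -- via __unm__
--   print(a * b)   -- 2D x 2D dispatches to THClTensor_addmm (matrix product)
--   print(a / 2)   -- via __div__, implemented as multiplication by 1/2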
9 | print('res', res)
10 | assert(res == true)
11 |
12 | require('cltorch.unit_tensor')
13 | print('after requiring cltorch.unit_tensor')
14 | -- test_basic()
15 | res = cltorch.tests.tensor.test()
16 | assert(res == true)
17 |
18 | print('all tests finished')
19 | end
20 |
21 |
-------------------------------------------------------------------------------- /src/UserKernel.h: --------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | extern "C" {
4 | #include "lua.h"
5 | void cltorch_UserKernel_init(lua_State *L);
6 | }
7 |
8 |
-------------------------------------------------------------------------------- /src/cmake/build_EasyCL.cmake: --------------------------------------------------------------------------------
1 | INCLUDE(ExternalProject)
2 |
3 | message("CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}")
4 | ExternalProject_Add(
5 | EasyCL-external
6 | STAMP_DIR ${CMAKE_BINARY_DIR}/EasyCL/stamp
7 | SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL
8 | PREFIX ${CMAKE_BINARY_DIR}/EasyCL
9 | INSTALL_DIR ${CMAKE_INSTALL_PREFIX}
10 | CMAKE_CACHE_ARGS
11 | -DBUILD_TESTS:BOOL=OFF
12 | -DPROVIDE_LUA_ENGINE:BOOL=OFF
13 | -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX}
14 | -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo
15 | )
16 |
17 | ADD_LIBRARY(EasyCL SHARED IMPORTED)
18 | ADD_DEPENDENCIES(EasyCL EasyCL-external)
19 | #SET(EASYCL_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/include/deepcl ${CMAKE_INSTALL_PREFIX}/include/easycl )
20 | SET(EasyCL_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include )
21 | #SET(EasyCL_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clew${CMAKE_SHARED_LIBRARY_SUFFIX})
22 | #SET(EasyCL_FOUND TRUE)
23 |
24 | #set_property(TARGET EasyCL
25 | # PROPERTY INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
26 | #)
27 |
28 | set_target_properties(EasyCL PROPERTIES
29 | # INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
30 | IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
31 | )
32 |
33 | #set_property(TARGET EasyCL
34 | # PROPERTY IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX}
35 | # IMPORTED_LINK_INTERFACE_LIBRARIES_DEBUG "EasyCL;clBLAS;/usr/lib/x86_64-linux-gnu/libjpeg.so"
36 | # IMPORTED_LOCATION_DEBUG "/home/ubuntu/git/DeepCL/build/libDeepCL.so"
37 | # IMPORTED_SONAME_DEBUG "libDeepCL.so"
38 | #)
39 |
40 | #set_target_properties(EasyCL PROPERTIES
41 | # IMPORTED_LINK_INTERFACE_LIBRARIES_DEBUG "EasyCL;clBLAS;/usr/lib/x86_64-linux-gnu/libjpeg.so"
42 | # IMPORTED_LOCATION_DEBUG "/home/ubuntu/git/DeepCL/build/libDeepCL.so"
43 | # IMPORTED_SONAME_DEBUG "libDeepCL.so"
44 | #)
45 |
46 | add_custom_target(easycl_delete_stamp ALL
47 | COMMAND ${CMAKE_COMMAND} -E remove_directory "${CMAKE_BINARY_DIR}/EasyCL/stamp"
48 | )
49 | add_dependencies(EasyCL-external easycl_delete_stamp)
50 |
51 |
-------------------------------------------------------------------------------- /src/cmake/build_clBLAS.cmake: --------------------------------------------------------------------------------
1 | INCLUDE(ExternalProject)
2 |
3 | message("CMAKE_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX}")
4 | ExternalProject_Add(
5 | clBLAS-external
6 |
#GIT_REPOSITORY git@github.com:clMathLibraries/clBLAS.git 7 | #GIT_TAG master 8 | #DOWNLOAD_DIR ${CMAKE_CURRENT_SOURCE_DIR}/clMathLibraries/clBLAS 9 | #GIT_SUBMODULES clBLAS 10 | STAMP_DIR ${CMAKE_BINARY_DIR}/clBLAS/stamp 11 | SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src 12 | PREFIX ${CMAKE_BINARY_DIR}/clBLAS 13 | INSTALL_DIR ${CMAKE_INSTALL_PREFIX} 14 | #CONFIGURE_COMMAND ${CMAKE_COMMAND} -Wno-dev "-G${CMAKE_GENERATOR}" 15 | #-DCMAKE_CXX_COMPILER:FILEPATH=${CMAKE_CXX_COMPILER} 16 | #"-DCMAKE_CXX_FLAGS:STRING=${CMAKE_CXX_FLAGS} -w -fPIC" 17 | #-DCMAKE_C_COMPILER:FILEPATH=${CMAKE_C_COMPILER} 18 | #"-DCMAKE_C_FLAGS:STRING=${CMAKE_C_FLAGS} -w -fPIC" 19 | #-DCMAKE_BUILD_TYPE:STRING=Release 20 | CMAKE_CACHE_ARGS 21 | -DCMAKE_INSTALL_PREFIX:PATH=${CMAKE_INSTALL_PREFIX} 22 | -DOPENCL_INCLUDE_DIRS:STRING=${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include;${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include/proxy-opencl 23 | -DOPENCL_LIBRARIES:STRING=${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clew${CMAKE_SHARED_LIBRARY_SUFFIX} 24 | -DBUILD_SHARED_LIBS:BOOL=ON 25 | -DBUILD_CLIENT:BOOL=OFF 26 | -DBUILD_TEST:BOOL=OFF 27 | -DBUILD_KTEST:BOOL=OFF 28 | # -DADD_RPATH:BOOL=ON 29 | -DCMAKE_MACOSX_RPATH:BOOL=ON 30 | -DSUFFIX_LIB:STRING= 31 | -DCORR_TEST_WITH_ACML:BOOL=OFF 32 | -DCMAKE_BUILD_TYPE:STRING=RelWithDebInfo 33 | ) 34 | 35 | #ExternalProject_Get_Property(clBLAS-external install_dir) 36 | ADD_LIBRARY(clBLAS SHARED IMPORTED) 37 | #SET_TARGET_PROPERTIES(clBLAS PROPERTIES IMPORTED_LOCATION ${clBLAS_location}) 38 | ADD_DEPENDENCIES(clBLAS clBLAS-external) 39 | #SET(CLBLAS_INCLUDE_DIRS ${CMAKE_INSTALL_PREFIX}/include) 40 | SET(clBLAS_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/src/clMathLibraries/clBLAS/src ${CMAKE_CURRENT_SOURCE_DIR}/src/EasyCL/thirdparty/clew/include/proxy-opencl) 41 | #SET(CLBLAS_LIBRARIES ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX}) 42 | #SET(CLBLAS_FOUND ON) 43 | 44 | set_target_properties(clBLAS PROPERTIES 45 | # INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}EasyCL${CMAKE_SHARED_LIBRARY_SUFFIX} 46 | MACOSX_RPATH TRUE 47 | INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib 48 | INSTALL_RPATH_USE_LINK_PATH TRUE 49 | IMPORTED_LOCATION ${CMAKE_INSTALL_PREFIX}/lib/${CMAKE_SHARED_LIBRARY_PREFIX}clBLAS${CMAKE_SHARED_LIBRARY_SUFFIX} 50 | ) 51 | 52 | 53 | add_custom_target(clblas_delete_stamp clBLAS-external 54 | ${CMAKE_COMMAND} -E remove_directory "${CMAKE_BINARY_DIR}/clBLAS/stamp" 55 | ) 56 | 57 | -------------------------------------------------------------------------------- /src/init.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "EasyCL.h" 4 | using namespace std; 5 | 6 | #include "util/StatefulTimer.h" 7 | 8 | #include "cltorch_commit_generated.h" 9 | 10 | //#include "THClTensorRandom.h" 11 | 12 | extern "C" { 13 | #include "lua.h" 14 | #include "utils.h" 15 | #include "luaT.h" 16 | int luaopen_libcltorch( lua_State *L ); 17 | extern void cltorch_ClStorage_init(lua_State* L); 18 | extern void cltorch_ClTensor_init(lua_State* L); 19 | extern void cltorch_ClTensorMath_init(lua_State* L); 20 | extern void cltorch_ClTensorOperator_init(lua_State* L); 21 | extern void cltorch_UserKernel_init(lua_State*L); 22 | } 23 | 24 | #include "THClGeneral.h" 25 | #include "THClStorage.h" 26 | 27 | namespace cltorch { 28 | void setProperty(lua_State *L, string name, int value) 29 | { 30 | 
lua_pushnumber(L, value); 31 | lua_setfield(L, -2, name.c_str()); 32 | } 33 | void setProperty(lua_State *L, string name, string value) 34 | { 35 | lua_pushstring(L, value.c_str()); 36 | lua_setfield(L, -2, name.c_str()); 37 | } 38 | static int cltorch_setAllowNonGpus(lua_State *L) 39 | { 40 | THClState *state = cltorch_getstate(L); 41 | int allowNonGpus = luaL_checknumber(L, 1); 42 | THClSetAllowNonGpus(state, allowNonGpus); 43 | return 0; 44 | } 45 | static int cltorch_getDeviceCount(lua_State *L) 46 | { 47 | THClState *state = cltorch_getstate(L); 48 | int count = THClState_getNumDevices(state); 49 | lua_pushnumber(L, count); 50 | return 1; 51 | } 52 | static int cltorch_getDevice(lua_State *L) 53 | { 54 | THClState *state = cltorch_getstate(L); 55 | int device = state->currentDevice; 56 | lua_pushnumber(L, device+1); 57 | return 1; 58 | } 59 | static int cltorch_setDevice(lua_State *L) 60 | { 61 | THClState *state = cltorch_getstate(L); 62 | if(state->initialized == 0) { 63 | THCl_initializeState(state); 64 | } 65 | int device = luaL_checknumber(L, 1) - 1; 66 | if(device < 0 || device >= state->allocatedDevices) { 67 | THError("Device doesnt exist"); 68 | } 69 | state->currentDevice = device; 70 | return 0; 71 | } 72 | static int cltorch_synchronize(lua_State *L) 73 | { 74 | THClState *state = cltorch_getstate(L); 75 | EasyCL *cl = THClState_getClv2(state, state->currentDevice); 76 | cl->finish(); 77 | return 0; 78 | } 79 | static int cltorch_getDeviceProperties(lua_State *L) 80 | { 81 | THClState *state = cltorch_getstate(L); 82 | int device = (int)luaL_checknumber(L, 1)-1; 83 | int count = THClState_getNumDevices(state); 84 | if(device < 0 || device >= count) { 85 | THError("Device doesnt exist"); 86 | } 87 | 88 | easycl::DeviceInfo deviceInfo; 89 | if(state->allowNonGpus) { 90 | deviceInfo = easycl::DevicesInfo::getDeviceInfo( device ); 91 | } else { 92 | deviceInfo = easycl::DevicesInfo::getGpuInfo( device ); 93 | } 94 | lua_newtable(L); 95 | 96 | setProperty(L, "maxWorkGroupSize", deviceInfo.maxWorkGroupSize); 97 | setProperty(L, "platformVendor", deviceInfo.platformVendor); 98 | string deviceTypeString = ""; 99 | if( deviceInfo.deviceType == 4 ) { 100 | deviceTypeString = "GPU"; 101 | } 102 | if( deviceInfo.deviceType == 2 ) { 103 | deviceTypeString = "CPU"; 104 | } 105 | if( deviceInfo.deviceType == 8 ) { 106 | deviceTypeString = "Accelerator"; 107 | } 108 | setProperty(L, "deviceType", deviceTypeString); 109 | setProperty(L, "globalMemSizeMB", deviceInfo.globalMemSize / 1024 / 1024); 110 | setProperty(L, "localMemSizeKB", deviceInfo.localMemSize / 1024); 111 | setProperty(L, "globalMemCachelineSizeKB", deviceInfo.globalMemCachelineSize / 1024 ); 112 | setProperty(L, "maxMemAllocSizeMB", deviceInfo.maxMemAllocSize / 1024 / 1024); 113 | setProperty(L, "maxComputeUnits", deviceInfo.maxComputeUnits); 114 | setProperty(L, "maxWorkGroupSize", deviceInfo.maxWorkGroupSize); 115 | setProperty(L, "deviceName", deviceInfo.deviceName); 116 | setProperty(L, "openClCVersion", deviceInfo.openClCVersion); 117 | setProperty(L, "deviceVersion", deviceInfo.deviceVersion); 118 | setProperty(L, "maxClockFrequency", deviceInfo.maxClockFrequency); 119 | 120 | return 1; 121 | } 122 | 123 | static int cltorch_getState(lua_State *L) 124 | { 125 | lua_getglobal(L, "cltorch"); 126 | lua_getfield(L, -1, "_state"); 127 | lua_remove(L, -2); 128 | return 1; 129 | } 130 | static int cltorch_dumpTimings(lua_State *L) 131 | { 132 | StatefulTimer::timeCheck("before dump"); 133 | StatefulTimer::dump( true ); 134 | 
StatefulTimer::timeCheck("after dump"); 135 | return 0; 136 | } 137 | // note: this is global, not per-device 138 | static int cltorch_setEnableTiming(lua_State *L) 139 | { 140 | int trace = luaL_checknumber(L, 1); 141 | StatefulTimer::setEnabled(trace); 142 | if(trace) { 143 | cout << "Timing activated" << endl; 144 | } else { 145 | cout << "Timing disabled" << endl; 146 | } 147 | return 0; 148 | } 149 | static int cltorch_dumpProfiling(lua_State *L) 150 | { 151 | THClState *state = cltorch_getstate(L); 152 | EasyCL *cl = THClState_getClv2(state, state->currentDevice); 153 | cl->dumpProfiling(); 154 | return 0; 155 | } 156 | // if you turn this to 1, you will see all copies of data between 157 | // host and gpu 158 | // useful for checking we're not doing this too often... 159 | static int cltorch_setTrace(lua_State *L) 160 | { 161 | THClState *state = cltorch_getstate(L); 162 | int trace = luaL_checknumber(L, 1); 163 | state->trace = trace; 164 | return 0; 165 | } 166 | static int cltorch_setProfiling(lua_State *L) 167 | { 168 | THClState *state = cltorch_getstate(L); 169 | int trace = luaL_checknumber(L, 1); 170 | EasyCL *cl = THClState_getClv2(state, state->currentDevice); 171 | cl->setProfiling(trace); 172 | if(trace) { 173 | cout << "Profiling activated" << endl; 174 | } else { 175 | cout << "Profiling disabled" << endl; 176 | } 177 | return 0; 178 | } 179 | static int cltorch_setAddFinish(lua_State *L) 180 | { 181 | THClState *state = cltorch_getstate(L); 182 | int addFinish = luaL_checknumber(L, 1); 183 | state->addFinish = addFinish; 184 | if(addFinish) { 185 | cout << "AddFinish activated" << endl; 186 | } else { 187 | cout << "AddFinish disabled" << endl; 188 | } 189 | return 0; 190 | } 191 | static int cltorch_setDetailedTimings(lua_State *L) 192 | { 193 | THClState *state = cltorch_getstate(L); 194 | int detailedTimings = luaL_checknumber(L, 1); 195 | state->detailedTimings = detailedTimings; 196 | return 0; 197 | } 198 | static int cltorch_about(lua_State *L) 199 | { 200 | cout << "cltorch. 
OpenCL backend for Torch" << endl; 201 | cout << "Built from commit " << cltorch_commit << endl; 202 | cout << "More info, doc: https://github.com/hughperkins/cltorch" << endl; 203 | cout << "Issues: https://github.com/hughperkins/cltorch/issues" << endl; 204 | return 0; 205 | } 206 | 207 | static const struct luaL_Reg cltorch_stuff__ [] = { 208 | {"setAllowNonGpus", cltorch_setAllowNonGpus}, 209 | {"getDevice", cltorch_getDevice}, 210 | {"setDevice", cltorch_setDevice}, 211 | {"synchronize", cltorch_synchronize}, 212 | {"finish", cltorch_synchronize}, 213 | {"getDeviceCount", cltorch_getDeviceCount}, 214 | {"getDeviceProperties", cltorch_getDeviceProperties}, 215 | {"getState", cltorch_getState}, 216 | {"setTrace", cltorch_setTrace}, 217 | {"setAddFinish", cltorch_setAddFinish}, 218 | {"dumpTimings", cltorch_dumpTimings}, 219 | {"setProfiling", cltorch_setProfiling}, 220 | {"setEnableTiming", cltorch_setEnableTiming}, 221 | {"setDetailedTimings", cltorch_setDetailedTimings}, 222 | {"setTiming", cltorch_setEnableTiming}, 223 | {"dumpProfiling", cltorch_dumpProfiling}, 224 | {"about", cltorch_about}, 225 | {NULL, NULL} 226 | }; 227 | } 228 | 229 | int luaopen_libcltorch( lua_State *L ) { 230 | try { 231 | lua_newtable(L); 232 | luaL_setfuncs(L, cltorch::cltorch_stuff__, 0); 233 | 234 | THClState* state = (THClState*)malloc(sizeof(THClState)); 235 | THClInit(state); 236 | 237 | cltorch_ClStorage_init(L); 238 | cltorch_ClTensor_init(L); 239 | cltorch_ClTensorMath_init(L); 240 | cltorch_ClTensorOperator_init(L); 241 | cltorch_UserKernel_init(L); 242 | 243 | lua_pushlightuserdata(L, state); 244 | lua_setfield(L, -2, "_state"); 245 | } catch(runtime_error &e) { 246 | THError("Something went wrong: %s", e.what()); 247 | } 248 | return 1; 249 | } 250 | 251 | -------------------------------------------------------------------------------- /src/init.lua: -------------------------------------------------------------------------------- 1 | require "torch" 2 | 3 | -- check we are installed from distro, otherwise error message and exit... 
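-- For reference, the C functions registered from init.cpp above form the
-- module-level API. A minimal usage sketch (assumes at least one OpenCL
-- device; device indices are 1-based on the Lua side, see
-- cltorch_getDevice/cltorch_setDevice above):
--
--   require 'cltorch'
--   print(cltorch.getDeviceCount())
--   cltorch.setDevice(1)
--   local props = cltorch.getDeviceProperties(1)
--   for k, v in pairs(props) do print(k, v) end
--   cltorch.synchronize()  -- 'cltorch.finish' is registered as an alias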
4 | 5 | require 'os' 6 | 7 | xpcall(function() 8 | require 'distrocheck' 9 | end, function() 10 | print('Please install cltorch from distro, per instructions at https://github.com/hughperkins/cltorch') 11 | os.exit(1) 12 | end) 13 | 14 | -- store old copy functions, in case cutorch has been loaded 15 | -- note that this only works if cutorch is loaded first 16 | 17 | local torchtypes = {} 18 | table.insert(torchtypes, torch.DoubleTensor) 19 | table.insert(torchtypes, torch.FloatTensor) 20 | table.insert(torchtypes, torch.IntTensor) 21 | table.insert(torchtypes, torch.ByteTensor) 22 | table.insert(torchtypes, torch.CharTensor) 23 | table.insert(torchtypes, torch.ShortTensor) 24 | table.insert(torchtypes, torch.LongTensor) 25 | 26 | for i,torchtype in ipairs(torchtypes) do 27 | torchtype.cloldcopy = torchtype.copy 28 | end 29 | 30 | cltorch = paths.require("libcltorch") 31 | 32 | for i,torchtype in ipairs(torchtypes) do 33 | torchtype.clnewcopy = torchtype.copy 34 | end 35 | 36 | for i,torchtype in ipairs(torchtypes) do 37 | torchtype.copy = function (self, two) 38 | if(torch.type(two) == "torch.ClTensor") then 39 | torchtype.clnewcopy(self, two) 40 | else 41 | torchtype.cloldcopy(self, two) 42 | end 43 | return self 44 | end 45 | end 46 | 47 | -- convert to FloatStorage first, rather than repeatedly 48 | -- calling 'get' on ClStorage 49 | function torch.ClStorage.__tostring__(self) 50 | floatstorage = torch.FloatStorage(self:size()) 51 | floatstorage:copy(self) 52 | return string.gsub(floatstorage:__tostring__(), 'FloatStorage', 'ClStorage') 53 | end 54 | 55 | function torch.ClTensor.__tostring__(self) 56 | if self:size():size() ~= 0 then 57 | return torch.FloatTensor.__tostring__(self) 58 | else 59 | return tostring(self:s()) .. '\n[torch.ClTensor of 0 dimensions]' 60 | end 61 | end 62 | 63 | --torch.ClStorage.__tostring__ = torch.FloatStorage.__tostring__ 64 | --torch.ClTensor.__tostring__ = torch.FloatTensor.__tostring__ 65 | 66 | include('Test.lua') 67 | include('Tensor.lua') 68 | include('Random.lua') 69 | include('FFI.lua') 70 | --include('test.lua') 71 | 72 | --local unpack = unpack or table.unpack 73 | 74 | return cltorch 75 | 76 | -------------------------------------------------------------------------------- /src/lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | OPTION(DEV_RUN_COG "cltorch maintainers only, otherwise set to 'OFF'." 
OFF) 4 | 5 | SET(CMAKE_C_FLAGS "-std=c99") 6 | #SET(src 7 | # THCGeneral.c THCAllocator.c THCStorage.c THCStorageCopy.c THCTensor.c THCTensorCopy.c) 8 | SET(src 9 | THClGeneral.cpp THClStorage.cpp THClStorageCopy.cpp THClTensor.cpp THClTensorCopy.cpp THClTensorMath.cpp 10 | THClTensorMathPointwise.cpp THClReduceApplyUtils.cpp THClApply.cpp 11 | THClTensorMathCompare.cpp THClTensorMathCompareT.cpp 12 | THClTensorMathPairwise.cpp THClTensorMath2.cpp 13 | THClBlas.cpp THClTensorMathBlas.cpp THClBlas.cpp THClReduce.cpp 14 | THClTypeParseTraits.cpp THClReduceAll.cpp THClDeviceUtils.cpp 15 | THClTensorMasked.cpp THClTensorMathTransformReduce.cpp 16 | THClTensorIndex.cpp THClKernels.cpp THClTensorMathScan.cpp THClGather.cpp 17 | THClScatter.cpp ) 18 | set(src-cl) 19 | 20 | message("CLBLAS_INCLUDE_DIRS ${CLBLAS_INCLUDE_DIRS}") 21 | INCLUDE_DIRECTORIES(${CLBLAS_INCLUDE_DIRS}) 22 | message("${CMAKE_CURRENT_SOURCE_DIR}/src/boost-headers-lite") 23 | INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../boost-headers-lite) 24 | 25 | add_definitions(-DCL_USE_DEPRECATED_OPENCL_1_1_APIS) # this affects clew... 26 | 27 | ADD_LIBRARY(THCl SHARED ${src} ${src-cl}) 28 | TARGET_LINK_LIBRARIES(THCl TH ) 29 | #message("DEEPCL_LIBRARIES ${EASYCL_LIBRARIES}") 30 | #TARGET_LINK_LIBRARIES( THCl ${EASYCL_LIBRARIES} ) 31 | target_link_libraries(THCl EasyCL) 32 | target_link_libraries(THCl clew) 33 | TARGET_LINK_LIBRARIES( THCl clBLAS) 34 | #add_dependencies( THCl clBLAS ) 35 | #add_dependencies( THCl EasyCL ) 36 | add_dependencies( THCl EasyCL-external ) 37 | add_dependencies( THCl clBLAS-external ) 38 | 39 | if(DEV_RUN_COG) 40 | add_custom_target( 41 | cog_thcl 42 | python ${CMAKE_CURRENT_SOURCE_DIR}/../EasyCL/thirdparty/cogapp/cog.py -q -I ${CMAKE_CURRENT_SOURCE_DIR}/../EasyCL/cog-batteries -r ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/*.h 43 | WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} 44 | ) 45 | add_dependencies( THCl cog_thcl ) 46 | endif(DEV_RUN_COG) 47 | 48 | INSTALL(FILES 49 | THCl.h 50 | THClGeneral.h 51 | THClBlas.h 52 | THClStorage.h 53 | THClStorageCopy.h 54 | THClTensor.h 55 | THClTensorCopy.h 56 | # THClTensorRandom.h 57 | THClTensorMath.h 58 | # THClTensorConv.h 59 | # THClTensorSort.h 60 | THClApply.h 61 | THClReduce.h 62 | THClReduceApplyUtils.h 63 | THClKernels.h 64 | THClOperators.h 65 | # THClAllocator.h 66 | DESTINATION "${Torch_INSTALL_INCLUDE_SUBDIR}/THCl") 67 | 68 | INSTALL(TARGETS THCl 69 | RUNTIME DESTINATION "${Torch_INSTALL_BIN_SUBDIR}" 70 | LIBRARY DESTINATION "${Torch_INSTALL_LIB_SUBDIR}" 71 | ARCHIVE DESTINATION "${Torch_INSTALL_LIB_SUBDIR}") 72 | 73 | -------------------------------------------------------------------------------- /src/lib/THCl.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_INC 2 | #define THCL_INC 3 | 4 | #include "THClGeneral.h" 5 | //#include "THClAllocator.h" 6 | //#include "THClBlas.h" 7 | #include "THClStorage.h" 8 | #include "THClStorageCopy.h" 9 | #include "THClTensor.h" 10 | #include "THClTensorCopy.h" 11 | //#include "THClTensorRandom.h" 12 | #include "THClTensorMath.h" 13 | //#include "THClTensorConv.h" 14 | //#include "THClTensorSort.h" 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /src/lib/THClApply.cl: -------------------------------------------------------------------------------- 1 | // OpenCL kernels.... 
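// For orientation, a sketch of what this template might expand to for two
// tensors in the dims == -2 case (the branch below that treats a tensor as a
// flat array -- an assumption for this sketch), one scalar, no point tensors,
// and operation "*out = *in1 + val1". Illustrative only; real kernels are
// rendered from this file at runtime, per operation:
//
//   static inline void op(global float *out, global float *in1, float val1) {
//     *out = *in1 + val1;
//   }
//   kernel void THClTensor_pointwiseApplyD(
//       int offset_1, global float *data_1,
//       int offset_2, global float *data_2,
//       float val1, int totalElements) {
//     int linearIndex = get_global_id(0);
//     if(linearIndex < totalElements) {
//       int derived_offset_1 = linearIndex + offset_1;
//       int derived_offset_2 = linearIndex + offset_2;
//       op(&(data_1[derived_offset_1]), &(data_2[derived_offset_2]), val1);
//     }
//   }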
2 | 3 | // expected templated values: 4 | // dims (vector of unique dimension values) 5 | // operation 6 | // dim1 7 | // dim2 8 | // dim3 9 | // ... dimD 10 | // num_input_tensors 11 | // include_scalar_input 12 | // 13 | // maybe should add: 14 | // IndexType (hardcoded to int for now) 15 | // MAX_CUTORCH_DIMS (hardcoded to 25 for now) 16 | 17 | // (Ported from cutorch's THCApply.cuh) 18 | 19 | // Maximum number of dimensions allowed for cutorch 20 | // #define MAX_CUTORCH_DIMS 25 21 | 22 | // Enum that indicates whether tensor arguments are read/write or 23 | // read-only 24 | //enum TensorArgType { ReadWrite, ReadOnly }; 25 | 26 | {% 27 | local total_opsize = num_tensors 28 | if include_scalar_input then 29 | total_opsize = total_opsize + 1 30 | end 31 | %} 32 | 33 | static inline void op( global float *out 34 | {% for t=1,(num_tensors-1) do %} 35 | , global float *in{{t}} 36 | {% end %} 37 | {% for s=1,(num_scalars) do %} 38 | , float val{{s}} 39 | {% end %} 40 | {% for pt=1,num_point_tensors do %} 41 | , global float *pointTensor{{pt}} 42 | {% end %} 43 | ) { 44 | {{operation}}; 45 | } 46 | 47 | kernel void 48 | THClTensor_pointwiseApplyD( 49 | {% for t=1,num_tensors do %} 50 | int offset_{{t}}, 51 | {% local thisdims = loadstring('return dims' .. t)() %} 52 | {% for d=1,thisdims do %} 53 | int size_{{t}}_{{d}}, 54 | int stride_{{t}}_{{d}}, 55 | {% end %} 56 | global float*data_{{t}}, 57 | {% end %} 58 | {% for i=1,num_scalars do %} 59 | float val{{i}}, 60 | {% end %} 61 | {% for i=1,num_point_tensors do %} 62 | global float *pointTensor{{i}}, 63 | {% end %} 64 | int totalElements) { 65 | int linearIndex = get_global_id(0); 66 | if(linearIndex < totalElements ) { 67 | {% if declare_linear_index then %} 68 | int thisLinearId; 69 | {% end %} 70 | {% for t=1,num_tensors do %} 71 | {% local thisdims = loadstring('return dims' .. t)() %} 72 | {% if thisdims == -2 then %} 73 | int derived_offset_{{t}} = linearIndex + offset_{{t}}; 74 | {% else %} 75 | {{IndexType}} derived_offset_{{t}} = offset_{{t}}; 76 | thisLinearId = linearIndex; 77 | {% for d=thisdims,1,-1 do %} // bake this in.... 78 | derived_offset_{{t}} += (thisLinearId % size_{{t}}_{{d}}) * stride_{{t}}_{{d}}; 79 | {% if d > 0 then %} 80 | thisLinearId /= size_{{t}}_{{d}}; 81 | {% end %} 82 | {% end %} 83 | 84 | {% end %} 85 | {% end %} 86 | 87 | op( 88 | {% for t=1,num_tensors do %} 89 | {% if t > 1 then %} , {% end %} 90 | &(data_{{t}}[derived_offset_{{t}}]) 91 | {% end %} 92 | 93 | {% for s=1,num_scalars do %} 94 | , val{{s}} 95 | {% end %} 96 | 97 | {% for pt=1,num_point_tensors do %} 98 | , pointTensor{{pt}} 99 | {% end %} 100 | ); 101 | } 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClApply.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_APPLY_INC 2 | #define THCL_APPLY_INC 3 | 4 | #include "THClGeneral.h" 5 | #include "THClTensor.h" 6 | #include "THClOperators.h" 7 | #include "THClReduceApplyUtils.h" 8 | 9 | // 10 | // This file contains pointwise operation functions and kernels that 11 | // work on both contiguous and non-contiguous tensor arguments of 12 | // arbitrary (up to MAX_CLTORCH_DIMS) dimensioned arguments without 13 | // copying or temporary storage. 
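//
// A typical call site might look like the sketch below. TensorAddOp here is
// hypothetical, and assumes the HasOperator2 interface (declared in
// THClOperators.h, not shown here) exposes the operation as an OpenCL
// expression string in which 'out' and 'in1' name the kernel-side arguments:
//
//   class TensorAddOp : public HasOperator2 {
//   public:
//     std::string operator2() const { return "*out += *in1"; }
//   };
//
//   TensorAddOp op;
//   bool ok = THClTensor_pointwiseApply2(state, dst, src, &op);
//   // a false return means the tensors could not be handled,
//   // e.g. more than MAX_CLTORCH_DIMS dimensions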
14 | // 15 | 16 | int getWorkgroupSize(THClState *state, int device); 17 | dim3 getApplyBlock(THClState *state, int device); 18 | dim3 getApplyGrid(THClState* state, int device, long totalElements); 19 | 20 | bool THClTensor_pointwiseApply1(THClState* state, 21 | THClTensor* a, 22 | const HasOperator1 *op, 23 | TensorArgType aType = ReadWrite); 24 | bool THClTensor_pointwiseApply2(THClState* state, 25 | THClTensor* a, 26 | THClTensor* b, 27 | const HasOperator2 *op, 28 | TensorArgType aType = ReadWrite, 29 | TensorArgType bType = ReadOnly); 30 | bool THClTensor_pointwiseApply3(THClState* state, 31 | THClTensor* a, 32 | THClTensor* b, 33 | THClTensor* c, 34 | const HasOperator3 *op, 35 | TensorArgType aType = ReadWrite, 36 | TensorArgType bType = ReadOnly, 37 | TensorArgType cType = ReadOnly); 38 | 39 | #endif // THCL_APPLY_INC 40 | 41 | -------------------------------------------------------------------------------- /src/lib/THClBlas.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_BLAS_INC 2 | #define THCL_BLAS_INC 3 | 4 | #include "THClGeneral.h" 5 | 6 | //class THClTensor; 7 | struct THClTensor; 8 | class CLWrapper; 9 | 10 | typedef struct THClBlasState { 11 | // cublasHandle_t* handles; 12 | // cublasHandle_t* current_handle; 13 | // int n_devices; 14 | } THClBlasState; 15 | 16 | /* Level 1 */ 17 | THCL_API void THClBlas_swap(THClState *state, long n, float *x, long incx, float *y, long incy); 18 | THCL_API void THClBlas_scal(THClState *state, long n, float a, float *x, long incx); 19 | THCL_API void THClBlas_copy(THClState *state, long n, float *x, long incx, float *y, long incy); 20 | THCL_API void THClBlas_axpy(THClState *state, long n, float a, float *x, long incx, float *y, long incy); 21 | THCL_API float THClBlas_dot(THClState *state, long n, 22 | CLWrapper *xwrapper, long xoffset, long incx, 23 | CLWrapper *ywrapper, long yoffset, long incy); 24 | 25 | /* Level 2 */ 26 | THCL_API void THClBlas_gemv(THClState *state, char trans, long m, long n, float alpha, 27 | THClTensor *a, long lda, 28 | THClTensor *x, long incx, 29 | float beta, 30 | THClTensor *y, long incy); 31 | 32 | THCL_API void THClBlas_ger(THClState *state, long m, long n, float alpha, 33 | THClTensor *x, long incx, 34 | THClTensor *y, long incy, 35 | THClTensor *a, long lda); 36 | 37 | /* Level 3 */ 38 | THCL_API void THClBlas_gemm(THClState *state, char transa, char transb, 39 | long m, long n, long k, float alpha, 40 | THClTensor *a, long lda, THClTensor *b, long ldb, float beta, THClTensor *c, long ldc); 41 | 42 | THCL_API void THClBlas_gemmBatched(THClState *state, char transa, char transb, long m, long n, long k, 43 | float alpha, CLWrapper *aWrapper, long lda, CLWrapper *bWrapper, long ldb, 44 | float beta, CLWrapper *cWrapper, long ldc, long batchCount); 45 | 46 | #endif 47 | 48 | -------------------------------------------------------------------------------- /src/lib/THClDeviceUtils.cl: -------------------------------------------------------------------------------- 1 | static inline {{IndexType}} THClCeilDiv({{IndexType}} a, {{IndexType}} b) { 2 | return (a + b - 1) / b; 3 | } 4 | 5 | -------------------------------------------------------------------------------- /src/lib/THClDeviceUtils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "THClDeviceUtils.h" 4 | 5 | #define DEFINE_THCLCEILDIV(TYPE) \ 6 | TYPE THClCeilDiv(TYPE a, TYPE b) { \ 7 | return (a + b - 1) / b; \ 8 | } 9 | 10 | 
DEFINE_THCLCEILDIV(uint32); 11 | DEFINE_THCLCEILDIV(uint64); 12 | DEFINE_THCLCEILDIV(int32); 13 | DEFINE_THCLCEILDIV(int64); 14 | 15 | //template uint64 THClCeilDiv(uint64 a, uint64 b); 16 | //template uint32 THClCeilDiv(uint32 a, uint32 b); 17 | //template int64 THClCeilDiv(int64 a, int64 b); 18 | //template int32 THClCeilDiv(int32 a, int32 b); 19 | 20 | std::string THClDeviceUtils_getKernelTemplate() { 21 | // [[[cog 22 | // import stringify 23 | // stringify.write_kernel( "kernel", "THClDeviceUtils.cl" ) 24 | // ]]] 25 | // generated using cog, from THClDeviceUtils.cl: 26 | const char * kernelSource = 27 | "static inline {{IndexType}} THClCeilDiv({{IndexType}} a, {{IndexType}} b) {\n" 28 | " return (a + b - 1) / b;\n" 29 | "}\n" 30 | "\n" 31 | ""; 32 | // [[[end]]] 33 | return kernelSource; 34 | } 35 | 36 | 37 | -------------------------------------------------------------------------------- /src/lib/THClDeviceUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_DEVICE_UTILS_INC 2 | #define THCL_DEVICE_UTILS_INC 3 | 4 | #include "THClGeneral.h" 5 | #include 6 | 7 | /** 8 | Computes ceil(a / b) 9 | */ 10 | //template 11 | //T THClCeilDiv(T a, T b); 12 | 13 | #define DECLARE_THCLCEILDIV(TYPE) \ 14 | TYPE THClCeilDiv(TYPE a, TYPE b); 15 | 16 | DECLARE_THCLCEILDIV(uint32); 17 | DECLARE_THCLCEILDIV(uint64); 18 | DECLARE_THCLCEILDIV(int32); 19 | DECLARE_THCLCEILDIV(int64); 20 | 21 | std::string THClDeviceUtils_getKernelTemplate(); 22 | 23 | #endif // THCL_DEVICE_UTILS_INC 24 | 25 | -------------------------------------------------------------------------------- /src/lib/THClGather.cl: -------------------------------------------------------------------------------- 1 | // probably should put this on its own somewhere, so we 2 | // dont have to either ocpy/paste, or include entire THClReduceApplyUtils 3 | typedef struct TensorInfoCl { 4 | unsigned int sizes[{{MAX_CLTORCH_DIMS}}]; 5 | unsigned int strides[{{MAX_CLTORCH_DIMS}}]; 6 | int offset; 7 | int dims; 8 | } TensorInfoCl; 9 | 10 | kernel void THClTensor_kernel_gather( 11 | global TensorInfoCl *dst_info, global float*dst_data, 12 | global const TensorInfoCl *src_info, global float*src_data, 13 | int dim, 14 | global const TensorInfoCl *idx_info, global float*idx_data, 15 | int totalElements 16 | ) 17 | { 18 | for (int _linearId = get_global_id(0); 19 | _linearId < totalElements; 20 | _linearId += get_global_size(0)) { 21 | 22 | // plan is: 23 | // based on our linearIndex, this gets us a spot in the index 24 | // tensor 25 | // this is also a spot in the tgt_data (at least, if we can 26 | // convert into actual coordinates, then it is the coordinates 27 | // in the target tensor 28 | // the coordinates in the source are teh same, except that 29 | // we replace that of dimension dim with the value from 30 | // the index tensor 31 | // 32 | // so, everything hinges on us getting the coordinates, I think? 
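// (concretely: peel coordinates off linearId starting from the last
// dimension, e.g. with sizes {2, 3} a linearId of 4 splits as 4 % 3 = 1,
// then 4 / 3 = 1, giving coordinates (1, 1); each coordinate then
// contributes coord * stride to that tensor's offset -- worked example
// added for orientation)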
33 | // so, lets do that :-) 34 | int idxOffset = idx_info->offset; 35 | int srcOffset = src_info->offset; 36 | int dstOffset = dst_info->offset; 37 | int linearId = _linearId; // copy it, since we'll modify it 38 | // for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbkaed loop for now, to 39 | // get it working 40 | int curDimIndex; 41 | {% for d=dims-1,0,-1 do %} 42 | curDimIndex = linearId % idx_info->sizes[{{d}}]; 43 | idxOffset += curDimIndex * idx_info->strides[{{d}}]; 44 | dstOffset += curDimIndex * dst_info->strides[{{d}}]; 45 | if( {{d}} != dim ) { // this only matters for the source, the others are 46 | // unaffected by which dimension we are on. I think. 47 | srcOffset += curDimIndex * src_info->strides[{{d}}]; 48 | } 49 | linearId /= idx_info->sizes[{{d}}]; 50 | {% end %} 51 | // } 52 | // now we have the idxoffset. get the value at that location 53 | int idxValue = idx_data[idxOffset] - 1; // subtract 1, because 1-based 54 | // then use this to get the final value for srcOffset 55 | srcOffset += idxValue * src_info->strides[dim]; 56 | // get the value... 57 | float value = src_data[srcOffset]; 58 | // and save it up... 59 | dst_data[dstOffset] = value; 60 | // thats it? 61 | } 62 | } 63 | 64 | -------------------------------------------------------------------------------- /src/lib/THClGather.cpp: -------------------------------------------------------------------------------- 1 | #include "THClTensorMath.h" 2 | #include "THClGeneral.h" 3 | //#include "THClBlas.h" 4 | #include "THClTensorCopy.h" 5 | //#include "THClTensorRandom.h" 6 | #include "THClApply.h" 7 | #include "THClReduce.h" 8 | #include "THClKernels.h" 9 | #include "THClReduceApplyUtils.h" 10 | 11 | #include 12 | #include 13 | using namespace std; 14 | 15 | static std::string getTemplate(); 16 | 17 | THCL_API void THClTensor_gather(THClState *state, THClTensor *self, THClTensor *src, long dim, THClTensor *index) { 18 | StatefulTimer::timeCheck("THClTensor_kernel_Gather START"); 19 | 20 | // src will be ndims 21 | // index will be ndims too, though one of the dims should have length 1 22 | // self will be ndims 23 | int nDims = src->nDimension; 24 | 25 | THArgCheck(nDims >= 2, 2, "Tensors should have at least 2 dimensions"); // I guess? 26 | THArgCheck(src->nDimension == nDims, 2, "All tensors should have same number of dims"); 27 | THArgCheck(index->nDimension == nDims, 4, "All tensors should have same number of dims"); 28 | THArgCheck(dim < nDims, 4, "dim out of bounds"); 29 | THArgCheck(dim >= 0, 4, "dim out of bounds"); 30 | THArgCheck(nDims < MAX_CLTORCH_DIMS, 2, "Tensors should have less than %i dimensions", MAX_CLTORCH_DIMS); // I guess? 
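// Semantics, for orientation (a worked example; values made up; index values
// are 1-based, the kernel subtracts 1): each output element takes the source
// element whose coordinate along `dim` is replaced by the corresponding index
// value. For 2D tensors and dim == 0, self[i][j] = src[ index[i][j] ][ j ],
// so src = {{1, 2}, {3, 4}} with index = {{2, 1}, {1, 2}} yields
// self = {{3, 2}, {1, 4}}.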
31 | 32 | // THLongStorage *newSize; 33 | 34 | for( int i = 0; i < nDims; i++ ) { 35 | if( i != dim ) { 36 | THArgCheck(THClTensor_size(state, src, i) == THClTensor_size(state, index, i), 3, ("index tensor must have same dimensions as source tensor, but dimension " + easycl::toString(i) + " doesnt match").c_str()); 37 | } 38 | } 39 | 40 | const int device = src->storage->device; 41 | 42 | // hmmm ,I wonder if we need this any more, after migration to TensorMath.lua 43 | // if( self != src ) { 44 | // newSize = THLongStorage_newWithSize(index->nDimension); 45 | // THLongStorage_rawCopy(newSize, index->size); 46 | // THClTensor_resize(state, self, newSize, NULL); 47 | // THLongStorage_free(newSize); 48 | // } 49 | 50 | // since self is write-only, and index and src are read-only, ie none are read-write 51 | // so, we dnot need to worry about contiguity (at least, not from point of view of correctness) 52 | 53 | 54 | std::string uniqueName = __FILE__ ":gather:" + easycl::toString(nDims); 55 | EasyCL *cl = THClTensor_getCl(state, src); 56 | CLKernel *kernel = 0; 57 | if(cl->kernelExists(uniqueName)) { 58 | kernel = cl->getKernel(uniqueName); 59 | StatefulTimer::timeCheck("Apply3 1aa"); 60 | } else { 61 | TemplatedKernel kernelBuilder(cl); 62 | kernelBuilder.set("IndexType", "unsigned int"); 63 | kernelBuilder.set("dims", nDims); 64 | kernelBuilder.set("MAX_CLTORCH_DIMS", MAX_CLTORCH_DIMS); 65 | kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getTemplate(), "THClTensor_kernel_gather" ); 66 | } 67 | 68 | TensorInfoCl selfInfoCl(self); 69 | TensorInfoCl srcInfoCl(src); 70 | TensorInfoCl indexInfoCl(index); 71 | 72 | const dim3 block = getApplyBlock(state, device); 73 | 74 | long totalElements = THClTensor_nElement(state, index); 75 | dim3 grid = getApplyGrid(state, device, totalElements); 76 | 77 | THClKernels k(state, kernel); 78 | kernel->in(1, &selfInfoCl); 79 | kernel->out(self->storage->wrapper); 80 | kernel->in(1, &srcInfoCl); 81 | kernel->in(src->storage->wrapper); 82 | k.in((int)dim); 83 | kernel->in(1, &indexInfoCl); 84 | kernel->in(index->storage->wrapper); 85 | if( totalElements > ( 1l << 30 )) { 86 | throw std::runtime_error("Error: out of bounds for totalelements=" + easycl::toString(totalElements)); 87 | } 88 | k.in( (int)totalElements ); 89 | k.run(grid, block); 90 | 91 | if(state->addFinish) cl->finish(); 92 | StatefulTimer::timeCheck("THClTensor_kernel_Gather END"); 93 | } 94 | 95 | static std::string getTemplate() { 96 | // [[[cog 97 | // import stringify 98 | // stringify.write_kernel( "kernel", "THClGather.cl" ) 99 | // ]]] 100 | // generated using cog, from THClGather.cl: 101 | const char * kernelSource = 102 | "// probably should put this on its own somewhere, so we\n" 103 | "// dont have to either ocpy/paste, or include entire THClReduceApplyUtils\n" 104 | "typedef struct TensorInfoCl {\n" 105 | " unsigned int sizes[{{MAX_CLTORCH_DIMS}}];\n" 106 | " unsigned int strides[{{MAX_CLTORCH_DIMS}}];\n" 107 | " int offset;\n" 108 | " int dims;\n" 109 | "} TensorInfoCl;\n" 110 | "\n" 111 | "kernel void THClTensor_kernel_gather(\n" 112 | " global TensorInfoCl *dst_info, global float*dst_data,\n" 113 | " global const TensorInfoCl *src_info, global float*src_data,\n" 114 | " int dim,\n" 115 | " global const TensorInfoCl *idx_info, global float*idx_data,\n" 116 | " int totalElements\n" 117 | ")\n" 118 | "{\n" 119 | " for (int _linearId = get_global_id(0);\n" 120 | " _linearId < totalElements;\n" 121 | " _linearId += get_global_size(0)) {\n" 122 | "\n" 123 | " // plan is:\n" 
124 | " // based on our linearIndex, this gets us a spot in the index\n" 125 | " // tensor\n" 126 | " // this is also a spot in the tgt_data (at least, if we can\n" 127 | " // convert into actual coordinates, then it is the coordinates\n" 128 | " // in the target tensor\n" 129 | " // the coordinates in the source are teh same, except that\n" 130 | " // we replace that of dimension dim with the value from\n" 131 | " // the index tensor\n" 132 | " //\n" 133 | " // so, everything hinges on us getting the coordinates, I think?\n" 134 | " // so, lets do that :-)\n" 135 | " int idxOffset = idx_info->offset;\n" 136 | " int srcOffset = src_info->offset;\n" 137 | " int dstOffset = dst_info->offset;\n" 138 | " int linearId = _linearId; // copy it, since we'll modify it\n" 139 | "// for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbkaed loop for now, to\n" 140 | " // get it working\n" 141 | " int curDimIndex;\n" 142 | " {% for d=dims-1,0,-1 do %}\n" 143 | " curDimIndex = linearId % idx_info->sizes[{{d}}];\n" 144 | " idxOffset += curDimIndex * idx_info->strides[{{d}}];\n" 145 | " dstOffset += curDimIndex * dst_info->strides[{{d}}];\n" 146 | " if( {{d}} != dim ) { // this only matters for the source, the others are\n" 147 | " // unaffected by which dimension we are on. I think.\n" 148 | " srcOffset += curDimIndex * src_info->strides[{{d}}];\n" 149 | " }\n" 150 | " linearId /= idx_info->sizes[{{d}}];\n" 151 | " {% end %}\n" 152 | "// }\n" 153 | " // now we have the idxoffset. get the value at that location\n" 154 | " int idxValue = idx_data[idxOffset] - 1; // subtract 1, because 1-based\n" 155 | " // then use this to get the final value for srcOffset\n" 156 | " srcOffset += idxValue * src_info->strides[dim];\n" 157 | " // get the value...\n" 158 | " float value = src_data[srcOffset];\n" 159 | " // and save it up...\n" 160 | " dst_data[dstOffset] = value;\n" 161 | " // thats it?\n" 162 | " }\n" 163 | "}\n" 164 | "\n" 165 | ""; 166 | // [[[end]]] 167 | return kernelSource; 168 | } 169 | 170 | -------------------------------------------------------------------------------- /src/lib/THClGeneral.cpp: -------------------------------------------------------------------------------- 1 | #include "THClGeneral.h" 2 | #include "TH.h" 3 | 4 | #include 5 | #include "EasyCL.h" 6 | #include 7 | #include "DeviceInfo.h" 8 | 9 | //using namespace easycl; 10 | 11 | //#include "THCTensorRandom.h" 12 | //#include "THCBlas.h" 13 | //#include "THCAllocator.h" 14 | 15 | /* Size of scratch space available in global memory per each SM + stream */ 16 | #define FLOATS_PER_SCRATCH_SPACE 4 17 | #define GLOBAL_SCRATCH_SPACE_PER_SM_STREAM (FLOATS_PER_SCRATCH_SPACE) * sizeof(float) 18 | 19 | void THCl_initializeState(THClState *state) { 20 | if(state->initialized) { 21 | return; 22 | } 23 | state->initialized = 1; 24 | if(state->allowNonGpus) { 25 | state->allocatedDevices = easycl::DevicesInfo::getNumDevices(); 26 | } else { 27 | state->allocatedDevices = easycl::DevicesInfo::getNumGpus(); 28 | } 29 | state->clByDevice = new EasyCL *[state->allocatedDevices]; 30 | state->scratchSpaceByDevice = new THClScratchSpace *[state->allocatedDevices]; 31 | state->trace = 0; 32 | state->detailedTimings = 0; 33 | state->addFinish = 0; 34 | // state->workgroupSizeByDevice = new int[state->allocatedDevices]; 35 | state->deviceInfoByDevice = (DeviceInfo **)new easycl::DeviceInfo *[state->allocatedDevices]; 36 | for(int i = 0; i < state->allocatedDevices; i++) { 37 | state->clByDevice[i] = 0; 38 | state->scratchSpaceByDevice[i] = 0; 39 | 
state->deviceInfoByDevice[i] = 0; 40 | } 41 | state->currentDevice = 0; 42 | //state->cl = EasyCL::createForFirstGpuOtherwiseCpu(); // obviously this should change... 43 | 44 | cl_int err; 45 | 46 | err = clblasSetup(); 47 | if (err != CL_SUCCESS) { 48 | THError("clblasSetup() failed with %d", err); 49 | } 50 | } 51 | void THClSetAllowNonGpus(THClState *state, int allowNonGpus) { 52 | if(state->initialized) { 53 | THError("cannot set allowNonGpus after initialization done"); 54 | } else { 55 | state->allowNonGpus = allowNonGpus; 56 | } 57 | } 58 | void THClInit(THClState* state) 59 | { 60 | state->initialized = 0; 61 | state->allowNonGpus = 0; 62 | state->trace = 0; 63 | state->detailedTimings = 0; 64 | state->addFinish = 0; 65 | state->currentDevice = 0; 66 | state->allocatedDevices = 0; 67 | 68 | state->clByDevice = 0; 69 | state->scratchSpaceByDevice = 0; 70 | state->deviceInfoByDevice = 0; 71 | } 72 | 73 | void THClShutdown(THClState* state) 74 | { 75 | if(state->initialized == 0) { 76 | return; 77 | } 78 | 79 | clblasTeardown(); 80 | for( int i = 0; i < state->allocatedDevices; i++ ) { 81 | delete state->clByDevice[i]; 82 | delete state->scratchSpaceByDevice[i]->wrapper; 83 | delete[] state->scratchSpaceByDevice[i]->data; 84 | delete (easycl::DeviceInfo*)state->deviceInfoByDevice[i]; 85 | } 86 | delete[] (easycl::DeviceInfo**)state->deviceInfoByDevice; 87 | delete[] state->clByDevice; 88 | delete[] state->scratchSpaceByDevice; 89 | state->initialized = 0; 90 | // delete[] state->workgroupSizeByDevice 91 | 92 | printf("THClShutdown() done\n"); 93 | printf("*******************************************\n"); 94 | } 95 | 96 | std::ostream &operator<<( std::ostream &os, const dim3 &obj ) { 97 | os << "dim3{" << obj.vec[0] << ", " << obj.vec[1] << ", " << obj.vec[2] << "}"; 98 | return os; 99 | } 100 | 101 | int THClState_getNumDevices(THClState* state) { 102 | if(state->initialized == 0) { 103 | THCl_initializeState(state); 104 | } 105 | return state->allocatedDevices; 106 | } 107 | void THClState_setDevice(THClState* state, int device) { 108 | if(state->initialized == 0) { 109 | THCl_initializeState(state); 110 | } 111 | state->currentDevice = device; 112 | } 113 | int THClState_getDevice(THClState* state) { 114 | if(state->initialized == 0) { 115 | THCl_initializeState(state); 116 | } 117 | return state->currentDevice; 118 | } 119 | EasyCL *THClState_getCl(THClState* state ) { 120 | if(state->initialized == 0) { 121 | THCl_initializeState(state); 122 | } 123 | return THClState_getClv2(state, state->currentDevice); 124 | } 125 | EasyCL *THClState_getCl(THClState* state, int *p_device) { 126 | if(state->initialized == 0) { 127 | THCl_initializeState(state); 128 | } 129 | if( p_device != 0 ) { 130 | *p_device = state->currentDevice; 131 | } 132 | return THClState_getClv2(state, state->currentDevice); 133 | } 134 | EasyCL *THClState_getClv2(THClState* state, int device) { 135 | if(!state->initialized) { 136 | THCl_initializeState(state); 137 | } 138 | if(state->allocatedDevices == 0) { 139 | THError("No OpenCL-enabled devices available"); 140 | } 141 | if(state->currentDevice >= state->allocatedDevices || state->currentDevice < 0) { 142 | THError("Please use setDevice to choose an available device first"); 143 | } 144 | if( state->clByDevice[device] == 0 ) { 145 | EasyCL *cl = 0; 146 | if(state->allowNonGpus) { 147 | cl = EasyCL::createForIndexedDevice(device); 148 | } else { 149 | cl = EasyCL::createForIndexedGpu(device); 150 | } 151 | state->clByDevice[device] = cl; 152 | THClScratchSpace 
*scratch = new THClScratchSpace(); 153 | scratch->data = new float[FLOATS_PER_SCRATCH_SPACE]; 154 | scratch->wrapper = cl->wrap(FLOATS_PER_SCRATCH_SPACE, scratch->data); 155 | scratch->wrapper->createOnDevice(); 156 | state->scratchSpaceByDevice[device] = scratch; 157 | state->deviceInfoByDevice[device] = (DeviceInfo *)new easycl::DeviceInfo(); 158 | if(state->allowNonGpus) { 159 | *((easycl::DeviceInfo *)state->deviceInfoByDevice[device]) = easycl::DevicesInfo::getDeviceInfo( device ); 160 | } else { 161 | *((easycl::DeviceInfo *)state->deviceInfoByDevice[device]) = easycl::DevicesInfo::getGpuInfo( device ); 162 | } 163 | } 164 | return state->clByDevice[device]; 165 | } 166 | 167 | THClScratchSpace* THClState_getDeviceScratchSpace(THClState* state, int device, int stream) 168 | { 169 | if(state->initialized == 0) { 170 | THCl_initializeState(state); 171 | } 172 | if( stream != 0 ) { 173 | THError("%d is not a stream", stream); 174 | } 175 | return state->scratchSpaceByDevice[device]; 176 | } 177 | 178 | size_t THClState_getCurrentDeviceScratchSpaceSize(THClState* state) 179 | { 180 | if(state->initialized == 0) { 181 | THCl_initializeState(state); 182 | } 183 | int device = state->currentDevice; 184 | return THClState_getDeviceScratchSpaceSize(state, device); 185 | } 186 | 187 | size_t THClState_getDeviceScratchSpaceSize(THClState* state, int device) 188 | { 189 | if(state->initialized == 0) { 190 | THCl_initializeState(state); 191 | } 192 | 193 | return GLOBAL_SCRATCH_SPACE_PER_SM_STREAM; // true currently since we only have 194 | // one stream per device, currently 195 | } 196 | 197 | -------------------------------------------------------------------------------- /src/lib/THClGeneral.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_GENERAL_INC 2 | #define THCL_GENERAL_INC 3 | 4 | #include "THGeneral.h" 5 | #include "THAllocator.h" 6 | #undef log1p 7 | 8 | #ifdef __cplusplus 9 | # define THCL_EXTERNC extern "C" 10 | # define THCL_EXTERNCPP extern 11 | #else 12 | # define THCL_EXTERNC extern 13 | #endif 14 | 15 | #ifdef WIN32 16 | # ifdef THCL_EXPORTS 17 | # define THCL_API THCL_EXTERNC __declspec(dllexport) 18 | # else 19 | # define THCL_API THCL_EXTERNC __declspec(dllimport) 20 | # endif 21 | #else 22 | # define THCL_API THCL_EXTERNC 23 | #endif 24 | 25 | #ifdef __cplusplus 26 | #ifdef WIN32 27 | #ifdef THCL_EXPORTS 28 | #define THCL_API_CPP THCL_EXTERNCPP __declspec(dllexport) 29 | #else 30 | #define THCL_API_CPP THCL_EXTERNCPP __declspec(dllimport) 31 | #endif 32 | #else 33 | #define THCL_API_CPP THCL_EXTERNCPP 34 | #endif 35 | #endif 36 | 37 | //// from http://stackoverflow.com/questions/295120/c-mark-as-deprecated 38 | //#ifdef __GNUC__ 39 | //#define DEPRECATED __attribute__((deprecated)) 40 | //#elif defined(_MSC_VER) 41 | //#define DEPRECATED __declspec(deprecated) 42 | //#else 43 | //#pragma message("WARNING: You need to implement DEPRECATED for this compiler") 44 | //#define DEPRECATED 45 | //#endif 46 | 47 | #ifdef __GNUC__ 48 | #define DEPRECATED_POST __attribute__((deprecated)) 49 | #endif 50 | 51 | #ifdef __cplusplus 52 | //#define PTR_CLASS class* 53 | #define PTR_EASYCL EasyCL* 54 | #define PTR_DEVICEINFO DeviceInfo* 55 | #define PTR_CLWRAPPER CLWrapper* 56 | class EasyCL; 57 | class CLWrapper; 58 | class DeviceInfo; 59 | #else 60 | //#define PTR_CLASS struct* 61 | #define PTR_EASYCL struct EasyCL* 62 | #define PTR_DEVICEINFO struct DeviceInfo* 63 | #define PTR_CLWRAPPER struct CLWrapper* 64 | #endif 65 | 66 | #ifdef 
__cplusplus 67 | #include 68 | #endif // __cplusplus 69 | 70 | typedef struct THClScratchSpace { 71 | PTR_CLWRAPPER wrapper; 72 | float *data; 73 | } THClScratchSpace; 74 | 75 | /* Global state to be held in the cltorch table. */ 76 | typedef struct THClState 77 | { 78 | int initialized; 79 | int allowNonGpus; 80 | int allocatedDevices; 81 | int currentDevice; 82 | int trace; // default 0; set to 1 to see message for every gpu buffer alloc, delete, 83 | // or device <-> host transfer 84 | int addFinish; // default 0, should we add clFinish() after any kernel, enqueue, etc? 85 | // (good for debugging stuff, bad for perf) 86 | int detailedTimings; 87 | struct THClScratchSpace**scratchSpaceByDevice; // for now, do one 'stream' per device 88 | // can improve later... 89 | PTR_DEVICEINFO *deviceInfoByDevice; 90 | // int *workgroupSizeByDevice; 91 | PTR_EASYCL *clByDevice; 92 | // EasyCL *getCl(); 93 | } THClState; 94 | 95 | THCL_API void THCl_initializeState(THClState* state); 96 | THCL_API void THClSetAllowNonGpus(THClState *state, int allowNonGpus); 97 | THCL_API void THClInit(THClState* state); 98 | THCL_API void THClShutdown(THClState* state); 99 | //THCL_API void THClEnablePeerToPeerAccess(THClState* state); 100 | 101 | /* State manipulators and accessors */ 102 | THCL_API int THClState_getNumDevices(THClState* state); 103 | THCL_API void THClState_setDevice(THClState* state, int device); 104 | THCL_API int THClState_getDevice(THClState* state); 105 | THCL_API PTR_EASYCL THClState_getCl(THClState* state) DEPRECATED_POST; 106 | THCL_API PTR_EASYCL THClState_getClAndDevice(THClState* state, int *p_device) DEPRECATED_POST; 107 | THCL_API PTR_EASYCL THClState_getClv2(THClState* state, int device); 108 | 109 | //THCL_API void THClState_reserveStreams(THClState* state, int numStreams); 110 | //THCL_API int THClState_getNumStreams(THClState* state); 111 | 112 | //THCL_API cudaStream_t THClState_getDeviceStream(THClState *state, int device, int stream); 113 | //THCL_API cudaStream_t THClState_getCurrentStream(THClState *state); 114 | //THCL_API int THClState_getCurrentStreamIndex(THClState *state); 115 | //THCL_API void THClState_setStream(THClState *state, int device, int stream); 116 | //THCL_API void THClState_setStreamForCurrentDevice(THClState *state, int stream); 117 | 118 | //THCL_API void THClState_reserveBlasHandles(THClState* state, int numHandles); 119 | //THCL_API int THClState_getNumBlasHandles(THClState* state); 120 | 121 | //THCL_API clblasHandle_t THClState_getDeviceBlasHandle(THClState *state, int device, int handle); 122 | //THCL_API clblasHandle_t THClState_getCurrentBlasHandle(THClState *state); 123 | //THCL_API int THClState_getCurrentBlasHandleIndex(THClState *state); 124 | //THCL_API void THClState_setBlasHandle(THClState *state, int device, int handle); 125 | //THCL_API void THClState_setBlasHandleForCurrentDevice(THClState *state, int handle); 126 | 127 | /* For the current device and stream, returns the allocated scratch space */ 128 | THCL_API struct THClScratchSpace* THClState_getCurrentDeviceScratchSpace(THClState* state) DEPRECATED_POST; 129 | THCL_API struct THClScratchSpace* THClState_getDeviceScratchSpace(THClState* state, int device, int stream); 130 | THCL_API size_t THClState_getCurrentDeviceScratchSpaceSize(THClState* state) DEPRECATED_POST; 131 | THCL_API size_t THClState_getDeviceScratchSpaceSize(THClState* state, int device); 132 | 133 | //#define THClCheck(err) __THClCheck(err, __FILE__, __LINE__) 134 | //#define THCublasCheck(err) __THCublasCheck(err, 
__FILE__, __LINE__) 135 | 136 | //THCL_API void __THClCheck(cudaError_t err, const char *file, const int line); 137 | //THCL_API void __THCublasCheck(clblasStatus_t status, const char *file, const int line); 138 | 139 | typedef unsigned long long uint64; 140 | typedef unsigned int uint32; 141 | typedef long long int64; 142 | typedef int int32; 143 | 144 | // define dim3, since this came from cuda in cutorch 145 | #ifdef __cplusplus 146 | class dim3 { 147 | public: 148 | uint32 vec[3]; 149 | size_t vec_for_cl[3]; 150 | // size_t vec_size_t[3]; 151 | dim3() { 152 | vec[0] = 1; 153 | vec[1] = 1; 154 | vec[2] = 1; 155 | } 156 | dim3( uint32 x ) { 157 | vec[0] = x; 158 | vec[1] = 1; 159 | vec[2] = 1; 160 | } 161 | dim3( uint32 x, uint32 y ) { 162 | vec[0] = x; 163 | vec[1] = y; 164 | vec[2] = 1; 165 | } 166 | dim3( uint32 x, uint32 y, uint32 z ) { 167 | vec[0] = x; 168 | vec[1] = y; 169 | vec[2] = z; 170 | } 171 | inline uint32 x() { 172 | return vec[0]; 173 | } 174 | inline uint32 y() { 175 | return vec[1]; 176 | } 177 | inline uint32 z() { 178 | return vec[2]; 179 | } 180 | size_t const *as_size_t() { 181 | for( int i = 0; i < 3; i++ ) { 182 | vec_for_cl[i] = vec[i]; 183 | } 184 | return vec_for_cl; 185 | } 186 | }; 187 | 188 | std::ostream &operator<<( std::ostream &os, const dim3 &obj ); 189 | 190 | //typedef struct _dim3 { 191 | // int x; 192 | // int y; 193 | // int z; 194 | // _dim3( int x ) { 195 | // this->x = x; 196 | // y = 1; 197 | // z = 1; 198 | // } 199 | //} dim3; 200 | #endif // __cplusplus 201 | 202 | // seems that min is really inconsistent across standard libraires, lets just make our own ... :-/ 203 | static inline int THCl_min( int a, int b ) { 204 | return a < b ? a : b; 205 | } 206 | 207 | #endif 208 | 209 | -------------------------------------------------------------------------------- /src/lib/THClKernels.cpp: -------------------------------------------------------------------------------- 1 | #include "THClKernels.h" 2 | #include "EasyCL.h" 3 | #include "THClTensor.h" 4 | #include 5 | #include "THClReduceApplyUtils.h" 6 | #include "CLKernel_structs.h" 7 | 8 | #include 9 | using namespace std; 10 | 11 | // Constructor 12 | THClKernels::THClKernels(THClState *state, CLKernel *kernel) : 13 | state(state), 14 | kernel(kernel) { 15 | } 16 | THClKernels::~THClKernels() { 17 | for( int i = 0; i < (int)tensorInfoCls.size(); i++ ) { 18 | delete tensorInfoCls[i]; 19 | } 20 | } 21 | // CLTensors ===================== 22 | THClKernels *THClKernels::in(THClTensor *tensor) { 23 | try { 24 | kernel->in(THClTensor_wrapper(state, tensor)); 25 | kernel->in((int)THClTensor_storageOffset(state, tensor)); 26 | } catch( runtime_error &e ) { 27 | THError(e.what()); 28 | } 29 | return this; 30 | } 31 | THClKernels *THClKernels::inout(THClTensor *tensor) { 32 | try { 33 | kernel->inout(THClTensor_wrapper(state, tensor)); 34 | kernel->in((int)THClTensor_storageOffset(state, tensor)); 35 | } catch( runtime_error &e ) { 36 | THError(e.what()); 37 | } 38 | return this; 39 | } 40 | THClKernels *THClKernels::out(THClTensor *tensor) { 41 | try { 42 | kernel->out(THClTensor_wrapper(state, tensor)); 43 | kernel->in((int)THClTensor_storageOffset(state, tensor)); 44 | } catch( runtime_error &e ) { 45 | THError(e.what()); 46 | } 47 | return this; 48 | } 49 | // CLTensors v2 ===================== 50 | THClKernels *THClKernels::inv2(THClTensor *tensor) { 51 | try { 52 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensor); 53 | kernel->in(1, tensorInfoCl); 54 | kernel->in(THClTensor_wrapper(state, 
tensor)); 55 | tensorInfoCls.push_back(tensorInfoCl); 56 | } catch( runtime_error &e ) { 57 | THError(e.what()); 58 | } 59 | return this; 60 | } 61 | THClKernels *THClKernels::inoutv2(THClTensor *tensor) { 62 | try { 63 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensor); 64 | kernel->in(1, tensorInfoCl); 65 | kernel->inout(THClTensor_wrapper(state, tensor)); 66 | tensorInfoCls.push_back(tensorInfoCl); 67 | } catch( runtime_error &e ) { 68 | THError(e.what()); 69 | } 70 | return this; 71 | } 72 | THClKernels *THClKernels::outv2(THClTensor *tensor) { 73 | try { 74 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensor); 75 | kernel->in(1, tensorInfoCl); 76 | kernel->out(THClTensor_wrapper(state, tensor)); 77 | tensorInfoCls.push_back(tensorInfoCl); 78 | } catch( runtime_error &e ) { 79 | THError(e.what()); 80 | } 81 | return this; 82 | } 83 | // scalars ================== 84 | THClKernels *THClKernels::in(int value) { 85 | try { 86 | kernel->in(value); 87 | } catch( runtime_error &e ) { 88 | THError(e.what()); 89 | } 90 | return this; 91 | } 92 | THClKernels *THClKernels::in(float value) { 93 | try { 94 | kernel->in(value); 95 | } catch( runtime_error &e ) { 96 | THError(e.what()); 97 | } 98 | return this; 99 | } 100 | // CLTensorInfos ================ 101 | template< typename IndexType > 102 | THClKernels *THClKernels::in(TensorInfotensorInfo) { 103 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensorInfo); 104 | kernel->in(1, tensorInfoCl); 105 | kernel->in(tensorInfo.wrapper); 106 | tensorInfoCls.push_back(tensorInfoCl); 107 | return this; 108 | } 109 | template< typename IndexType > 110 | THClKernels *THClKernels::inout(TensorInfotensorInfo) { 111 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensorInfo); 112 | kernel->in(1, tensorInfoCl); 113 | kernel->inout(tensorInfo.wrapper); 114 | tensorInfoCls.push_back(tensorInfoCl); 115 | return this; 116 | } 117 | template< typename IndexType > 118 | THClKernels *THClKernels::out(TensorInfotensorInfo) { 119 | TensorInfoCl *tensorInfoCl = new TensorInfoCl(tensorInfo); 120 | if( !tensorInfo.wrapper->isOnDevice() ) { 121 | tensorInfo.wrapper->createOnDevice(); 122 | } 123 | kernel->in(1, tensorInfoCl); 124 | kernel->out(tensorInfo.wrapper); 125 | tensorInfoCls.push_back(tensorInfoCl); 126 | return this; 127 | } 128 | // CLWrapper =============== 129 | THClKernels *THClKernels::in(CLWrapper *wrapper) { 130 | try { 131 | kernel->in(wrapper); 132 | } catch( runtime_error &e ) { 133 | THError(e.what()); 134 | } 135 | return this; 136 | } 137 | THClKernels *THClKernels::inout(CLWrapper *wrapper) { 138 | try { 139 | kernel->inout(wrapper); 140 | } catch( runtime_error &e ) { 141 | THError(e.what()); 142 | } 143 | return this; 144 | } 145 | THClKernels *THClKernels::out(CLWrapper *wrapper) { 146 | try { 147 | if( !wrapper->isOnDevice() ) { 148 | wrapper->createOnDevice(); 149 | } 150 | kernel->out(wrapper); 151 | } catch( runtime_error &e ) { 152 | THError(e.what()); 153 | } 154 | return this; 155 | } 156 | void THClKernels::run(dim3 grid, dim3 block) { 157 | dim3 global_ws; 158 | for( int i = 0; i < 3; i++ ) { 159 | global_ws.vec[i] = grid.vec[i] * block.vec[i]; 160 | } 161 | try { 162 | kernel->run(3, global_ws.as_size_t(), block.as_size_t()); 163 | } catch( runtime_error &e ) { 164 | cout << e.what() << endl; 165 | THError(e.what()); 166 | } 167 | } 168 | // locals ================== 169 | THClKernels *THClKernels::localFloats(int count) { 170 | try { 171 | kernel->localFloats(count); 172 | } catch( runtime_error &e ) { 173 | 
THError(e.what()); 174 | } 175 | return this; 176 | } 177 | 178 | // template instantiations ==================== 179 | #define DECLARE_THCLKERNELS(IndexType) \ 180 | template \ 181 | THClKernels *THClKernels::in(TensorInfo<IndexType> tensorInfo); \ 182 | template \ 183 | THClKernels *THClKernels::inout(TensorInfo<IndexType> tensorInfo); \ 184 | template \ 185 | THClKernels *THClKernels::out(TensorInfo<IndexType> tensorInfo); 186 | 187 | DECLARE_THCLKERNELS(uint32); 188 | DECLARE_THCLKERNELS(uint64); 189 | 190 | template CLKernel *CLKernel::in<>(int N, const TensorInfoCl *data); 191 | template CLKernel *CLKernel::inout<>(int N, const TensorInfoCl *data); 192 | template CLKernel *CLKernel::out<>(int N, const TensorInfoCl *data); 193 | 194 | -------------------------------------------------------------------------------- /src/lib/THClKernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include <vector> 4 | 5 | //class THClState; 6 | class CLKernel; 7 | //class THClTensor; 8 | class CLWrapper; 9 | 10 | #include "THClGeneral.h" 11 | #include "THClReduceApplyUtils.h" 12 | 13 | // inty types 14 | // ========== 15 | // 16 | // this uses ints for all the long/int type things 17 | // we can create a new version later that uses longs 18 | // for now, you need to make sure that anything inty in the kernel parameters is an int, not a long etc 19 | // 20 | // Passing THClTensors 21 | // =================== 22 | // - when you are passing in a tensor, you need two parameters for each tensor, 23 | // in the kernel, e.g. let's say in the CUDA code there is a kernel parameter 24 | // 25 | // __global__ foo( float *src, ... 26 | // 27 | // This will become, in our kernel: 28 | // 29 | // kernel foo( global float *src_data, int src_offset, ... 30 | // 31 | // that's it :-) now just use an object of this class to pass in the data 32 | // oh, and in the kernel, when you use the src_data object, make sure to 33 | // add the offset. Like: 34 | // 35 | // src[i] 36 | // 37 | // ... in the CUDA becomes: 38 | // 39 | // src_data[src_offset + i] 40 | // 41 | // ... in the OpenCL 42 | // 43 | // Passing THClTensorInfos 44 | // ======================= 45 | // 46 | // On the receiving side, there need to be two global parameters, i.e. if in the CUDA kernel 47 | // we have: 48 | // 49 | // __global__ foo(THClTensorInfo mytensor, ...) 50 | // 51 | // in the OpenCL kernel, we will have: 52 | // 53 | // kernel foo(global THClTensorInfoCl *mytensor_info, global float *mytensor_data, ...) 
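// A minimal illustrative sketch of the host-side call pattern for this
// convention (the kernel name "foo" and the tensor/scalar names here are
// hypothetical, not code from this repo):
//
//   THClKernels k(state, kernel);
//   k.inv2(mytensor);    // appends mytensor_info struct + mytensor_data buffer
//   k.in(n);             // scalar ints/floats follow as plain kernel arguments
//   k.run(grid, block);  // grid/block are cutorch-style dim3 values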
54 | // 55 | // You'll also need to define THClTensorInfoCl struct in your kernel, eg by including 56 | // code from include_THClReduceApplyUtils.cl, see THClApply.h for an example 57 | // 58 | // in, inout, out 59 | // ============== 60 | // Note on difference between 'in', 'out', 'inout': 61 | // - 'inout' and 'out' will mark the CLWrapper gpu buffer as 'dirty', 62 | // needing to be 63 | // copied to host, if we want to work on host-side 64 | // - 'out' will allocate the CLWrapper device-side buffer, if not already 65 | // allocated (in and inout will throw an error, if not allocated on device-side 66 | // already) 67 | class THClKernels { 68 | THClState *state; 69 | CLKernel *kernel; 70 | 71 | std::vector< TensorInfoCl * >tensorInfoCls; 72 | 73 | public: 74 | THClKernels(THClState *state, CLKernel *kernel); 75 | ~THClKernels(); 76 | 77 | THClKernels *in(THClTensor *tensor); 78 | THClKernels *inout(THClTensor *tensor); 79 | THClKernels *out(THClTensor *tensor); 80 | 81 | THClKernels *inv2(THClTensor *tensor); // expects kernel parameters as `global struct THClTensorInfoCl *a_info, global float *a_data` 82 | THClKernels *inoutv2(THClTensor *tensor); 83 | THClKernels *outv2(THClTensor *tensor); 84 | 85 | template< typename IndexType > 86 | THClKernels *in(TensorInfotensorInfo); 87 | template< typename IndexType > 88 | THClKernels *inout(TensorInfotensorInfo); 89 | template< typename IndexType > 90 | THClKernels *out(TensorInfotensorInfo); 91 | 92 | THClKernels *in(CLWrapper *wrapper); 93 | THClKernels *inout(CLWrapper *wrapper); 94 | THClKernels *out(CLWrapper *wrapper); 95 | 96 | THClKernels *in(int value); 97 | THClKernels *in(float value); 98 | 99 | THClKernels *localFloats(int count); 100 | 101 | void run(dim3 grid, dim3 block); // uses cutorch-compatible dimensions 102 | }; 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClOperators.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | 5 | #include 6 | 7 | class OpBase { 8 | public: 9 | virtual std::string getName() const { return "OpBase"; } // just to make this class polymorphic 10 | }; 11 | 12 | class HasScalars : public OpBase { 13 | public: 14 | virtual int getNumScalars() const = 0; 15 | virtual float getScalar(int index) const = 0; 16 | }; 17 | 18 | class HasOperator1 : public OpBase { 19 | public: 20 | virtual std::string operator1() const = 0; 21 | }; 22 | 23 | class HasOperator2 : public OpBase { 24 | public: 25 | virtual std::string operator2() const = 0; 26 | }; 27 | 28 | class HasOperator3 : public OpBase { 29 | public: 30 | virtual std::string operator3() const = 0; 31 | }; 32 | 33 | class HasGlobalTensors { 34 | public: 35 | virtual int getNumGlobalTensors() const = 0; 36 | virtual THClTensor *getTensor(int index) const = 0; 37 | virtual std::string getTensorName(int index) const = 0; 38 | }; 39 | 40 | class HasPointTensors { 41 | public: 42 | virtual int getNumPointTensors() const = 0; 43 | virtual const THClTensor *getPointTensor(int index) const = 0; 44 | }; 45 | 46 | #endif // __cplusplus 47 | 48 | -------------------------------------------------------------------------------- /src/lib/THClReduce.cl: -------------------------------------------------------------------------------- 1 | // Threads per thread block 2 | #define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 3 | 4 | static inline float modifyOp(float _in1) { 5 | float _out; 6 | float *in1 = &_in1; 7 | float *out = &_out; 8 | 
{{modify_operation}}; 9 | return _out; 10 | } 11 | 12 | static inline float reduceOp(float _in1, float _in2) { 13 | // I guess the compiler can sort this stuff out :-P 14 | float _out; 15 | float *in1 = &_in1; 16 | float *in2 = &_in2; 17 | float *out = &_out; 18 | {{reduce_operation}}; 19 | return _out; 20 | } 21 | 22 | {{include_THClReduceApplyUtils}} 23 | 24 | static inline {{IndexType}} getReduceNoncontigDimSliceIndex() { 25 | // Each thread handles one slice 26 | return getLinearBlockId() * THCL_NONCONTIG_REDUCE_BLOCK_SIZE + /*threadIdx.x*/ get_local_id(0); 27 | } 28 | 29 | // Kernel that handles an entire reduction of a slice of a tensor per each thread 30 | kernel void 31 | THClTensor_reduceNoncontigDim(global TensorInfoCl *out_info, 32 | global float *out_data, 33 | global TensorInfoCl *in_info, 34 | global float *in_data, 35 | int reductionStride, 36 | int reductionSize, 37 | int totalSlices, 38 | float init) { 39 | const {{IndexType}} sliceIndex = getReduceNoncontigDimSliceIndex(); 40 | 41 | if ((int)sliceIndex >= totalSlices) { 42 | return; 43 | } 44 | 45 | // Each thread picks a point in `out` and `in` for which it is 46 | // producing the reduction 47 | const {{IndexType}} outOffset = 48 | IndexToOffset_{{1000 + dim1}}_get(sliceIndex, &out_info[0]); 49 | const {{IndexType}} inBaseOffset = 50 | IndexToOffset_{{1000 + dim2}}_get(sliceIndex, &in_info[0]); 51 | 52 | // For each point in reductionSize, reduce into `r` 53 | {{IndexType}} inOffset = inBaseOffset; 54 | float r = init; 55 | 56 | for ({{IndexType}} i = 0; (int)i < reductionSize; ++i) { 57 | r = reduceOp(r, modifyOp(in_data[inOffset])); 58 | inOffset += reductionStride; 59 | } 60 | 61 | // Write out reduced value 62 | out_data[outOffset] = r; 63 | } 64 | 65 | static inline {{IndexType}} getReduceContigDimSliceIndex() { 66 | // Each block handles one slice 67 | return getLinearBlockId(); 68 | } 69 | 70 | // Kernel that handles an entire reduction of a slice of a tensor per 71 | // each block 72 | kernel void 73 | THClTensor_reduceContigDim(global TensorInfoCl *out_info, 74 | global float *out_data, 75 | global TensorInfoCl *in_info, 76 | global float *in_data, 77 | int reductionSize, 78 | int totalSlices, 79 | float init, 80 | local float *smem) { 81 | const {{IndexType}} sliceIndex = getReduceContigDimSliceIndex(); 82 | 83 | if ((int)sliceIndex >= totalSlices) { 84 | return; 85 | } 86 | 87 | // Get the offset in `out` for the reduction 88 | const {{IndexType}} outOffset = 89 | IndexToOffset_{{1000 + dim1}}_get(sliceIndex, &out_info[0]); 90 | 91 | // Get the base offset in `in` for this block's reduction 92 | const {{IndexType}} inBaseOffset = 93 | IndexToOffset_{{1000 + dim2}}_get(sliceIndex, &in_info[0]); 94 | 95 | // Each thread in the block will reduce some subset of elements in 96 | // the slice. The elements are guaranteed contiguous starting at 97 | // `inBaseOffset`. 
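// For instance (sizes here are illustrative assumptions): with
// get_local_size(0) == 128 and reductionSize == 1000, thread 0 visits
// elements 0, 128, 256, ... and thread 1 visits 1, 129, 257, ...; the
// 128 per-thread partials are then combined by reduceBlock() below.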
98 | float r = init; 99 | for ({{IndexType}} i = /*threadIdx.x*/ get_local_id(0); (int)i < reductionSize; i += /*blockDim.x*/ get_local_size(0)) { 100 | r = reduceOp(r, modifyOp(in_data[inBaseOffset + i])); 101 | } 102 | 103 | // Reduce within the block 104 | // extern __shared__ float smem[]; 105 | r = reduceBlock(smem, /*blockDim.x*/ get_local_size(0), r, init); 106 | 107 | if (/*threadIdx.x*/ get_local_id(0) == 0) { 108 | // Write out reduced value 109 | out_data[outOffset] = r; 110 | } 111 | } 112 | 113 | -------------------------------------------------------------------------------- /src/lib/THClReduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef THCL_REDUCE_INC 4 | #define THCL_REDUCE_INC 5 | 6 | #include 7 | #include 8 | #include 9 | #include "THClReduceApplyUtils.h" 10 | #include "templates/TemplatedKernel.h" 11 | #include "util/easycl_stringhelper.h" 12 | #include "EasyCL.h" 13 | #include "THClTypeParseTraits.h" 14 | #include "THClDeviceUtils.h" 15 | #include "THClKernels.h" 16 | #include "util/StatefulTimer.h" 17 | 18 | 19 | std::string THClReduce_getKernelSource(); 20 | 21 | // 22 | // This file contains dimension reduction operation functions and 23 | // kernels that work on both contiguous and non-contiguous tensor 24 | // arguments of arbitrary (up to MAX_CUTORCH_DIMS) dimensioned 25 | // arguments without copying or temporary storage. 26 | // 27 | 28 | //#define THCL_NONCONTIG_REDUCE_BLOCK_SIZE 32 * 16 29 | bool THClTensor_reduceDim(THClState* state, 30 | THClTensor* out, 31 | THClTensor* in, 32 | float init, 33 | const HasOperator2 *modifyOp, 34 | const HasOperator3 *reduceOp, 35 | int dim); 36 | 37 | #undef THCL_NONCONTIG_REDUCE_BLOCK_SIZE 38 | 39 | #endif // THCL_REDUCE_INC 40 | 41 | -------------------------------------------------------------------------------- /src/lib/THClReduceAll.cl: -------------------------------------------------------------------------------- 1 | {{include_THClDeviceUtils}} 2 | 3 | static inline float modifyOp(float _in1) { 4 | float _out; 5 | float *in1 = &_in1; 6 | float *out = &_out; 7 | {{modify_operation}}; 8 | return _out; 9 | } 10 | 11 | static inline float reduceOp(float _in1, float _in2) { 12 | // I guess the compiler can sort this stuff out :-P 13 | float _out; 14 | float *in1 = &_in1; 15 | float *in2 = &_in2; 16 | float *out = &_out; 17 | {{reduce_operation}}; 18 | return _out; 19 | } 20 | 21 | {{include_THClReduceApplyUtils}} 22 | 23 | // Kernel that handles an entire reduction of a tensor in one pass 24 | kernel void 25 | THClTensor_reduceAll(global TensorInfoCl *in_info, 26 | global float *in_data, 27 | {{IndexType}} totalElements, 28 | float init, 29 | global float* out, 30 | local float *smem) { 31 | // With a block-wide stride, have each thread perform its own reduction. 
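// (Editorial note: {{modify_operation}}, {{reduce_operation}} and
// {{IndexType}} above are Lua-template placeholders, rendered at runtime via
// the TemplatedKernel machinery included from THClReduce.h; for a sum
// reduction one would expect substitutions along the lines of
// "*out = *in1" for the modify step and "*out = *in1 + *in2" for the reduce
// step -- the exact strings come from the HasOperator2/HasOperator3 objects,
// so these two example strings are assumptions, not quoted from the repo.)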
32 | float r = init; 33 | for ({{IndexType}} i = get_local_id(0); i < totalElements; i += get_local_size(0)) { 34 | const {{IndexType}} inOffset = IndexToOffset_{{1000 + dim1}}_get(i, &in_info[0]); 35 | r = reduceOp(r, modifyOp(in_data[inOffset])); 36 | } 37 | 38 | // Reduce within the block 39 | r = reduceBlock(smem, get_local_size(0), r, init); 40 | 41 | if(get_local_id(0) == 0) { 42 | // Write out reduced value 43 | out[0] = r; 44 | } 45 | } 46 | 47 | static inline {{IndexType}} getStartIndex({{IndexType}} totalSize) { 48 | {{IndexType}} sizePerBlock = THClCeilDiv(totalSize, ({{IndexType}}) get_num_groups(0)); 49 | return get_group_id(0) * sizePerBlock; 50 | } 51 | 52 | static inline {{IndexType}} getEndIndex({{IndexType}} totalSize) { 53 | {{IndexType}} sizePerBlock = THClCeilDiv(totalSize, ({{IndexType}}) get_num_groups(0)); 54 | return min(({{IndexType}}) ((get_group_id(0) + 1) * sizePerBlock), totalSize); 55 | } 56 | 57 | // Kernel that handles an entire reduction of a tensor in two passes 58 | kernel void 59 | THClTensor_reduceAllPass1(global TensorInfoCl *in_info, 60 | global float *in_data, 61 | {{IndexType}} totalElements, 62 | float init, 63 | global float* scratchSpace, 64 | local float *smem) { 65 | const {{IndexType}} startIndex = getStartIndex(totalElements); 66 | const {{IndexType}} endIndex = getEndIndex(totalElements); 67 | 68 | // With a block-wide stride, have each thread perform its own reduction. 69 | float r = init; 70 | for ({{IndexType}} i = startIndex + get_local_id(0); i < endIndex; i += get_local_size(0)) { 71 | const {{IndexType}} inOffset = IndexToOffset_{{1000 + dim1}}_get(i, &in_info[0]); 72 | r = reduceOp(r, modifyOp(in_data[inOffset])); 73 | } 74 | 75 | // Reduce within the block 76 | r = reduceBlock(smem, get_local_size(0), r, init); 77 | 78 | if ((int)get_local_id(0) == 0) { 79 | // Write out block-wide reduced value 80 | scratchSpace[get_group_id(0)] = r; 81 | } 82 | } 83 | 84 | kernel void THClTensor_reduceAllPass2(int numPass1Blocks, 85 | float init, 86 | global float* scratchSpace, 87 | global float* out, 88 | local float *smem) { 89 | float r = init; 90 | if ((int)get_local_id(0) < numPass1Blocks) { 91 | r = scratchSpace[get_local_id(0)]; 92 | } 93 | 94 | // Reduce within the block 95 | r = reduceBlock(smem, numPass1Blocks, r, init); 96 | 97 | if((int)get_local_id(0) == 0) { 98 | out[0] = r; 99 | } 100 | } 101 | 102 | 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClReduceAll.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "THClGeneral.h" 4 | #include "THClTensor.h" 5 | #include "THClOperators.h" 6 | #include "EasyCL.h" 7 | 8 | // 9 | // This file contains dimension reduction operation functions and 10 | // kernels that work on both contiguous and non-contiguous tensor 11 | // arguments of arbitrary (up to MAX_CLTORCH_DIMS) dimensioned 12 | // arguments without copying or temporary storage, for reducing an 13 | // entire tensor to one value. 
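// A worked sketch of the two-pass variant above (the sizes are illustrative
// assumptions, not the values the launcher actually picks): for
// totalElements = 1,000,000 and 64 work-groups in pass 1,
// sizePerBlock = ceil(1,000,000 / 64) = 15,625 elements per group; each
// group reduces its range and writes one partial result to
// scratchSpace[get_group_id(0)]; pass 2 then runs a single work-group that
// combines the 64 partials with reduceBlock() and stores the final value
// in out[0].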
14 | // 15 | 16 | bool THClTensor_reduceAll(THClState* state, 17 | THClTensor* in, 18 | const HasOperator2 *modifyOp, 19 | const HasOperator3 *reduceOp, 20 | float init, 21 | CLWrapper *res); 22 | 23 | -------------------------------------------------------------------------------- /src/lib/THClReduceApplyUtils.cl: -------------------------------------------------------------------------------- 1 | // kernel argument that defines tensor layout 2 | typedef struct TensorInfoCl { 3 | // Extracts size/stride information for the kernel. 4 | // Successive dimensions can be collapsed if the size/strides match 5 | // up and thus there are no holes between the dimensions. This is used 6 | // to reduce the complexity of the problem. 7 | // The optional `reduceDim` indicates a reduction dimension for the 8 | // given tensor, so that the output size for this dimension will be 1. 9 | 10 | {{IndexType}} sizes[{{MAX_CLTORCH_DIMS}}]; 11 | {{IndexType}} strides[{{MAX_CLTORCH_DIMS}}]; 12 | {{IndexType}} offset; 13 | int dims; 14 | } TensorInfoCl; 15 | // Contiguous tensors of more than one dimension are collapsed down 16 | // to one tensor 17 | {% if defiscontiguous==1 then %} 18 | static inline bool TensorInfo_isContiguous( global TensorInfoCl *tensorInfo ) { 19 | return (tensorInfo->dims == 1 && tensorInfo->strides[0] == 1); 20 | } 21 | {% end %} 22 | 23 | // Translate a linear index for the apply to a float* offset; 24 | // specialized on `Dims` to reduce nvcc compilation time 25 | {% for _,dim in ipairs(dims) do %} 26 | static inline {{IndexType}} IndexToOffset_{{1000 + dim}}_get( {{IndexType}} linearId, global TensorInfoCl *info) { 27 | {{IndexType}} offset = info->offset; 28 | 29 | // Use static dims 30 | // for (int i = {{dim}} - 1; i >= 0; --i) { 31 | {{IndexType}} curDimIndex; 32 | {{IndexType}} curDimOffset; 33 | {% for i=dim-1,0,-1 do %} // bake this in.... 
34 | curDimIndex = linearId % info->sizes[{{i}}]; 35 | curDimOffset = curDimIndex * info->strides[{{i}}]; 36 | offset += curDimOffset; 37 | 38 | {% if i > 0 then %} 39 | linearId /= info->sizes[{{i}}]; 40 | {% end %} 41 | {% end %} 42 | // } 43 | 44 | return offset; 45 | } 46 | {% end %} 47 | 48 | static inline {{IndexType}} IndexToOffset_998_get({{IndexType}} linearId, global const TensorInfoCl *info) { 49 | return linearId + info->offset; 50 | } 51 | 52 | static inline {{IndexType}} IndexToOffset_999_get({{IndexType}} linearId, global const TensorInfoCl *info) { 53 | {{IndexType}} offset = info->offset; 54 | 55 | // Use dynamic dims 56 | for (int i = info->dims - 1; i >= 0; --i) { 57 | {{IndexType}} curDimIndex = linearId % info->sizes[i]; 58 | {{IndexType}} curDimOffset = curDimIndex * info->strides[i]; 59 | offset += curDimOffset; 60 | 61 | linearId /= info->sizes[i]; 62 | } 63 | 64 | return offset; 65 | } 66 | 67 | static inline {{IndexType}} getLinearBlockId() { 68 | return get_group_id(2) * get_num_groups(1) * get_num_groups(0) + 69 | get_group_id(1) * get_num_groups(0) + 70 | get_group_id(0); 71 | } 72 | 73 | // Block-wide reduction in shared memory helper; only /*threadIdx.x*/ get_local_id(0) == 0 will 74 | // return the reduced value 75 | {% if defreduceblock == 1 then %} 76 | static inline float reduceBlock( local float* smem, 77 | int numVals, 78 | float threadVal, 79 | float init) { 80 | if (numVals == 0) { 81 | return init; 82 | } 83 | 84 | if ((int)get_local_id(0) < numVals) { 85 | smem[ get_local_id(0)] = threadVal; 86 | } 87 | 88 | // First warp will perform reductions across warps 89 | barrier(CLK_LOCAL_MEM_FENCE); 90 | if ((get_local_id(0) / {{WarpSize}}) == 0) { 91 | float r = (int)get_local_id(0) < numVals ? smem[get_local_id(0)] : init; 92 | 93 | for (int i = {{WarpSize}} + get_local_id(0); i < numVals; i += {{WarpSize}}) { 94 | r = reduceOp(r, smem[i]); 95 | } 96 | 97 | smem[get_local_id(0)] = r; 98 | } 99 | 100 | // First thread will perform reductions across the block 101 | barrier(CLK_LOCAL_MEM_FENCE); 102 | 103 | float r = init; 104 | if (get_local_id(0) == 0) { 105 | r = smem[0]; 106 | 107 | int numLanesParticipating = min(numVals, {{WarpSize}}); 108 | 109 | if (numLanesParticipating == 32) { 110 | // Unroll for {{WarpSize}} == 32 and numVals >= 32 111 | // #pragma unroll 112 | // unrolling by hand, so compiler-independent 113 | {% for i=1,31 do %} 114 | r = reduceOp(r, smem[{{i}}]); 115 | {% end %} 116 | } else { 117 | for (int i = 1; i < numLanesParticipating; ++i) { 118 | r = reduceOp(r, smem[i]); 119 | } 120 | } 121 | } 122 | 123 | return r; 124 | } 125 | {% end %} 126 | 127 | -------------------------------------------------------------------------------- /src/lib/THClScatter.cl: -------------------------------------------------------------------------------- 1 | // probably should put this on its own somewhere, so we 2 | // dont have to either ocpy/paste, or include entire THClReduceApplyUtils 3 | typedef struct TensorInfoCl { 4 | unsigned int sizes[{{MAX_CLTORCH_DIMS}}]; 5 | unsigned int strides[{{MAX_CLTORCH_DIMS}}]; 6 | int offset; 7 | int dims; 8 | } TensorInfoCl; 9 | 10 | {% if scatter then %} 11 | kernel void THClTensor_kernel_scatter( 12 | global TensorInfoCl *dst_info, global float*dst_data, 13 | int dim, 14 | global const TensorInfoCl *idx_info, global float*idx_data, 15 | global const TensorInfoCl *src_info, global float*src_data, 16 | int totalElements 17 | ) 18 | { 19 | for (int _linearId = get_global_id(0); 20 | _linearId < totalElements; 21 | 
_linearId += get_global_size(0)) { 22 | 23 | // plan is: 24 | // based on our linearIndex, this gets us a spot in the index 25 | // tensor 26 | // this is also a spot in the src_data (at least, once we 27 | // convert it into actual coordinates, those are the coordinates 28 | // in the src tensor 29 | // the coordinates in the dest are the same, except that 30 | // we replace that of dimension dim with the value from 31 | // the index tensor 32 | // 33 | // so, everything hinges on us getting the coordinates, I think? 34 | // so, let's do that :-) 35 | int idxOffset = idx_info->offset; 36 | int srcOffset = src_info->offset; 37 | int dstOffset = dst_info->offset; 38 | int linearId = _linearId; // copy it, since we'll modify it 39 | // for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbaked loop for now, to 40 | // get it working 41 | int curDimIndex; 42 | {% for d=dims-1,0,-1 do %} 43 | curDimIndex = linearId % idx_info->sizes[{{d}}]; 44 | idxOffset += curDimIndex * idx_info->strides[{{d}}]; 45 | srcOffset += curDimIndex * src_info->strides[{{d}}]; 46 | if( {{d}} != dim ) { // this only matters for the dest, the others are 47 | // unaffected by which dimension we are on. I think. 48 | dstOffset += curDimIndex * dst_info->strides[{{d}}]; 49 | } 50 | linearId /= idx_info->sizes[{{d}}]; 51 | {% end %} 52 | // } 53 | // now we have the idxOffset. get the value at that location 54 | int idxValue = idx_data[idxOffset] - 1; // subtract 1, because indices are 1-based 55 | // then use this to get the final value for dstOffset 56 | dstOffset += idxValue * dst_info->strides[dim]; 57 | // get the value... 58 | float value = src_data[srcOffset]; 59 | // and save it up... 60 | dst_data[dstOffset] = value; 61 | // that's it? 62 | } 63 | } 64 | {% end %} 65 | 66 | {% if scatterFill then %} 67 | kernel void THClTensor_kernel_scatterFill( 68 | global TensorInfoCl *dst_info, global float*dst_data, 69 | const int dim, 70 | global const TensorInfoCl *idx_info, global float*idx_data, 71 | const float src_val, 72 | const int totalElements 73 | ) 74 | { 75 | for (int _linearId = get_global_id(0); 76 | _linearId < totalElements; 77 | _linearId += get_global_size(0)) { 78 | 79 | // plan is: 80 | // based on our linearIndex, this gets us a spot in the index 81 | // tensor 82 | // the coordinates in the dest are the same, except that 83 | // we replace that of dimension dim with the value from 84 | // the index tensor 85 | // 86 | // so, everything hinges on us getting the coordinates, I think? 87 | // so, let's do that :-) 88 | int idxOffset = idx_info->offset; 89 | int dstOffset = dst_info->offset; 90 | int linearId = _linearId; // copy it, since we'll modify it 91 | // for(int d={{dims}}-1; d >= 0; d--) { // just use slow, unbaked loop for now, to 92 | // get it working 93 | int curDimIndex; 94 | {% for d=dims-1,0,-1 do %} 95 | curDimIndex = linearId % idx_info->sizes[{{d}}]; 96 | idxOffset += curDimIndex * idx_info->strides[{{d}}]; 97 | if( {{d}} != dim ) { // this only matters for the dest, the others are 98 | // unaffected by which dimension we are on. I think. 99 | dstOffset += curDimIndex * dst_info->strides[{{d}}]; 100 | } 101 | linearId /= idx_info->sizes[{{d}}]; 102 | {% end %} 103 | // } 104 | // now we have the idxOffset. get the value at that location 105 | int idxValue = idx_data[idxOffset] - 1; // subtract 1, because indices are 1-based 106 | // then use this to get the final value for dstOffset 107 | dstOffset += idxValue * dst_info->strides[dim]; 108 | // and save the value... 
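// Worked example of the decomposition above (illustrative sizes): for a
// 2D index tensor with sizes = {3, 4} and strides = {4, 1}, linearId = 7
// gives curDimIndex = 7 % 4 = 3 in dimension 1, then 7 / 4 = 1 in
// dimension 0, i.e. coordinates (1, 3); idx_data at that offset holds a
// 1-based (Lua-style) index, hence the "- 1" before it is scaled by
// dst_info->strides[dim].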
109 | dst_data[dstOffset] = src_val; 110 | // thats it? 111 | } 112 | } 113 | {% end %} 114 | 115 | -------------------------------------------------------------------------------- /src/lib/THClStorage.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_STORAGE_INC 2 | #define THCL_STORAGE_INC 3 | 4 | #include "THStorage.h" 5 | #include "THClGeneral.h" 6 | 7 | #define TH_STORAGE_REFCOUNTED 1 8 | #define TH_STORAGE_RESIZABLE 2 9 | #define TH_STORAGE_FREEMEM 4 10 | 11 | //extern int THClStorage_traceOn; 12 | 13 | typedef struct THClStorage 14 | { 15 | int device; 16 | float *data; // I know this seems a bit superfluous.... 17 | PTR_EASYCL cl; 18 | PTR_CLWRAPPER wrapper; 19 | long size; 20 | int refcount; 21 | char flag; 22 | THAllocator *allocator; 23 | void *allocatorContext; 24 | struct THClStorage *view; 25 | } THClStorage; 26 | 27 | 28 | THCL_API float* THClStorage_data(THClState *state, const THClStorage*); 29 | THCL_API long THClStorage_size(THClState *state, const THClStorage*); 30 | 31 | /* slow access -- checks everything */ 32 | THCL_API void THClStorage_set(THClState *state, THClStorage*, long, float); 33 | THCL_API float THClStorage_get(THClState *state, const THClStorage*, long); 34 | 35 | THCL_API THClStorage* THClStorage_new(THClState *state) DEPRECATED_POST; 36 | THCL_API THClStorage* THClStorage_newv2(THClState *state, int device); 37 | THCL_API THClStorage* THClStorage_newWithSize(THClState *state, int device, long size); 38 | THCL_API THClStorage* THClStorage_newWithSize1(THClState *state, int device, float); 39 | THCL_API THClStorage* THClStorage_newWithSize2(THClState *state, int device, float, float); 40 | THCL_API THClStorage* THClStorage_newWithSize3(THClState *state, int device, float, float, float); 41 | THCL_API THClStorage* THClStorage_newWithSize4(THClState *state, int device, float, float, float, float); 42 | THCL_API THClStorage* THClStorage_newWithMapping(THClState *state, int device, const char *filename, long size, int shared); 43 | 44 | /* takes ownership of data */ 45 | THCL_API THClStorage* THClStorage_newWithData(THClState *state, int device, float *data, long size); 46 | 47 | THCL_API THClStorage* THClStorage_newWithAllocator(THClState *state, int device, long size, 48 | THAllocator* allocator, 49 | void *allocatorContext); 50 | THCL_API THClStorage* THClStorage_newWithDataAndAllocator( 51 | THClState *state, int device, float* data, long size, THAllocator* allocator, void *allocatorContext); 52 | 53 | THCL_API void THClStorage_setFlag(THClState *state, THClStorage *storage, const char flag); 54 | THCL_API void THClStorage_clearFlag(THClState *state, THClStorage *storage, const char flag); 55 | THCL_API void THClStorage_retain(THClState *state, THClStorage *storage); 56 | 57 | THCL_API void THClStorage_free(THClState *state, THClStorage *storage); 58 | THCL_API void THClStorage_resize(THClState *state, THClStorage *storage, long size); 59 | THCL_API void THClStorage_fill(THClState *state, THClStorage *storage, float value); 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /src/lib/THClStorageCopy.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "THClStorageCopy.h" 4 | #include "THClGeneral.h" 5 | 6 | #include 7 | #include "EasyCL.h" 8 | 9 | using namespace std; 10 | 11 | void THClStorage_rawCopy(THClState *state, THClStorage *self, float *src) 12 | { 13 | THError("not available yet for 
THClStorage"); 14 | // THClCheck(clMemcpyAsync(self->data, src, self->size * sizeof(float), clMemcpyDeviceToDevice, THClState_getCurrentStream(state))); 15 | } 16 | 17 | void THClStorage_copy(THClState *state, THClStorage *self, THClStorage *src) 18 | { 19 | THArgCheck(self->size == src->size, 2, "size does not match"); 20 | if( !self->wrapper->isOnDevice() ) { 21 | self->wrapper->createOnDevice(); 22 | } 23 | src->wrapper->copyTo( self->wrapper ); 24 | if(state->trace) cout << "wrapper->copyTo() size" << self->size << endl; 25 | } 26 | 27 | void THClStorage_copyCl(THClState *state, THClStorage *self, THClStorage *src) 28 | { 29 | THError("not available yet for THClStorage"); 30 | THArgCheck(self->size == src->size, 2, "size does not match"); 31 | // THClCheck(clMemcpyAsync(self->data, src->data, self->size * sizeof(float), clMemcpyDeviceToDevice, THClState_getCurrentStream(state))); 32 | } 33 | 34 | void THClStorage_copyFloat(THClState *state, THClStorage *self, struct THFloatStorage *src) 35 | { 36 | // cout << "THClStorgae_copyFloat()" << endl; 37 | THArgCheck(self->size == src->size, 2, "size does not match"); 38 | for( int i = 0; i < self->size; i++ ) { 39 | self->data[i] = src->data[i]; 40 | } 41 | self->wrapper->copyToDevice(); 42 | if(state->trace) cout << "wrapper->copyToDevice() size" << self->size << endl; 43 | // THClCheck(clMemcpy(self->data, src->data, self->size * sizeof(float), clMemcpyHostToDevice)); 44 | } 45 | 46 | #define TH_CL_STORAGE_IMPLEMENT_COPY(TYPEC) \ 47 | void THClStorage_copy##TYPEC(THClState *state, THClStorage *self, struct TH##TYPEC##Storage *src) \ 48 | { \ 49 | THFloatStorage *buffer; \ 50 | THArgCheck(self->size == src->size, 2, "size does not match"); \ 51 | buffer = THFloatStorage_newWithSize(src->size); \ 52 | THFloatStorage_copy##TYPEC(buffer, src); \ 53 | THClStorage_copyFloat(state, self, buffer); \ 54 | THFloatStorage_free(buffer); \ 55 | } 56 | 57 | TH_CL_STORAGE_IMPLEMENT_COPY(Byte) 58 | TH_CL_STORAGE_IMPLEMENT_COPY(Char) 59 | TH_CL_STORAGE_IMPLEMENT_COPY(Short) 60 | TH_CL_STORAGE_IMPLEMENT_COPY(Int) 61 | TH_CL_STORAGE_IMPLEMENT_COPY(Long) 62 | TH_CL_STORAGE_IMPLEMENT_COPY(Double) 63 | 64 | void THFloatStorage_copyCl(THClState *state, THFloatStorage *self, struct THClStorage *src) 65 | { 66 | // cout << "THfloatStorage_copyCl" << endl; 67 | THArgCheck(self->size == src->size, 2, "size does not match"); 68 | if( src->size == 0 ) { 69 | // dont need to do anything... 
70 | return; 71 | } 72 | if( src->wrapper->isDeviceDirty() ) { 73 | src->wrapper->copyToHost(); 74 | if(state->trace) cout << "wrapper->copyToHost() size" << self->size << endl; 75 | } 76 | for( int i = 0; i < self->size; i++ ) { 77 | self->data[i] = src->data[i]; 78 | } 79 | } 80 | 81 | #define TH_CL_STORAGE_IMPLEMENT_COPYTO(TYPEC) \ 82 | void TH##TYPEC##Storage_copyCl(THClState *state, TH##TYPEC##Storage *self, struct THClStorage *src) \ 83 | { \ 84 | THFloatStorage *buffer; \ 85 | THArgCheck(self->size == src->size, 2, "size does not match"); \ 86 | buffer = THFloatStorage_newWithSize(src->size); \ 87 | THFloatStorage_copyCl(state, buffer, src); \ 88 | TH##TYPEC##Storage_copyFloat(self, buffer); \ 89 | THFloatStorage_free(buffer); \ 90 | } 91 | 92 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Byte) 93 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Char) 94 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Short) 95 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Int) 96 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Long) 97 | TH_CL_STORAGE_IMPLEMENT_COPYTO(Double) 98 | 99 | -------------------------------------------------------------------------------- /src/lib/THClStorageCopy.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_STORAGE_COPY_INC 2 | #define THCL_STORAGE_COPY_INC 3 | 4 | #include "THClStorage.h" 5 | #include "THClGeneral.h" 6 | 7 | /* Support for copy between different Storage types */ 8 | 9 | THCL_API void THClStorage_rawCopy(THClState *state, THClStorage *storage, float *src); 10 | THCL_API void THClStorage_copy(THClState *state, THClStorage *storage, THClStorage *src); 11 | THCL_API void THClStorage_copyByte(THClState *state, THClStorage *storage, struct THByteStorage *src); 12 | THCL_API void THClStorage_copyChar(THClState *state, THClStorage *storage, struct THCharStorage *src); 13 | THCL_API void THClStorage_copyShort(THClState *state, THClStorage *storage, struct THShortStorage *src); 14 | THCL_API void THClStorage_copyInt(THClState *state, THClStorage *storage, struct THIntStorage *src); 15 | THCL_API void THClStorage_copyLong(THClState *state, THClStorage *storage, struct THLongStorage *src); 16 | THCL_API void THClStorage_copyFloat(THClState *state, THClStorage *storage, struct THFloatStorage *src); 17 | THCL_API void THClStorage_copyDouble(THClState *state, THClStorage *storage, struct THDoubleStorage *src); 18 | 19 | THCL_API void THByteStorage_copyCl(THClState *state, THByteStorage *self, struct THClStorage *src); 20 | THCL_API void THCharStorage_copyCl(THClState *state, THCharStorage *self, struct THClStorage *src); 21 | THCL_API void THShortStorage_copyCl(THClState *state, THShortStorage *self, struct THClStorage *src); 22 | THCL_API void THIntStorage_copyCl(THClState *state, THIntStorage *self, struct THClStorage *src); 23 | THCL_API void THLongStorage_copyCl(THClState *state, THLongStorage *self, struct THClStorage *src); 24 | THCL_API void THFloatStorage_copyCl(THClState *state, THFloatStorage *self, struct THClStorage *src); 25 | THCL_API void THDoubleStorage_copyCl(THClState *state, THDoubleStorage *self, struct THClStorage *src); 26 | THCL_API void THClStorage_copyCl(THClState *state, THClStorage *self, THClStorage *src); 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /src/lib/THClStorageGet.cl: -------------------------------------------------------------------------------- 1 | kernel void THClStorageGet(global float *res, global float *data, int index) { 2 | if(get_global_id(0) == 0) { 3 | res[0] = data[index]; 4 | 
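// (Editorial note: this kernel appears to back the "slow access -- checks
// everything" path declared in THClStorage.h: a single work-item copies one
// float into a one-element buffer that the host then reads back, so
// THClStorage_get() is suited to debugging and element-wise inspection, not
// bulk transfer.)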
} 5 | } 6 | 7 | -------------------------------------------------------------------------------- /src/lib/THClStorageSet.cl: -------------------------------------------------------------------------------- 1 | kernel void THClStorageSet(global float *data, int index, float value) { 2 | if(get_global_id(0) == 0) { 3 | // int index2 = index; 4 | // data[index2] = 44; 5 | data[index] = value; 6 | // data[2] = index2; 7 | // data[3] = value; 8 | } 9 | } 10 | 11 | -------------------------------------------------------------------------------- /src/lib/THClTensor.h: -------------------------------------------------------------------------------- 1 | #ifndef THCL_TENSOR_INC 2 | #define THCL_TENSOR_INC 3 | 4 | #include 5 | 6 | #include "THTensor.h" 7 | #include "THClStorage.h" 8 | #include "THClGeneral.h" 9 | 10 | #define TH_TENSOR_REFCOUNTED 1 11 | 12 | //struct CLWrapper; 13 | 14 | typedef struct THClTensor 15 | { 16 | long *size; 17 | long *stride; 18 | int nDimension; 19 | 20 | THClStorage *storage; 21 | long storageOffset; 22 | int refcount; 23 | 24 | char flag; 25 | 26 | int device; 27 | } THClTensor; 28 | 29 | 30 | /**** access methods ****/ 31 | THCL_API THClStorage* THClTensor_storage(THClState *state, const THClTensor *self); 32 | THCL_API long THClTensor_storageOffset(THClState *state, const THClTensor *self); 33 | THCL_API int THClTensor_nDimension(THClState *state, const THClTensor *self); 34 | THCL_API long THClTensor_size(THClState *state, const THClTensor *self, int dim); 35 | THCL_API long THClTensor_stride(THClState *state, const THClTensor *self, int dim); 36 | THCL_API THLongStorage *THClTensor_newSizeOf(THClState *state, THClTensor *self); 37 | THCL_API THLongStorage *THClTensor_newStrideOf(THClState *state, THClTensor *self); 38 | THCL_API float *THClTensor_data(THClState *state, const THClTensor *self); 39 | #ifdef __cplusplus 40 | THCL_API class CLWrapper *THClTensor_wrapper(THClState *state, const THClTensor *self); 41 | #endif // __cplusplus 42 | 43 | THCL_API void THClTensor_setFlag(THClState *state, THClTensor *self, const char flag); 44 | THCL_API void THClTensor_clearFlag(THClState *state, THClTensor *self, const char flag); 45 | 46 | 47 | /**** creation methods ****/ 48 | THCL_API THClTensor *THClTensor_new(THClState *state) DEPRECATED_POST; 49 | THCL_API THClTensor *THClTensor_newv2(THClState *state, int device); 50 | THCL_API THClTensor *THClTensor_newWithTensor(THClState *state, THClTensor *tensor); 51 | /* stride might be NULL */ 52 | THCL_API THClTensor *THClTensor_newWithStorage(THClState *state, int device, THClStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_); 53 | THCL_API THClTensor *THClTensor_newWithStorage1d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 54 | long size0_, long stride0_); 55 | THCL_API THClTensor *THClTensor_newWithStorage2d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 56 | long size0_, long stride0_, 57 | long size1_, long stride1_); 58 | THCL_API THClTensor *THClTensor_newWithStorage3d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 59 | long size0_, long stride0_, 60 | long size1_, long stride1_, 61 | long size2_, long stride2_); 62 | THCL_API THClTensor *THClTensor_newWithStorage4d(THClState *state, int device, THClStorage *storage_, long storageOffset_, 63 | long size0_, long stride0_, 64 | long size1_, long stride1_, 65 | long size2_, long stride2_, 66 | long size3_, long stride3_); 67 | 68 | /* stride might be NULL 
*/ 69 | THCL_API THClTensor *THClTensor_newWithSize(THClState *state, int device, THLongStorage *size_, THLongStorage *stride_); 70 | THCL_API THClTensor *THClTensor_newWithSize1d(THClState *state, int device, long size0_); 71 | THCL_API THClTensor *THClTensor_newWithSize2d(THClState *state, int device, long size0_, long size1_); 72 | THCL_API THClTensor *THClTensor_newWithSize3d(THClState *state, int device, long size0_, long size1_, long size2_); 73 | THCL_API THClTensor *THClTensor_newWithSize4d(THClState *state, int device, long size0_, long size1_, long size2_, long size3_); 74 | 75 | THCL_API THClTensor *THClTensor_newClone(THClState *state, THClTensor *self); 76 | THCL_API THClTensor *THClTensor_newContiguous(THClState *state, THClTensor *tensor); 77 | THCL_API THClTensor *THClTensor_newSelect(THClState *state, THClTensor *tensor, int dimension_, long sliceIndex_); 78 | THCL_API THClTensor *THClTensor_newNarrow(THClState *state, THClTensor *tensor, int dimension_, long firstIndex_, long size_); 79 | THCL_API THClTensor *THClTensor_newTranspose(THClState *state, THClTensor *tensor, int dimension1_, int dimension2_); 80 | THCL_API THClTensor *THClTensor_newUnfold(THClState *state, THClTensor *tensor, int dimension_, long size_, long step_); 81 | 82 | THCL_API void THClTensor_resize(THClState *state, THClTensor *tensor, THLongStorage *size, THLongStorage *stride); 83 | THCL_API void THClTensor_resizeAs(THClState *state, THClTensor *tensor, THClTensor *src); 84 | THCL_API void THClTensor_resize0d(THClState *state, THClTensor *tensor); 85 | THCL_API void THClTensor_resize1d(THClState *state, THClTensor *tensor, long size0_); 86 | THCL_API void THClTensor_resize2d(THClState *state, THClTensor *tensor, long size0_, long size1_); 87 | THCL_API void THClTensor_resize3d(THClState *state, THClTensor *tensor, long size0_, long size1_, long size2_); 88 | THCL_API void THClTensor_resize4d(THClState *state, THClTensor *tensor, long size0_, long size1_, long size2_, long size3_); 89 | THCL_API void THClTensor_resize5d(THClState *state, THClTensor *tensor, long size0_, long size1_, long size2_, long size3_, long size4_); 90 | 91 | THCL_API void THClTensor_set(THClState *state, THClTensor *self, THClTensor *src); 92 | THCL_API void THClTensor_setStorage(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, THLongStorage *size_, THLongStorage *stride_); 93 | THCL_API void THClTensor_setStorage1d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 94 | long size0_, long stride0_); 95 | THCL_API void THClTensor_setStorage2d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 96 | long size0_, long stride0_, 97 | long size1_, long stride1_); 98 | THCL_API void THClTensor_setStorage3d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 99 | long size0_, long stride0_, 100 | long size1_, long stride1_, 101 | long size2_, long stride2_); 102 | THCL_API void THClTensor_setStorage4d(THClState *state, THClTensor *self, THClStorage *storage_, long storageOffset_, 103 | long size0_, long stride0_, 104 | long size1_, long stride1_, 105 | long size2_, long stride2_, 106 | long size3_, long stride3_); 107 | 108 | THCL_API void THClTensor_narrow(THClState *state, THClTensor *self, THClTensor *src, int dimension_, long firstIndex_, long size_); 109 | THCL_API void THClTensor_select(THClState *state, THClTensor *self, THClTensor *src, int dimension_, long sliceIndex_); 110 | THCL_API void 
THClTensor_transpose(THClState *state, THClTensor *self, THClTensor *src, int dimension1_, int dimension2_); 111 | THCL_API void THClTensor_unfold(THClState *state, THClTensor *self, THClTensor *src, int dimension_, long size_, long step_); 112 | 113 | THCL_API void THClTensor_squeeze(THClState *state, THClTensor *self, THClTensor *src); 114 | THCL_API void THClTensor_squeeze1d(THClState *state, THClTensor *self, THClTensor *src, int dimension_); 115 | 116 | THCL_API int THClTensor_isContiguous(THClState *state, const THClTensor *self); 117 | THCL_API int THClTensor_isSameSizeAs(THClState *state, const THClTensor *self, const THClTensor *src); 118 | THCL_API long THClTensor_nElement(THClState *state, const THClTensor *self); 119 | 120 | THCL_API void THClTensor_retain(THClState *state, THClTensor *self); 121 | THCL_API void THClTensor_free(THClState *state, THClTensor *self); 122 | THCL_API void THClTensor_freeCopyTo(THClState *state, THClTensor *self, THClTensor *dst); 123 | 124 | /* Slow access methods [check everything] */ 125 | THCL_API void THClTensor_set1d(THClState *state, THClTensor *tensor, long x0, float value); 126 | THCL_API void THClTensor_set2d(THClState *state, THClTensor *tensor, long x0, long x1, float value); 127 | THCL_API void THClTensor_set3d(THClState *state, THClTensor *tensor, long x0, long x1, long x2, float value); 128 | THCL_API void THClTensor_set4d(THClState *state, THClTensor *tensor, long x0, long x1, long x2, long x3, float value); 129 | 130 | THCL_API float THClTensor_get1d(THClState *state, const THClTensor *tensor, long x0); 131 | THCL_API float THClTensor_get2d(THClState *state, const THClTensor *tensor, long x0, long x1); 132 | THCL_API float THClTensor_get3d(THClState *state, const THClTensor *tensor, long x0, long x1, long x2); 133 | THCL_API float THClTensor_get4d(THClState *state, const THClTensor *tensor, long x0, long x1, long x2, long x3); 134 | 135 | /* GPU-specific functions */ 136 | //THCL_API cudaTextureObject_t THClTensor_getTextureObject(THClState *state, THClTensor *self); 137 | THCL_API int THClTensor_getDevice(THClState *state, const THClTensor *self); 138 | THCL_API int THClTensor_checkGPU(THClState *state, unsigned int nTensors, ...); 139 | 140 | // new 141 | #ifdef __cplusplus 142 | THCL_API_CPP std::string THClTensor_toString(THClState *state, const THClTensor *tensor); 143 | THCL_API EasyCL *THClTensor_getCl(THClState *state, const THClTensor *tensor); 144 | #endif // __cplusplus 145 | THCL_API int THClTensor_getDevice(THClState *state, const THClTensor *tensor); 146 | 147 | #endif 148 | -------------------------------------------------------------------------------- /src/lib/THClTensorCopy.h: -------------------------------------------------------------------------------- 1 | #ifndef TH_CL_TENSOR_COPY_INC 2 | #define TH_CL_TENSOR_COPY_INC 3 | 4 | #include "THClTensor.h" 5 | #include "THClGeneral.h" 6 | 7 | THCL_API void THClTensor_copy(THClState *state, THClTensor *self, THClTensor *src); 8 | THCL_API void THClTensor_copyByte(THClState *state, THClTensor *self, THByteTensor *src); 9 | THCL_API void THClTensor_copyChar(THClState *state, THClTensor *self, THCharTensor *src); 10 | THCL_API void THClTensor_copyShort(THClState *state, THClTensor *self, THShortTensor *src); 11 | THCL_API void THClTensor_copyInt(THClState *state, THClTensor *self, THIntTensor *src); 12 | THCL_API void THClTensor_copyLong(THClState *state, THClTensor *self, THLongTensor *src); 13 | THCL_API void THClTensor_copyFloat(THClState *state, THClTensor *self, 
THFloatTensor *src); 14 | THCL_API void THClTensor_copyDouble(THClState *state, THClTensor *self, THDoubleTensor *src); 15 | 16 | THCL_API void THByteTensor_copyCl(THClState *state, THByteTensor *self, THClTensor *src); 17 | THCL_API void THCharTensor_copyCl(THClState *state, THCharTensor *self, THClTensor *src); 18 | THCL_API void THShortTensor_copyCl(THClState *state, THShortTensor *self, THClTensor *src); 19 | THCL_API void THIntTensor_copyCl(THClState *state, THIntTensor *self, THClTensor *src); 20 | THCL_API void THLongTensor_copyCl(THClState *state, THLongTensor *self, THClTensor *src); 21 | THCL_API void THFloatTensor_copyCl(THClState *state, THFloatTensor *self, THClTensor *src); 22 | THCL_API void THDoubleTensor_copyCl(THClState *state, THDoubleTensor *self, THClTensor *src); 23 | THCL_API void THClTensor_copyCl(THClState *state, THClTensor *self, THClTensor *src); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/lib/THClTensorIndex.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorIndex.cu: 2 | 3 | kernel void THClTensor_kernel_indexFill( 4 | global float *tensor_data, int tensor_offset, 5 | global int* stride, 6 | global float *index_data, int index_offset, 7 | int src_nDim, 8 | int dim, int idx_size, int tensor_size, int size_dim, float val 9 | ) 10 | { 11 | int thread_idx = get_group_id(0) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0); 12 | 13 | long flat_size = tensor_size / idx_size; 14 | 15 | if (thread_idx < flat_size) 16 | { 17 | long coeff = 0; 18 | for (int i=0; i dim) 31 | { 32 | coeff = leftover / stride[d]; 33 | leftover -= coeff * stride[d]; 34 | srcIdx += coeff * stride[d]; 35 | } 36 | } 37 | tensor_data[tensor_offset + srcIdx + (int)((index_data[index_offset + i])-1)*stride[dim] ] = val; 38 | } 39 | } 40 | } 41 | 42 | kernel void THClTensor_kernel_indexCopy( 43 | global float *res_data, int res_offset, 44 | global float *src_data, int src_offset, 45 | global int* res_stride, global float *index_data, int index_offset, 46 | int res_nDim, int dim, int idx_size, int src_size, int size_dim 47 | ) 48 | { 49 | int thread_idx = get_group_id(0) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0); 50 | 51 | long flat_size = src_size / idx_size; 52 | 53 | if (thread_idx < flat_size) 54 | { 55 | long coeff = 0; 56 | for (int i=0; i dim) 72 | { 73 | coeff = leftover / res_stride[d]; 74 | leftover -= coeff * res_stride[d]; 75 | targetIdx += coeff * res_stride[d]; 76 | resIdx += coeff * res_stride[d]; 77 | } 78 | } 79 | res_data[res_offset + resIdx + ((int)(index_data[index_offset + i])-1)*res_stride[dim] ] = src_data[src_offset + targetIdx + i*res_stride[dim] ]; 80 | } 81 | } 82 | } 83 | 84 | kernel void THClTensor_kernel_indexSelect( 85 | global float *tensor_data, int tensor_offset, global float *src_data, int src_offset, 86 | global int* src_stride, global float *index_data, int index_offset, 87 | int src_nDim, int dim, int idx_size, int tensor_size, int size_dim 88 | ) 89 | { 90 | int thread_idx = get_group_id(0) * get_local_size(0) * get_local_size(1) + get_local_id(1) * get_local_size(0) + get_local_id(0); 91 | 92 | long flat_size = tensor_size / idx_size; 93 | 94 | if (thread_idx < flat_size) 95 | { 96 | long coeff = 0; 97 | for (int i=0; i dim) 113 | { 114 | coeff = leftover / src_stride[d]; 115 | leftover -= coeff * src_stride[d]; 116 | targetIdx += coeff 
* src_stride[d]; 117 | srcIdx += coeff * src_stride[d]; 118 | } 119 | } 120 | tensor_data[tensor_offset + targetIdx + i*src_stride[dim] ] = src_data[src_offset + srcIdx + ((int)(index_data[index_offset + i])-1)*src_stride[dim] ]; 121 | } 122 | } 123 | } 124 | 125 | -------------------------------------------------------------------------------- /src/lib/THClTensorInfoCl.cl: -------------------------------------------------------------------------------- 1 | typedef struct THClTensorInfoCl { 2 | unsigned int sizes[{{MAX_CLTORCH_DIMS}}]; 3 | unsigned int strides[{{MAX_CLTORCH_DIMS}}]; 4 | int offset; 5 | int dims; 6 | } TensorInfoCl; 7 | 8 | -------------------------------------------------------------------------------- /src/lib/THClTensorMasked.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMasked.cu: 2 | 3 | struct TensorMaskedFillOp { 4 | TensorMaskedFillOp(float v) : value(v) {} 5 | /*__device__*/ /*__forceline__*/ void operator()(float* t, float* mask) { 6 | // Really mask should be `0` or `1` but we can't propagate errors here. 7 | if (*mask != 0.0f) { 8 | *t = value; 9 | } 10 | } 11 | 12 | float value; 13 | }; 14 | 15 | struct TensorMaskedCopyOp { 16 | TensorMaskedCopyOp(float* s, float* bm, float* ps) 17 | : src(s), 18 | baseMask(bm), 19 | maskPrefixSum(ps) { 20 | } 21 | 22 | /*__device__*/ /*__forceline__*/ void operator()(float* out, float* mask) { 23 | // Really mask should be `0` or `1` but we can't propagate errors here. 24 | if (*mask != 0.0f) { 25 | // We've already checked that this offset is <= 2^24, so this is ok. 26 | int srcOffset = (int) (mask - baseMask); 27 | *out = src[(int) maskPrefixSum[srcOffset]]; 28 | } 29 | } 30 | 31 | // Where we are copying from 32 | float* src; 33 | 34 | // The base address of mask so we can calculate offset 35 | float* baseMask; 36 | 37 | // The index we are copying from 38 | float* maskPrefixSum; 39 | }; 40 | 41 | struct TensorMaskedSelectOp { 42 | TensorMaskedSelectOp(float* t) : out(t) {} 43 | /*__device__*/ /*__forceline__*/ void operator()(float* mask, float* maskPrefixSum, float* in) { 44 | // Really mask should be `0` or `1` but we can't propagate errors here. 45 | if (*mask != 0.0f) { 46 | out[(int) *maskPrefixSum] = *in; 47 | } 48 | } 49 | 50 | float* out; 51 | }; 52 | 53 | -------------------------------------------------------------------------------- /src/lib/THClTensorMath2.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMath2.cu: 2 | 3 | // Given the sum of values and the sum of squares, compute the variance or standard deviation. 4 | template 5 | /*__forceline__*/ /*__device__*/ float THClTensor_computeVar(float sum, float sum2, unsigned row_size) { 6 | if (flag) { 7 | sum /= row_size; 8 | sum2 /= row_size; 9 | sum2 -= sum * sum; 10 | sum2 = (sum2 < 0 ? 0 : sum2); 11 | } 12 | else { 13 | sum /= row_size; 14 | sum2 /= row_size - 1; 15 | sum2 -= ((float)row_size) / ((float)(row_size - 1)) * sum * sum; 16 | sum2 = (sum2 < 0 ? 0 : sum2); 17 | } 18 | if (apply_sqrt) 19 | return sqrt(sum2); 20 | else 21 | return sum2; 22 | } 23 | 24 | /* Compute the variance (or standard deviation) along an outer dimension of a tensor. 
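 * (Editorial note on THClTensor_computeVar above: with flag set it returns
 * the biased estimate sum2/n - (sum/n)^2, i.e. E[x^2] - E[x]^2, clamped at
 * zero against floating-point rounding; without flag it applies Bessel's
 * correction, sum2/(n-1) - (n/(n-1)) * (sum/n)^2, the unbiased sample
 * variance; apply_sqrt turns either result into a standard deviation.)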
25 | * 26 | * - num_orows is the size of the flattened outer dimensions; 27 | * - num_irows is the size of the flattened inner dimensions; 28 | * - row_size is the size of the dimension along which to compute the variance; 29 | * - if flag is set, normalize by `row_size` instead of `row_size - 1` 30 | * - if apply_sqrt is set, compute the standard deviation instead of variance 31 | * 32 | * The dimensions to the outside and inside of the specified dimension are considered as flattened. 33 | * Thread blocks with the same get_group_id(1) process an "outer row" (i.e. an element of the flattened 34 | * outer dimensions, which contains several "inner rows"). 35 | * Each thread processes a single inner row at a time. 36 | */ 37 | template 38 | kernel void THClTensor_kernel_varOuterDim(float *tgt, float *src_, unsigned num_orows, unsigned num_irows, unsigned row_size) 39 | { 40 | for (unsigned orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { 41 | for (unsigned irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { 42 | float *src = src_ + orow * row_size * num_irows + irow; 43 | float sum = 0, sum2 = 0; 44 | 45 | for (unsigned col = 0; col < row_size; ++col) { 46 | float val = *src; 47 | sum += val; 48 | sum2 += val * val; 49 | 50 | src += num_irows; 51 | } 52 | 53 | tgt[orow * num_irows + irow] = THClTensor_computeVar(sum, sum2, row_size); 54 | } 55 | } 56 | } 57 | 58 | /* Compute the variance (or standard deviation) of the innermost dimension of a tensor. 59 | * 60 | * - num_rows is the size of the flattened outer dimensions; 61 | * - row_size is the size of the innermost dimension; 62 | * - if flag is set, normalize by `row_size` instead of `row_size - 1` 63 | * - if apply_sqrt is set, compute the standard deviation instead of variance 64 | * 65 | * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is 66 | * considered as having 'num_rows' rows of size 'row_size'. 67 | * Each thread block processes one or more sets of contiguous rows (processing multiple rows 68 | * per thread block is quicker than processing a single row, especially for short rows). 69 | */ 70 | template 71 | kernel void THClTensor_kernel_varInnermostDim(float *tgt, float *src_, unsigned num_rows, unsigned row_size) 72 | { 73 | local float ssum[32][16]; 74 | local float ssum2[32][16]; 75 | 76 | for (unsigned block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { 77 | unsigned row = block_row + get_local_id(1); 78 | float sum = 0, sum2 = 0; 79 | if (row < num_rows) { 80 | float *src = src_ + row * row_size; 81 | // Sequential reduction within a thread. 82 | for (unsigned col = get_local_id(0); col < row_size; col += get_local_size(0)) { 83 | float val = src[col]; 84 | sum += val; 85 | sum2 += val * val; 86 | } 87 | } 88 | ssum[get_local_id(1)][get_local_id(0)] = sum; 89 | ssum2[get_local_id(1)][get_local_id(0)] = sum2; 90 | barrier(CLK_LOCAL_MEM_FENCE); 91 | 92 | // Reduce intermediate values to single value. 
93 | for (unsigned s = 8; s > 1; s >>= 1) { 94 | if (row < num_rows && get_local_id(0) < s) { 95 | ssum[get_local_id(1)][get_local_id(0)] += ssum[get_local_id(1)][get_local_id(0) + s]; 96 | ssum2[get_local_id(1)][get_local_id(0)] += ssum2[get_local_id(1)][get_local_id(0) + s]; 97 | } 98 | barrier(CLK_LOCAL_MEM_FENCE); 99 | } 100 | 101 | if (row < num_rows && get_local_id(0) == 0) { 102 | sum = ssum[get_local_id(1)][0] + ssum[get_local_id(1)][1]; 103 | sum2 = ssum2[get_local_id(1)][0] + ssum2[get_local_id(1)][1]; 104 | tgt[row] = THClTensor_computeVar(sum, sum2, row_size); 105 | } 106 | barrier(CLK_LOCAL_MEM_FENCE); 107 | } 108 | } 109 | 110 | kernel void THClTensor_kernel_renorm(float *data, const float value, const long size, const float maxnorm) 111 | { 112 | local float buffer[32]; 113 | long tx = get_local_id(0); 114 | long bx = get_group_id(0); 115 | long step = get_local_size(0); 116 | float *row = data + size*bx; 117 | 118 | buffer[tx] = 0; 119 | 120 | // get norm of axis 121 | for (long i=tx; i<size; i+=step) 122 | { 123 | buffer[tx] += pow(fabs(row[i]), value); 124 | } 125 | // add (reduce) 126 | for (long stride = step >> 1; stride > 0; stride >>= 1) 127 | { 128 | barrier(CLK_LOCAL_MEM_FENCE); 129 | if (tx < stride) 130 | buffer[tx] += buffer[tx+stride]; 131 | } 132 | // clip norms 133 | barrier(CLK_LOCAL_MEM_FENCE); 134 | float norm = pow(buffer[0], 1/value); 135 | if (norm > maxnorm) 136 | { 137 | norm = maxnorm / (norm + 1e-7); 138 | // renormalize 139 | for (long i=tx; i<size; i+=step) 140 | { 141 | row[i] *= norm; 142 | } 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathCompare.cpp: -------------------------------------------------------------------------------- 1 | #include <string> 2 | 3 | #include "THClTensorMath.h" 4 | #include "THClGeneral.h" 5 | //#include "THClBlas.h" 6 | #include "THClTensorCopy.h" 7 | //#include "THCTensorRandom.h" 8 | #include "THClApply.h" 9 | #include "THClTensorMathCompare.h" 10 | 11 | using namespace std; 12 | 13 | #ifndef DIVUP 14 | #define DIVUP(x, y) (((x) + (y) - 1) / (y)) 15 | #endif 16 | 17 | void THClTensor_logicalValue(THClState *state, THClTensor *self_, THClTensor *src, HasOperator2 *op) 18 | { 19 | THClTensor_resizeAs(state, self_, src); 20 | 21 | if (!THClTensor_pointwiseApply2(state, self_, src, op)) { 22 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 23 | } 24 | } 25 | 26 | class TensorGenCompareValueOp : public HasOperator2, public HasScalars { 27 | public: 28 | int getNumScalars() const { return 1; } 29 | float getScalar( int index ) const { return val; } 30 | TensorGenCompareValueOp(std::string op, float v) : 31 | val(v), 32 | op(op) {} 33 | string operator2() const { 34 | return "*out = (*in1 " + op + " val1)"; 35 | } 36 | const float val; 37 | std::string op; 38 | }; 39 | 40 | #define GENERATE_THClTensor_LogValue(NAME, OP) \ 41 | void THClTensor_##NAME##Value(THClState *state, THClTensor *self_, THClTensor *src, float value) \ 42 | { \ 43 | THAssert(THClTensor_checkGPU(state, 2, self_, src)); \ 44 | TensorGenCompareValueOp op(#OP, value); \ 45 | THClTensor_logicalValue(state, self_, src, &op); \ 46 | } 47 | 48 | GENERATE_THClTensor_LogValue(ge, >=) 49 | GENERATE_THClTensor_LogValue(ne, !=) 50 | GENERATE_THClTensor_LogValue(eq, ==) 51 | GENERATE_THClTensor_LogValue(le, <=) 52 | GENERATE_THClTensor_LogValue(lt, <) 53 | GENERATE_THClTensor_LogValue(gt, >) 54 | 55 | class TensorGenComparePointTensorOp : public HasOperator2, public HasPointTensors { 56 | public: 57 | int getNumPointTensors() const { return 1; } 58 | const THClTensor *getPointTensor( int index ) const { return val; } 59 | TensorGenComparePointTensorOp(std::string op, THClTensor *v) : 60 | val(v), 61 | op(op) {} 62 | string operator2() const { 63 | return "*out = (*in1 " + op + " *pointTensor1)"; 64 | } 65 | const THClTensor *val; 66 | std::string op; 67 | }; 68 | 69 | #define
GENERATE_THClTensor_LogPointTensor(NAME, OP) \ 70 | void THClTensor_##NAME##PointTensor(THClState *state, THClTensor *self_, THClTensor *src, THClTensor *value) \ 71 | { \ 72 | THAssert(THClTensor_checkGPU(state, 3, self_, src, value)); \ 73 | TensorGenComparePointTensorOp op(#OP, value); \ 74 | THClTensor_logicalValue(state, self_, src, &op); \ 75 | } 76 | 77 | GENERATE_THClTensor_LogPointTensor(ge, >=) 78 | GENERATE_THClTensor_LogPointTensor(ne, !=) 79 | GENERATE_THClTensor_LogPointTensor(eq, ==) 80 | GENERATE_THClTensor_LogPointTensor(le, <=) 81 | GENERATE_THClTensor_LogPointTensor(lt, <) 82 | GENERATE_THClTensor_LogPointTensor(gt, >) 83 | 84 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathCompare.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | void THClTensor_logicalValue(THClState *state, THClTensor *self_, THClTensor *src, HasOperator2 *op); 4 | void THClTensor_logicalTensor(THClState *state, THClTensor *self_, THClTensor *src1, THClTensor *src2, HasOperator3 *op); 5 | 6 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathCompareT.cpp: -------------------------------------------------------------------------------- 1 | #include <string> 2 | 3 | #include "THClTensorMath.h" 4 | #include "THClGeneral.h" 5 | //#include "THCBlas.h" 6 | #include "THClTensorCopy.h" 7 | //#include "THCTensorRandom.h" 8 | #include "THClApply.h" 9 | //#include "THCReduce.cuh" 10 | #include "THClTensorMathCompare.h" 11 | 12 | using namespace std; 13 | 14 | #ifndef DIVUP 15 | #define DIVUP(x, y) (((x) + (y) - 1) / (y)) 16 | #endif 17 | 18 | void THClTensor_logicalTensor(THClState *state, THClTensor *self_, THClTensor *src1, THClTensor *src2, HasOperator3 *op) 19 | { 20 | THClTensor_resizeAs(state, self_, src1); 21 | THArgCheck(THClTensor_nElement(state, src1) == THClTensor_nElement(state, src2), 3, "sizes do not match"); 22 | 23 | if (!THClTensor_pointwiseApply3(state, self_, src1, src2, op)) { 24 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 25 | } 26 | } 27 | 28 | class TensorGenLogOp : public HasOperator3 { 29 | public: 30 | string logop; 31 | TensorGenLogOp(string logop) { 32 | this->logop = logop; 33 | } 34 | string operator3() const { 35 | return "*out = (float) (*in1 " + logop + " *in2)"; 36 | } 37 | }; 38 | 39 | #define GENERATE_THClTensor_LogOpTensor(NAME, LOGOP) \ 40 | void THClTensor_##NAME##Tensor(THClState *state, THClTensor *self_, THClTensor *src1, THClTensor *src2) \ 41 | { \ 42 | if( src2->nDimension == 0 ) { \ 43 | THClTensor_##NAME##PointTensor(state, self_, src1, src2); \ 44 | return; \ 45 | } \ 46 | THAssert(THClTensor_checkGPU(state, 3, self_, src1, src2)); \ 47 | TensorGenLogOp op(#LOGOP); \ 48 | THClTensor_logicalTensor(state, self_, src1, src2, &op); \ 49 | } 50 | 51 | GENERATE_THClTensor_LogOpTensor(lt, <) 52 | GENERATE_THClTensor_LogOpTensor(gt, >) 53 | GENERATE_THClTensor_LogOpTensor(le, <=) 54 | GENERATE_THClTensor_LogOpTensor(ge, >=) 55 | GENERATE_THClTensor_LogOpTensor(ne, !=) 56 | GENERATE_THClTensor_LogOpTensor(eq, ==) 57 | 58 | 59 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathPairwise.cpp: -------------------------------------------------------------------------------- 1 | #include "THClTensorMath.h" 2 | #include "THClGeneral.h" 3 | //#include "THCBlas.h" 4 | #include "THClTensorCopy.h" 5 | //#include "THCTensorRandom.h" 6 | #include "THClApply.h" 7 |
//#include "THCReduce.cuh" 8 | 9 | #include 10 | #include 11 | 12 | using namespace std; 13 | 14 | #ifndef DIVUP 15 | #define DIVUP(x, y) (((x) + (y) - 1) / (y)) 16 | #endif 17 | 18 | class TensorAddConstantOp : public HasOperator1, public HasOperator2, public HasScalars { 19 | public: 20 | int getNumScalars() const { return 1; } 21 | float getScalar( int index ) const { return val; } 22 | TensorAddConstantOp(float v) : val(v) {} 23 | string operator2() const { 24 | return "*out = *in1 + val1"; 25 | } 26 | string operator1() const { 27 | return "*out += val1"; 28 | } 29 | const float val; 30 | }; 31 | 32 | class TensorAddPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 33 | public: 34 | int getNumPointTensors() const { return 1; } 35 | const THClTensor *getPointTensor( int index ) const { return val; } 36 | TensorAddPointTensorOp(THClTensor *v) : val(v) {} 37 | string operator2() const { 38 | return "*out = *in1 + *pointTensor1"; 39 | } 40 | string operator1() const { 41 | return "*out += *pointTensor1"; 42 | } 43 | const THClTensor *val; 44 | }; 45 | 46 | class TensorSubPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 47 | public: 48 | int getNumPointTensors() const { return 1; } 49 | const THClTensor *getPointTensor( int index ) const { return val; } 50 | TensorSubPointTensorOp(THClTensor *v) : val(v) {} 51 | string operator2() const { 52 | return "*out = *in1 - *pointTensor1"; 53 | } 54 | string operator1() const { 55 | return "*out -= *pointTensor1"; 56 | } 57 | const THClTensor *val; 58 | }; 59 | 60 | void THClTensor_add(THClState *state, THClTensor *self_, THClTensor *src_, float value) 61 | { 62 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 63 | if (self_ == src_) { 64 | TensorAddConstantOp op(value); 65 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 66 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 67 | } 68 | } else { 69 | THClTensor_resizeAs(state, self_, src_); 70 | 71 | TensorAddConstantOp op(value); 72 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 73 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 74 | } 75 | } 76 | } 77 | 78 | void THClTensor_sub(THClState *state, THClTensor *self_, THClTensor *src_, float value) 79 | { 80 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 81 | TensorAddConstantOp op(-value); 82 | if (self_ == src_) { 83 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 84 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 85 | } 86 | } else { 87 | THClTensor_resizeAs(state, self_, src_); 88 | 89 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 90 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 91 | } 92 | } 93 | } 94 | 95 | class TensorMulConstantOp : public HasOperator2, public HasOperator1, public HasScalars { 96 | public: 97 | int getNumScalars() const { return 1; } 98 | float getScalar( int index ) const { return val; } 99 | TensorMulConstantOp(float v) : val(v) {} 100 | string operator2() const { 101 | return "*out = *in1 * val1"; 102 | } 103 | string operator1() const { 104 | return "*out *= val1"; 105 | } 106 | const float val; 107 | }; 108 | 109 | class TensorMulPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 110 | public: 111 | int getNumPointTensors() const { return 1; } 112 | const THClTensor *getPointTensor( int index ) const { return val; } 113 | TensorMulPointTensorOp(THClTensor *v) : val(v) {} 114 | string operator2() const { 115 | return "*out = *in1 * *pointTensor1"; 116 | } 117 | string 
operator1() const { 118 | return "*out *= *pointTensor1"; 119 | } 120 | const THClTensor *val; 121 | }; 122 | 123 | class TensorDivPointTensorOp : public HasOperator2, public HasOperator1, public HasPointTensors { 124 | public: 125 | int getNumPointTensors() const { return 1; } 126 | const THClTensor *getPointTensor( int index ) const { return val; } 127 | TensorDivPointTensorOp(THClTensor *v) : val(v) {} 128 | string operator2() const { 129 | return "*out = *in1 / *pointTensor1"; 130 | } 131 | string operator1() const { 132 | return "*out /= *pointTensor1"; 133 | } 134 | const THClTensor *val; 135 | }; 136 | 137 | void THClTensor_mul(THClState *state, THClTensor *self_, THClTensor *src_, float value) 138 | { 139 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 140 | if (self_ == src_) { 141 | TensorMulConstantOp op(value); 142 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 143 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 144 | } 145 | } else { 146 | THClTensor_resizeAs(state, self_, src_); 147 | 148 | TensorMulConstantOp op(value); 149 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 150 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 151 | } 152 | } 153 | } 154 | 155 | void THClTensor_add_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 156 | { 157 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 158 | TensorAddPointTensorOp op(value_); 159 | if (self_ == src_) { 160 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 161 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 162 | } 163 | } else { 164 | THClTensor_resizeAs(state, self_, src_); 165 | 166 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 167 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 168 | } 169 | } 170 | } 171 | 172 | void THClTensor_sub_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 173 | { 174 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 175 | TensorSubPointTensorOp op(value_); 176 | if (self_ == src_) { 177 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 178 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 179 | } 180 | } else { 181 | THClTensor_resizeAs(state, self_, src_); 182 | 183 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 184 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 185 | } 186 | } 187 | } 188 | 189 | void THClTensor_mul_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 190 | { 191 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 192 | TensorMulPointTensorOp op(value_); 193 | if (self_ == src_) { 194 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 195 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 196 | } 197 | } else { 198 | THClTensor_resizeAs(state, self_, src_); 199 | 200 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 201 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 202 | } 203 | } 204 | } 205 | 206 | void THClTensor_div_gpu(THClState *state, THClTensor *self_, THClTensor *src_, THClTensor *value_) 207 | { 208 | THAssert(THClTensor_checkGPU(state, 3, self_, src_, value_)); 209 | TensorDivPointTensorOp op(value_); 210 | if (self_ == src_) { 211 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 212 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 213 | } 214 | } else { 215 | THClTensor_resizeAs(state, self_, src_); 216 | 217 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 218 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 219 | } 220 | } 221 | } 222 | 223 | void 
THClTensor_div(THClState* state, THClTensor *self_, THClTensor *src_, float value) 224 | { 225 | THAssert(THClTensor_checkGPU(state, 2, self_, src_)); 226 | THArgCheck(value != 0.0f, 3, "divide by zero"); 227 | 228 | if (self_ == src_) { 229 | TensorMulConstantOp op(1.0f / value); 230 | if (!THClTensor_pointwiseApply1(state, self_, &op)) { 231 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 232 | } 233 | } else { 234 | THClTensor_resizeAs(state, self_, src_); 235 | 236 | TensorMulConstantOp op(1.0f / value); 237 | if (!THClTensor_pointwiseApply2(state, self_, src_, &op)) { 238 | THArgCheck(false, 2, CLTORCH_DIM_WARNING); 239 | } 240 | } 241 | } 242 | 243 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathPointwise.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | class TensorSigmoidOp : public HasOperator1, public HasOperator2 { 4 | public: 5 | TensorSigmoidOp() { 6 | } 7 | std::string operator1() const { 8 | return "*out = 1.0f / (1 + exp(- (*out)))"; 9 | } 10 | std::string operator2() const { 11 | return "*out = 1.0f / (1 + exp(- (*in1)))"; 12 | } 13 | }; 14 | 15 | class TensorGenOp : public HasOperator1, public HasOperator2 { 16 | public: 17 | std::string cfun; 18 | TensorGenOp( std::string cfun ) { 19 | this->cfun = cfun; 20 | } 21 | std::string operator1() const { 22 | return "*out =" + cfun + "( *out )"; 23 | } 24 | std::string operator2() const { 25 | return "*out = " + cfun + "( *in1 )"; 26 | } 27 | }; 28 | 29 | class TensorGenOpFullInline1 : public HasOperator1 { 30 | public: 31 | std::string cfun; 32 | TensorGenOpFullInline1( std::string cfun ) { 33 | this->cfun = cfun; 34 | } 35 | std::string operator1() const { 36 | return cfun; 37 | } 38 | }; 39 | 40 | class TensorGenOpFullInline2 : public HasOperator2 { 41 | public: 42 | std::string cfun; 43 | TensorGenOpFullInline2( std::string cfun ) { 44 | this->cfun = cfun; 45 | } 46 | std::string operator2() const { 47 | return cfun; 48 | } 49 | }; 50 | 51 | class TensorGenOpFullInline3 : public HasOperator3 { 52 | public: 53 | std::string cfun; 54 | TensorGenOpFullInline3( std::string cfun ) { 55 | this->cfun = cfun; 56 | } 57 | std::string operator3() const { 58 | return cfun; 59 | } 60 | }; 61 | 62 | // used for maxall etc 63 | class MaxOp : public HasOperator2, public HasOperator3 { 64 | public: 65 | std::string operator2() const { 66 | return "*out = fmax(*out, *in1)"; 67 | } 68 | std::string operator3() const { 69 | return "*out = fmax(*in1, *in2)"; 70 | } 71 | }; 72 | 73 | // used for minall etc 74 | class MinOp : public HasOperator2, public HasOperator3 { 75 | public: 76 | std::string operator2() const { 77 | return "*out = fmin(*out, *in1)"; 78 | } 79 | std::string operator3() const { 80 | return "*out = fmin(*in1, *in2)"; 81 | } 82 | }; 83 | 84 | class TensorAddOp : public HasOperator2, public HasOperator3 { 85 | public: 86 | std::string operator2() const { 87 | return "*out += *in1"; 88 | } 89 | std::string operator3() const { 90 | return "*out = *in1 + *in2"; 91 | } 92 | }; 93 | 94 | class TensorCAddOp : public HasOperator2, public HasOperator3, public HasScalars { 95 | public: 96 | int getNumScalars() const { return 1; } 97 | float getScalar(int index) const { return val; } 98 | TensorCAddOp(float v) : val(v) {} 99 | std::string operator2() const { 100 | return "*out += val1 * *in1"; 101 | } 102 | std::string operator3() const { 103 | return "*out = *in1 + val1 * *in2"; 104 | } 105 | float val; 106 | }; 107 | 108 | 
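// How these Op classes fit together (a sketch; the kernel assembly itself lives in
// THClApply): each Op contributes an expression string, and the HasScalars /
// HasPointTensors interfaces tell the apply machinery which extra arguments to bind
// as val1 / pointTensor1 in the generated OpenCL source. For example, assuming r, a,
// b are already-created THClTensor pointers:
//   TensorCAddOp op(0.5f);                            // binds val1 = 0.5
//   THClTensor_pointwiseApply3(state, r, a, b, &op);
// splices op.operator3(), i.e. "*out = *in1 + val1 * *in2", into the kernel, giving
// r = a + 0.5 * b elementwise.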
class TensorSubOp : public HasOperator2, public HasOperator3 { 109 | public: 110 | std::string operator2() const { 111 | return "*out -= *in1"; 112 | } 113 | std::string operator3() const { 114 | return "*out = *in1 - *in2"; 115 | } 116 | }; 117 | 118 | class TensorCSubOp : public HasOperator2, public HasOperator3, public HasScalars { 119 | public: 120 | int getNumScalars() const { return 1; } 121 | float getScalar(int index) const { return val; } 122 | TensorCSubOp(float v) : val(v) {} 123 | std::string operator2() const { 124 | return "*out -= val1 * *in1"; 125 | } 126 | std::string operator3() const { 127 | return "*out = *in1 - val1 * *in2"; 128 | } 129 | float val; 130 | }; 131 | 132 | class TensorMulOp : public HasOperator2, public HasOperator3 { 133 | public: 134 | std::string operator2() const { 135 | return "*out *= *in1"; 136 | } 137 | std::string operator3() const { 138 | return "*out = (*in1) * (*in2)"; 139 | } 140 | }; 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathScan.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMathScan.cu: 2 | 3 | /* Perform an inclusive scan along an outer dimension of a tensor. 4 | * 5 | * - num_orows is the size of the flattened outer dimensions; 6 | * - num_irows is the size of the flattened inner dimensions; 7 | * - row_size is the size of the dimension along which to scan; 8 | * 9 | * The dimensions to the outside and inside of the specified dimension are considered as flattened. 10 | * Thread blocks with the same get_group_id(1) process an "outer row" (i.e. an element of the flattened 11 | * outer dimensions, which contains several "inner rows"). 12 | * Each thread processes a single inner row at a time. 13 | */ 14 | 15 | static inline float binary_op(float _in1, float _in2) { 16 | // hope the compiler can handle this :-P 17 | float _out; 18 | float *out = &_out; 19 | float *in1 = &_in1; 20 | float *in2 = &_in2; 21 | *out = 10 * (*in2) * (*in1); 22 | {{operator3}}; 23 | // *out = (*in1) * (*in2); 24 | return _out; 25 | } 26 | 27 | kernel void THClTensor_kernel_scanOuterDim( 28 | global float *tgt_data, int tgt_offset, 29 | global float *src_data, int src_offset, 30 | int num_orows, int num_irows, int row_size, 31 | float init) 32 | { 33 | for (unsigned orow = get_group_id(0); (int)orow < num_orows; orow += get_num_groups(0)) { 34 | for (unsigned irow = get_group_id(1) * get_local_size(0) + get_local_id(0); (int)irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { 35 | global float *src = src_data + src_offset + orow * row_size * num_irows + irow; 36 | global float *tgt = tgt_data + tgt_offset + orow * row_size * num_irows + irow; 37 | float acc = init; 38 | 39 | for (unsigned col = 0; (int)col < row_size; ++col) { 40 | acc = binary_op(acc, *src); 41 | // binary_op(&acc, &acc, src); 42 | *tgt = acc; 43 | 44 | src += num_irows; 45 | tgt += num_irows; 46 | } 47 | } 48 | } 49 | } 50 | 51 | /* Perform an inclusive scan along the innermost dimension of a tensor. 52 | * 53 | * - num_rows is the size of the flattened outer dimensions; 54 | * - row_size is the size of the innermost dimension; 55 | * 56 | * The outer dimensions of the tensor are considered as a single dimension, i.e. the tensor is 57 | * considered as having 'num_rows' rows of size 'row_size'.
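 *
 * The scan operator itself is spliced into binary_op (near the top of this file) through
 * the {{operator3}} template slot, with 'init' as its identity element: presumably
 * "*out = *in1 + *in2" with init 0 for cumsum, and multiplication with init 1 for
 * cumprod. As a concrete trace, an inclusive sum-scan of the row {3, 1, 4, 1} writes
 * {3, 4, 8, 9}; each output is binary_op of the running accumulator and the next input.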
58 | * Each thread block processes one or more sets of contiguous rows (processing multiple rows 59 | * per thread block is quicker than processing a single row, especially for short rows). 60 | */ 61 | //template 62 | kernel void THClTensor_kernel_scanInnermostDim( 63 | global float *tgt_data, int tgt_offset, 64 | global float *src_data, int src_offset, 65 | int num_rows, int row_size, 66 | float init) 67 | { 68 | local float sbuf[{{num_threads_y}}][2 * {{num_threads_x}}]; 69 | 70 | local float* row_buf = sbuf[get_local_id(1)]; 71 | 72 | for (int block_row = get_group_id(0) * get_local_size(1); 73 | block_row < num_rows; 74 | block_row += get_local_size(1) * get_num_groups(0)) { 75 | int row = block_row + get_local_id(1); 76 | float block_total = init; 77 | 78 | global float *row_src = src_data + src_offset + row * row_size; 79 | global float *row_tgt = tgt_data + tgt_offset + row * row_size; 80 | 81 | // Perform scan on one block at a time, keeping track of the total value of 82 | // all blocks processed so far. 83 | for (int block_col = 0; block_col < (int)row_size; block_col += 2 * {{num_threads_x}}) { 84 | // Load data into shared memory (two values per thread). 85 | int col1 = block_col + get_local_id(0); 86 | int col2 = block_col + {{num_threads_x}} + get_local_id(0); 87 | if (row < num_rows) { 88 | if (col1 < row_size) { 89 | row_buf[get_local_id(0)] = row_src[col1]; 90 | } else { 91 | row_buf[get_local_id(0)] = init; 92 | } 93 | 94 | if (col2 < row_size) { 95 | row_buf[{{num_threads_x}} + get_local_id(0)] = row_src[col2]; 96 | } else { 97 | row_buf[{{num_threads_x}} + get_local_id(0)] = init; 98 | } 99 | 100 | // Add the total value of all previous blocks to the first value of this block. 101 | if (get_local_id(0) == 0) { 102 | row_buf[0] = binary_op(row_buf[0], block_total); 103 | // binary_op(row_buf, row_buf, &block_total); 104 | } 105 | } 106 | barrier(CLK_LOCAL_MEM_FENCE); 107 | 108 | // Parallel reduction (up-sweep). 109 | for (int s = {{num_threads_x}}, d = 1; s >= 1; s >>= 1, d <<= 1) { 110 | if (row < num_rows && (int)get_local_id(0) < s) { 111 | int offset = (2 * get_local_id(0) + 1) * d - 1; 112 | row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); 113 | // binary_op(row_bufer + offset + d, row_buf + offset, row_buf + offset + d); 114 | } 115 | barrier(CLK_LOCAL_MEM_FENCE); 116 | } 117 | 118 | // Down-sweep. 119 | for (int s = 2, d = {{num_threads_x}} / 2; d >= 1; s <<= 1, d >>= 1) { 120 | if (row < num_rows && (int)get_local_id(0) < s - 1) { 121 | int offset = 2 * (get_local_id(0) + 1) * d - 1; 122 | row_buf[offset + d] = binary_op(row_buf[offset], row_buf[offset + d]); 123 | // binary_op(row_buff + offset + d, row_buf + offset, row_buf + offset + d); 124 | } 125 | barrier(CLK_LOCAL_MEM_FENCE); 126 | } 127 | 128 | 129 | // Write back to output. 
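// At this point row_buf[0 .. 2*T-1] (T = {{num_threads_x}}) holds the inclusive prefix
// scan of the chunk just loaded: the up-sweep builds partial sums at stride boundaries
// and the down-sweep fills in the remaining positions, in the style of a work-efficient
// Brent-Kung scan. block_total then carries row_buf's last element into the next chunk.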
130 | if (row < num_rows) { 131 | if (col1 < row_size) row_tgt[col1] = row_buf[get_local_id(0)]; 132 | if (col2 < row_size) row_tgt[col2] = row_buf[{{num_threads_x}} + get_local_id(0)]; 133 | } 134 | block_total = row_buf[2 * {{num_threads_x}} - 1]; 135 | barrier(CLK_LOCAL_MEM_FENCE); 136 | 137 | } 138 | } 139 | } 140 | 141 | -------------------------------------------------------------------------------- /src/lib/THClTensorMathTransformReduce.cl: -------------------------------------------------------------------------------- 1 | // from lib/THC/THCTensorMathTransformReduce.cu: 2 | 3 | typedef struct Pair { 4 | float first; 5 | float second; 6 | } Pair; 7 | 8 | static Pair binary_op( Pair a, Pair b ) { 9 | {{pair_operator2}}; 10 | } 11 | 12 | /* A set of reduction kernels that take in binary ops on thrust pairs (of value, index). 13 | These are useful when you not only have to do a reduction, but you might have 14 | to preserve the location of contention (for example min/max operations). 15 | The structure of the kernels follows the structure of the reduction kernels. 16 | */ 17 | kernel void THClTensor_kernel_transformReduceOuterDimIndex( 18 | global float *tgt1_data, int tgt1_offset, 19 | global float *tgt2_data, int tgt2_offset, 20 | global float *src__data, int src__offset, 21 | int num_orows, int num_irows, int row_size 22 | ) { 23 | global float *tgt1 = tgt1_data + tgt1_offset; 24 | global float *tgt2 = tgt2_data + tgt2_offset; 25 | global float *src_ = src__data + src__offset; 26 | 27 | for (int orow = get_group_id(0); orow < num_orows; orow += get_num_groups(0)) { 28 | for (int irow = get_group_id(1) * get_local_size(0) + get_local_id(0); irow < num_irows; irow += get_num_groups(1) * get_local_size(0)) { 29 | global float *src = src_ + orow * row_size * num_irows + irow; 30 | Pair acc = {.first={{init}}, .second=-1}; 31 | for (int col = 0; col < row_size; ++col) { 32 | Pair lhs = {*src, col+1}; 33 | acc = binary_op( lhs, acc); 34 | // acc = binary_op(thrust::make_pair(*src, col+1), acc); // i+1 for 1-indexing 35 | src += num_irows; 36 | } 37 | tgt1[orow * num_irows + irow] = acc.first; 38 | tgt2[orow * num_irows + irow] = acc.second; 39 | } 40 | } 41 | } 42 | 43 | /* Reduce the innermost dimension of a tensor (on thrust::pair functors which are (value, index)) 44 | * 45 | * For an n-d tensor (n <= 4) where the reduction is along the innermost dimension: 46 | * 47 | * - block.x is the innermost dimension, i.e. dimension 0; 48 | * - block.y and grid.y make up dimension 1; and 49 | * - grid.x and grid.z are the remaining two outer dimensions (if any) 50 | * 51 | * Reduction along other dimensions is handled in a separate kernel.
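 *
 * {{init}} and {{pair_operator2}} are template slots filled in at kernel-build time. For
 * a max-with-index reduction, one plausible expansion (an illustration, not taken from
 * this repo) is init = -FLT_MAX with the pair operator
 *   if( a.first >= b.first ) { return a; } else { return b; }
 * so that .first carries the running maximum and .second its 1-based source index
 * (the col+1 written above).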
52 | */ 53 | kernel void THClTensor_kernel_transformReduceInnermostDimIndex( 54 | global float *tgt1_data, int tgt1_offset, 55 | global float *tgt2_data, int tgt2_offset, 56 | global float *src__data, int src__offset, 57 | int num_rows, int row_size 58 | ) { 59 | global float *tgt1 = tgt1_data + tgt1_offset; 60 | global float *tgt2 = tgt2_data + tgt2_offset; 61 | global float *src_ = src__data + src__offset; 62 | 63 | local float sbuf[{{y_threads}}][{{x_threads}}]; 64 | local float ibuf[{{y_threads}}][{{x_threads}}]; 65 | 66 | for (int block_row = get_group_id(0) * get_local_size(1); block_row < num_rows; block_row += get_local_size(1) * get_num_groups(0)) { 67 | int row = block_row + get_local_id(1); 68 | // thrust::pair<float, float> acc = init; 69 | Pair acc = { .first={{init}}, .second=-1 }; 70 | if (row < num_rows) { 71 | global float *src = src_ + row * row_size; 72 | // Sequential reduction within a thread. 73 | for (int col = get_local_id(0); col < row_size; col += get_local_size(0)) { 74 | Pair lhs = {src[col], col+1}; 75 | acc = binary_op(lhs, acc); 76 | } 77 | } 78 | 79 | sbuf[get_local_id(1)][get_local_id(0)] = acc.first; 80 | ibuf[get_local_id(1)][get_local_id(0)] = acc.second; 81 | 82 | // Reduce intermediate values to single value. 83 | local float* sline = &sbuf[get_local_id(1)][0]; 84 | local float* iline = &ibuf[get_local_id(1)][0]; 85 | for (int s = 8; s > 0; s >>= 1) { 86 | if (row < num_rows && (int)get_local_id(0) < s) { 87 | Pair arg1 = {.first=sline[get_local_id(0)], .second=iline[get_local_id(0)]}; 88 | Pair arg2 = {.first=sline[get_local_id(0) + s], .second=iline[get_local_id(0) + s]}; 89 | Pair res = binary_op(arg1, arg2); 90 | sline[get_local_id(0)] = res.first; 91 | iline[get_local_id(0)] = res.second; 92 | } 93 | barrier(CLK_LOCAL_MEM_FENCE); 94 | } 95 | 96 | if (row < num_rows && get_local_id(0) == 0) { 97 | tgt1[row] = sline[0]; 98 | tgt2[row] = iline[0]; 99 | } 100 | barrier(CLK_LOCAL_MEM_FENCE); 101 | } 102 | } 103 | 104 | -------------------------------------------------------------------------------- /src/lib/THClTypeParseTraits.cpp: -------------------------------------------------------------------------------- 1 | #include "THClTypeParseTraits.h" 2 | 3 | #define REGISTER_PARSE_TYPE(X) template <> struct TypeParseTraits<X> \ 4 | { static const char* name; } ; const char* TypeParseTraits<X>::name = #X 5 | 6 | #define REGISTER_PARSE_TYPE_DEFINITION(X) \ 7 | const char* TypeParseTraits<X>::name = #X 8 | 9 | 10 | REGISTER_PARSE_TYPE_DEFINITION(unsigned int); 11 | REGISTER_PARSE_TYPE_DEFINITION(unsigned long long); 12 | 13 | 14 | -------------------------------------------------------------------------------- /src/lib/THClTypeParseTraits.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // adapted from http://stackoverflow.com/questions/1055452/c-get-name-of-type-in-template 4 | template <typename T> 5 | struct TypeParseTraits; 6 | 7 | #define REGISTER_PARSE_TYPE_DECLARATION(X) template <> struct TypeParseTraits<X> \ 8 | { static const char* name; } ; 9 | 10 | 11 | REGISTER_PARSE_TYPE_DECLARATION(unsigned int); 12 | REGISTER_PARSE_TYPE_DECLARATION(unsigned long); 13 | REGISTER_PARSE_TYPE_DECLARATION(unsigned long long); 14 | 15 | -------------------------------------------------------------------------------- /src/test/run-test-device.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: should be called from root directory 4 | 5 | source ~/torch/activate || exit 1 6 |
luarocks make rocks/cltorch-scm-1.rockspec || exit 1 7 | 8 | if [[ ! -v LUAEXE ]]; then { 9 | LUAEXE=luajit 10 | } fi 11 | echo using luaexe: ${LUAEXE} 12 | 13 | if [[ x${RUNGDB} == x1 ]]; then { 14 | rungdb.sh ${LUAEXE} test/test-device.lua 15 | } else { 16 | ${LUAEXE} test/test-device.lua 17 | } fi 18 | 19 | -------------------------------------------------------------------------------- /src/test/run-test-perf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: should be called from root directory 4 | 5 | source ~/torch/activate || exit 1 6 | luarocks make rocks/cltorch-scm-1.rockspec || exit 1 7 | 8 | if [[ ! -v LUAEXE ]]; then { 9 | LUAEXE=luajit 10 | } fi 11 | echo using luaexe: ${LUAEXE} 12 | 13 | if [[ x${RUNGDB} == x1 ]]; then { 14 | rungdb.sh ${LUAEXE} test/test-perf.lua 15 | } else { 16 | ${LUAEXE} test/test-perf.lua 17 | } fi 18 | 19 | -------------------------------------------------------------------------------- /src/test/run-test-tensor.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Note: should be called from root directory 4 | 5 | source ~/torch/activate || exit 1 6 | luarocks make rocks/cltorch-scm-1.rockspec || exit 1 7 | 8 | if [[ ! -v LUAEXE ]]; then { 9 | LUAEXE=luajit 10 | } fi 11 | echo using luaexe: ${LUAEXE} 12 | 13 | if [[ x${RUNGDB} == x1 ]]; then { 14 | rungdb.sh ${LUAEXE} test/test-tensor.lua 15 | } else { 16 | ${LUAEXE} test/test-tensor.lua 17 | } fi 18 | 19 | -------------------------------------------------------------------------------- /src/test/test-device.lua: -------------------------------------------------------------------------------- 1 | print("running require cltorch...") 2 | require 'cltorch' 3 | print("... 
require cltorch done") 4 | 5 | numDevices = cltorch.getDeviceCount() 6 | print('num devices:', numDevices) 7 | 8 | for device=1,numDevices do 9 | props = cltorch.getDeviceProperties(device) 10 | print('device properties, device', device) 11 | for k,v in pairs(props) do 12 | print(' ', k, v) 13 | end 14 | end 15 | 16 | for device=1,numDevices do 17 | cltorch.setDevice(device) 18 | c = torch.ClTensor{7,-4,5} 19 | print('c1\n', c) 20 | print(c:abs()) 21 | end 22 | 23 | --c = torch.ClTensor{7,4,5} 24 | --print('c1\n', c) 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /src/test/test-perf.lua: -------------------------------------------------------------------------------- 1 | require 'cltorch' 2 | require 'sys' 3 | 4 | function test_apply1(its) 5 | a = torch.ClTensor(50, 500) 6 | a:uniform() 7 | a:add(1) 8 | cltorch.dumpProfiling() 9 | cltorch.dumpTimings() 10 | for it=1,its do 11 | a:add(it) 12 | end 13 | cltorch.dumpTimings() 14 | end 15 | 16 | function test_apply2(its) 17 | a = torch.ClTensor(50, 500) 18 | a:uniform() 19 | b = torch.ClTensor(50, 500) 20 | b:uniform() 21 | a:add(b) 22 | cltorch.dumpProfiling() 23 | cltorch.dumpTimings() 24 | for it=1,its do 25 | a:add(b) 26 | end 27 | cltorch.dumpTimings() 28 | end 29 | 30 | function test_scatterFill(its) 31 | idx = torch.multinomial(torch.range(1,10):reshape(10,1):expand(10,10):t(),10):t():cl() 32 | a = torch.Tensor(10,10):uniform():mul(100):int():cl() 33 | c = a:scatter(1,idx,3) 34 | cltorch.dumpProfiling() 35 | cltorch.dumpTimings() 36 | for it=1,its do 37 | a:scatter(1,idx,it) 38 | end 39 | cltorch.dumpProfiling() 40 | cltorch.dumpTimings() 41 | end 42 | 43 | function test_apply3(its, size) 44 | its = its or 900 45 | size = size or 6400 46 | a = torch.ClTensor(size) 47 | a:uniform() 48 | b = torch.ClTensor(size) 49 | b:uniform() 50 | c = torch.ClTensor(size) 51 | c:uniform() 52 | a:cmul(b,c) 53 | cltorch.finish() 54 | sys.tic() 55 | cltorch.dumpProfiling() 56 | cltorch.dumpTimings() 57 | for it=1,its do 58 | a:cmul(b,c) 59 | end 60 | cltorch.finish() 61 | print(sys.toc() * 1000) 62 | print('after mul', its, size) 63 | cltorch.dumpTimings() 64 | cltorch.dumpProfiling() 65 | end 66 | 67 | function test_apply3b(its, size) 68 | its = its or 900 69 | size = size or 6400 70 | a = torch.ClTensor(size) 71 | a:uniform() 72 | b = torch.ClTensor(size) 73 | b:uniform() 74 | c = torch.ClTensor(size) 75 | c:uniform() 76 | d = torch.ClTensor(size) 77 | d:uniform() 78 | a:cmul(b,c) 79 | a:add(b,c) 80 | e = torch.ClTensor(1000,1000) 81 | cltorch.finish() 82 | cltorch.dumpProfiling() 83 | cltorch.dumpTimings() 84 | sys.tic() 85 | for it=1,its do 86 | --d:cmul(b,c) 87 | f = e:sum() 88 | c:add(a,d) 89 | end 90 | cltorch.finish() 91 | print('sys.toc', sys.toc() * 1000) 92 | print('after mul', its, size) 93 | cltorch.dumpTimings() 94 | cltorch.dumpProfiling() 95 | end 96 | 97 | cltorch.setAddFinish(1) 98 | cltorch.setDevice(1) 99 | --cltorch.setProfiling(1) 100 | cltorch.setTiming(1) 101 | --test_apply1(500) 102 | --test_apply2(500) 103 | -- test_scatterFill(10000) 104 | --test_apply3(900, 6400) 105 | test_apply3b(900, 6400) 106 | test_apply3(900, 64000) 107 | --test_apply3(900, 64000) 108 | --test_apply3(900, 640000) 109 | --cltorch.dumpProfiling() 110 | 111 | 112 | -------------------------------------------------------------------------------- /src/test/test-zsh.zsh: -------------------------------------------------------------------------------- 1 | 2 | ps 3 | source ~/torch/install/bin/torch-activate 4 
| env | grep LD 5 | env | grep PATH 6 | env | grep LUA 7 | luajit -e 'print("hello")' 8 | luajit -l torch -e 'print(torch.Tensor(3,2):uniform())' 9 | luajit -l cltorch -e 'cltorch.setAllowNonGpus(1); print(torch.ClTensor(3,2):uniform())' 10 | 11 | -------------------------------------------------------------------------------- /src/test/test_userkernel.lua: -------------------------------------------------------------------------------- 1 | require 'cltorch' 2 | 3 | k = torch.ClKernel({input={nElements='int', input='torch.ClTensor'},output={output='torch.ClTensor'},src=[[ 4 | int linearId = get_global_id(0); 5 | if(linearId < nElements) { 6 | output_data[linearId] = input_data[linearId] + 3.0f; 7 | } 8 | ]]}) 9 | print('k', k) 10 | k:print() 11 | 12 | x = torch.ClTensor({3,5,2}) 13 | y = torch.ClTensor({6,4,2}) 14 | print('x before\n', x) 15 | print('y before\n', y) 16 | 17 | k:run({nElements=3, input=x, output=y}, {numWorkgroups=10, workgroupSize=32}) 18 | 19 | print('y after\n', y) 20 | 21 | -------------------------------------------------------------------------------- /src/test/unit_storage.lua: -------------------------------------------------------------------------------- 1 | require 'string' 2 | 3 | local runtests = false 4 | if not cltorch then 5 | print('requiring cltorch') 6 | require 'cltorch' 7 | runtests = true 8 | end 9 | 10 | if not cltorch.tests then 11 | cltorch.tests = {} 12 | end 13 | 14 | cltorch.tests.storage = {} 15 | 16 | local function assertStrContains(target, value ) 17 | local res = string.find(target, value) 18 | if res == nil then 19 | print('assertStrContains fail: [' .. string.gsub(target, '\n', '\\n\n') .. '] not contains [' .. string.gsub(value, '\n', '\\n\n') .. ']') 20 | tester:assert(string.find(target, value) ~= nil) 21 | end 22 | end 23 | 24 | function cltorch.tests.storage.test_basic() 25 | tester:asserteq('[torch.ClStorage of size 0]\n', tostring(torch.ClStorage()), '') 26 | assertStrContains(tostring(torch.ClStorage(3)), '%[torch.ClStorage of size 3%]\n') 27 | tester:asserteq(tostring(torch.ClStorage{4,9,2}), ' 4\n 9\n 2\n[torch.ClStorage of size 3]\n') 28 | tester:asserteq(tostring(torch.ClStorage{1.5,2.4,5.3}), ' 1.5000\n 2.4000\n 5.3000\n[torch.ClStorage of size 3]\n') 29 | 30 | c = torch.ClStorage{4,9,2} 31 | c:fill(7) 32 | tester:asserteq(tostring(c), ' 7\n 7\n 7\n[torch.ClStorage of size 3]\n', '') 33 | 34 | c = torch.ClStorage{4,9,2} 35 | c:copy(torch.Storage{1.5,2.4,5.3}) 36 | tester:asserteq(tostring(c), ' 1.5000\n 2.4000\n 5.3000\n[torch.ClStorage of size 3]\n') 37 | 38 | a = torch.Storage(3) 39 | c = torch.ClStorage{4,9,2} 40 | a:copy(c) 41 | tester:asserteq(tostring(a), ' 4\n 9\n 2\n[torch.DoubleStorage of size 3]\n') 42 | 43 | -- removed, since copies whole buffer :-( 44 | -- c = torch.ClStorage{4,9,2} 45 | -- c[2] = 21 46 | -- tester:asserteq(tostring(c), '\n 4\n 21\n 2\n[torch.ClStorage of size 3]\n') 47 | 48 | c = torch.ClStorage{4,9,2} 49 | d = torch.ClStorage(3) 50 | d:copy(c) 51 | tester:asserteq(tostring(d), ' 4\n 9\n 2\n[torch.ClStorage of size 3]\n') 52 | tester:asserteq(3, #d) 53 | tester:asserteq(3, d:size()) 54 | 55 | c:resize(5) 56 | tester:asserteq(5, #c) 57 | c:fill(1) 58 | tester:asserteq(tostring(c), ' 1\n 1\n 1\n 1\n 1\n[torch.ClStorage of size 5]\n') 59 | end 60 | 61 | function cltorch.tests.storage.test_get() 62 | -- we probably should support this. 
specifically, without this, lbfgs doesnt work :-P 63 | -- a = torch.Storage(10000) 64 | acl = torch.ClStorage(10000) 65 | tester:asserteq('torch.ClStorage', torch.type(acl)) 66 | acl[2] = 72 67 | acl[500] = 104 68 | acl[7500] = 1040 69 | -- acl:copy(a) 70 | tester:asserteq(72, acl[2]) 71 | tester:asserteq(104, acl[500]) 72 | tester:asserteq(1040, acl[7500]) 73 | end 74 | 75 | local function setUp() 76 | -- cltorch.setDevice(1) 77 | print('') 78 | end 79 | 80 | local test = torch.TestSuite() 81 | for k,v in pairs(cltorch.tests.storage) do 82 | test[k] = function() 83 | setUp() 84 | v() 85 | end 86 | end 87 | 88 | function cltorch.tests.storage.test() 89 | tester = torch.Tester() 90 | tester:add(test) 91 | tester:run(tests) 92 | print('#tester.errors', #tester.errors) 93 | return #tester.errors == 0 94 | end 95 | 96 | if runtests then 97 | return cltorch.tests.storage.test() 98 | end 99 | 100 | -------------------------------------------------------------------------------- /src/torch/utils.c: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | THLongStorage* cltorch_checklongargs(lua_State *L, int index) 4 | { 5 | THLongStorage *storage; 6 | int i; 7 | int narg = lua_gettop(L)-index+1; 8 | 9 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) 10 | { 11 | THLongStorage *storagesrc = luaT_toudata(L, index, "torch.LongStorage"); 12 | storage = THLongStorage_newWithSize(storagesrc->size); 13 | THLongStorage_copy(storage, storagesrc); 14 | } 15 | else 16 | { 17 | storage = THLongStorage_newWithSize(narg); 18 | for(i = index; i < index+narg; i++) 19 | { 20 | if(!lua_isnumber(L, i)) 21 | { 22 | THLongStorage_free(storage); 23 | luaL_argerror(L, i, "number expected"); 24 | } 25 | THLongStorage_set(storage, i-index, lua_tonumber(L, i)); 26 | } 27 | } 28 | return storage; 29 | } 30 | 31 | int cltorch_islongargs(lua_State *L, int index) 32 | { 33 | int narg = lua_gettop(L)-index+1; 34 | 35 | if(narg == 1 && luaT_toudata(L, index, "torch.LongStorage")) 36 | { 37 | return 1; 38 | } 39 | else 40 | { 41 | int i; 42 | 43 | for(i = index; i < index+narg; i++) 44 | { 45 | if(!lua_isnumber(L, i)) 46 | return 0; 47 | } 48 | return 1; 49 | } 50 | return 0; 51 | } 52 | 53 | struct THClState* cltorch_getstate(lua_State* L) 54 | { 55 | lua_getglobal(L, "cltorch"); 56 | lua_getfield(L, -1, "_state"); 57 | struct THClState *state = lua_touserdata(L, -1); 58 | lua_pop(L, 2); 59 | return state; 60 | } 61 | -------------------------------------------------------------------------------- /src/torch/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef CLTORCH_UTILS_INC 2 | #define CLTORCH_UTILS_INC 3 | 4 | #include "luaT.h" 5 | #include "TH.h" 6 | 7 | #ifdef __cplusplus 8 | # define TORCH_EXTERNC extern "C" 9 | #else 10 | # define TORCH_EXTERNC extern 11 | #endif 12 | 13 | #ifdef WIN32 14 | # ifdef torch_EXPORTS 15 | # define TORCH_API TORCH_EXTERNC __declspec(dllexport) 16 | # else 17 | # define TORCH_API TORCH_EXTERNC __declspec(dllimport) 18 | # endif 19 | #else 20 | # define TORCH_API TORCH_EXTERNC 21 | #endif 22 | 23 | #if LUA_VERSION_NUM == 501 24 | /* 25 | ** Adapted from Lua 5.2.0 26 | */ 27 | static void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { 28 | luaL_checkstack(L, nup+1, "too many upvalues"); 29 | for (; l->name != NULL; l++) { /* fill the table with given functions */ 30 | int i; 31 | lua_pushstring(L, l->name); 32 | for (i = 0; i < nup; i++) /* copy upvalues to the top 
*/ 33 | lua_pushvalue(L, -(nup+1)); 34 | lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ 35 | lua_settable(L, -(nup + 3)); 36 | } 37 | lua_pop(L, nup); /* remove upvalues */ 38 | } 39 | #endif 40 | 41 | 42 | TORCH_API THLongStorage* cltorch_checklongargs(lua_State *L, int index); 43 | TORCH_API int cltorch_islongargs(lua_State *L, int index); 44 | 45 | struct THClState; 46 | TORCH_API struct THClState* cltorch_getstate(lua_State* L); 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /src/travis/install-torch.sh: -------------------------------------------------------------------------------- 1 | if false; then { 2 | git clone https://github.com/torch/distro.git ~/torch 3 | cd ~/torch 4 | 'for pkg in cudnn cunn cunnx cutorch qttorch trepl graph optim sdl2 threads submodule graphicsmagick audio fftw3 signal nnx qtlua gnuplot dok iTorch argcheck image xlua; do { sed -i -e "s/\(.*$pkg.*\)/echo skipping $pkg # \1/" install.sh; } done' 5 | 'awk ''NR==2{print "set -x"}1'' install.sh > ~install.sh' 6 | mv ~install.sh install.sh 7 | chmod +x install.sh 8 | cat install.sh 9 | for pkg in exe/luajit-rocks extra/nn pkg/cwrap pkg/paths pkg/sundown pkg/sys pkg/torch pkg/paths extra/lua-cjson extra/luaffifb extra/luafilesystem extra/penlight; do { git submodule update --quiet --init $pkg; } done 10 | sed -i -e 's/$(MAKE)/$(MAKE) -j 4/' pkg/torch/rocks/torch-scm-1.rockspec 11 | ./install.sh -b >/dev/null 12 | } else { 13 | mkdir -p ~/torch 14 | cd ~/torch 15 | wget https://s3.amazonaws.com/hughperkinstravis/hughperkins/distro/3/3.1/torch-install.tar.bz2 16 | tar -xf torch-install.tar.bz2 17 | } fi 18 | 19 | sed -i -e 's/^export LD_LIBRARY_PATH/# export LD_LIBRARY_PATH/' ~/torch/install/bin/torch-activate 20 | sed -i -e 's/^export DYLD_LIBRARY_PATH/# export LD_LIBRARY_PATH/' ~/torch/install/bin/torch-activate 21 | source ~/torch/install/bin/torch-activate 22 | luajit -l torch -e 'print(torch.Tensor(3,2):uniform())' 23 | 24 | -------------------------------------------------------------------------------- /src/util/port.py: -------------------------------------------------------------------------------- 1 | """ 2 | This does a first cut port from `../cutorch-goodies2` directory, 3 | into the `port` subdirectory. 4 | I've never actually used it for porting whole files yet, but 5 | it does make using `meld` against newer cutorch branches, such as 6 | `goodies2` much more possible. 
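Usage sketch (inferred from the paths below, not a documented interface):
run `python port.py` from a working directory that has a cutorch checkout
at ../cutorch (src_dir below); ported files then land in the port/
subdirectory, via the process_dir calls at the bottom of this file.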
7 | 8 | Possible future enhancements: 9 | - make it automatically move kernels and device functions into a '.cl' 10 | file (plausibly anything with host goes both into the .h/.cpp, and also 11 | into the .cl) 12 | """ 13 | 14 | from __future__ import print_function 15 | import sys 16 | import os 17 | from os.path import join as jp 18 | from os import path 19 | 20 | src_dir = '../cutorch' # directory to port from 21 | 22 | def process_block(block): 23 | if block.find(' operator()') >= 0: 24 | # its an Op struct, we are not writing these as kernels 25 | # but using Apply instead, and passing in the appropriate code 26 | # as strings into the kernel templates 27 | return (block, False) 28 | if block.find('__global__') >= 0 or block.find('__device__') >= 0: 29 | # kernel method, probably 30 | block = block.replace('gridDim.x', 'get_num_groups(0)') 31 | block = block.replace('gridDim.y', 'get_num_groups(1)') 32 | block = block.replace('blockDim.x', 'get_local_size(0)') 33 | block = block.replace('blockDim.y', 'get_local_size(1)') 34 | block = block.replace('blockIdx.x', 'get_group_id(0)') 35 | block = block.replace('blockIdx.y', 'get_group_id(1)') 36 | block = block.replace('threadIdx.x', 'get_local_id(0)') 37 | block = block.replace('threadIdx.y', 'get_local_id(1)') 38 | block = block.replace('__global__', 'kernel') 39 | block = block.replace('__shared__', 'local') 40 | block = block.replace('__syncthreads()', 'barrier(CLK_LOCAL_MEM_FENCE)') 41 | block = block.replace('warpSize', '{{WarpSize}}') 42 | block = block.replace('IndexType', '{{IndexType}}') 43 | block = block.replace('__device__', '/*__device__*/') 44 | block = block.replace('__forceinline__', '/*__forceline__*/') 45 | return (block, True) 46 | return (block, False) 47 | 48 | def process_dir(cutorch_dir, port_dir, rel_dir): 49 | cutorch_src = jp(cutorch_dir, rel_dir) 50 | cltorch_dst = jp(port_dir, rel_dir).replace('THC', 'THCl') 51 | if not path.isdir(cltorch_dst): 52 | os.makedirs(cltorch_dst) 53 | for filename in os.listdir(cltorch_dst): 54 | filepath = jp(cltorch_dst, filename) 55 | if path.isfile(filepath): 56 | os.remove(filepath) 57 | out_filenames = [] 58 | for filename in os.listdir(cutorch_src): 59 | original_filename = filename 60 | print('filename', filename) 61 | original_filepath = jp(cutorch_src, filename) 62 | if not path.isfile(original_filepath): 63 | continue 64 | f = open(jp(cutorch_src, filename), 'r') 65 | contents = f.read() 66 | f.close() 67 | base_name = filename.split('.')[0].replace('THC', 'THCl') 68 | suffix = '.' 
+ filename.split('.')[1] 69 | if suffix == '.cuh': 70 | suffix = '.h' 71 | if suffix == '.cu': 72 | suffix = '.cpp' 73 | if suffix == '.c': 74 | suffix = '.cpp' 75 | filename = '{base}{suffix}'.format( 76 | base=base_name, 77 | suffix=suffix) 78 | if filename in out_filenames: 79 | print('warning: filename conflict: {filename}'.format( 80 | filename=filename)) 81 | contents = contents.replace('CUDA', 'CL') 82 | contents = contents.replace('Cuda', 'Cl') 83 | contents = contents.replace('#include "THC', '#include "THCl') 84 | contents = contents.replace('THC_', 'THCL_') 85 | contents = contents.replace('THCState', 'THClState') 86 | contents = contents.replace('CUTORCH', 'CLTORCH') 87 | contents = contents.replace('THCBlasState', 'THClBlasState') 88 | contents = contents.replace('cublasOperation_t', 'clblasTranspose') 89 | contents = contents.replace('cublas', 'clblas') 90 | contents = contents.replace('cutorch', 'cltorch') 91 | 92 | # line by line: 93 | new_contents = '' 94 | new_cl = '' 95 | scope_dead = False 96 | depth = 0 97 | block = '' 98 | for line in contents.split('\n'): 99 | if line.startswith('#include <thrust') or \ 100 | line.find('thrust::') >= 0: 101 | # (reconstructed: the original condition was lost in extraction; matching thrust usage is an assumption) 102 | line = '// ' + line 103 | scope_dead = True 104 | if line.find('{') >= 0: 105 | depth += 1 106 | if line.find('#include <cu') >= 0:  # (reconstructed; matching CUDA system headers is an assumption) 107 | line = '' 108 | if line.strip() == 'THClCheck(cudaGetLastError());': 109 | line = '' 110 | if scope_dead and line.find('return') >= 0: 111 | line = (' THError("Not implemented");\n' + 112 | ' return 0;\n // ' + 113 | line) 114 | scope_dead = False 115 | if line.find('}') >= 0: 116 | if scope_dead: 117 | line = (' THError("Not implemented");\n' + 118 | line) 119 | scope_dead = False 120 | depth -= 1 121 | block += line + '\n' 122 | if line.strip() == '' and depth == 0: 123 | block, is_cl = process_block(block) 124 | if is_cl: 125 | new_cl += block 126 | else: 127 | new_contents += block 128 | block = '' 129 | block, is_cl = process_block(block) 130 | if is_cl: 131 | new_cl += block 132 | else: 133 | new_contents += block 134 | block = '' 135 | if new_contents.strip() != "": 136 | f = open(jp(cltorch_dst, filename), 'a') 137 | f.write('// from lib/THC/{filename}:\n\n'.format( 138 | filename=original_filename)) 139 | f.write(new_contents) 140 | f.close() 141 | out_filenames.append(filename) 142 | if new_cl.strip() != '': 143 | clfilename = original_filename.replace('.cuh', '.cl') 144 | clfilename = clfilename.replace('.cu', '.cl') 145 | clfilename = clfilename.replace('THC', 'THCl') 146 | clfilepath = jp(cltorch_dst, clfilename) 147 | f = open(clfilepath, 'a') 148 | f.write('// from {rel_dir}/{filename}:\n\n'.format( 149 | rel_dir=rel_dir, 150 | filename=original_filename)) 151 | f.write(new_cl) 152 | f.close() 153 | 154 | process_dir(src_dir, 'port', 'lib/THC') 155 | process_dir(src_dir, 'port', 'torch') 156 | process_dir(src_dir, 'port', 'torch/generic') 157 | # cutorch_dir = '../cutorch-goodies2' 158 | 159 | # cutorch_src = '{cutorch_dir}/lib/THC'.format( 160 | # cutorch_dir=cutorch_dir) 161 | 162 | # port_dir = 'port' 163 | 164 | --------------------------------------------------------------------------------
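As a worked illustration of the mapping process_block applies (the input fragment is hypothetical; the substitutions are the ones listed in the script above), a CUDA kernel such as

  __global__ void add_one(float *data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ float buf[32];
    data[i] += 1.0f;
    __syncthreads();
  }

would come out of the port as

  kernel void add_one(float *data) {
    int i = get_group_id(0) * get_local_size(0) + get_local_id(0);
    local float buf[32];
    data[i] += 1.0f;
    barrier(CLK_LOCAL_MEM_FENCE);
  }

and, since the block contains __global__, process_block flags it is_cl, so it is routed into the generated .cl file rather than the .cpp.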