├── .gitignore ├── .gitmodules ├── Cuda ├── containers │ ├── safe_call.hpp │ ├── initialization.hpp │ ├── device_memory_impl.hpp │ ├── kernel_containers.hpp │ ├── device_array_impl.hpp │ ├── device_memory.cpp │ ├── device_memory.hpp │ ├── device_array.hpp │ └── initialization.cpp ├── internal.h ├── pyrdown.cu └── estimate.cu ├── LICENSE ├── ICPOdometry.h ├── CMakeLists.txt ├── CudaDetect.cmake ├── CudaComputeTargetFlags.cmake ├── README.md ├── ICPOdometry.cpp └── ICP.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .project 3 | .idea 4 | *.user 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third-party/Sophus"] 2 | path = third-party/Sophus 3 | url = https://github.com/strasdat/Sophus.git 4 | [submodule "third-party/Pangolin"] 5 | path = third-party/Pangolin 6 | url = https://github.com/stevenlovegrove/Pangolin.git 7 | [submodule "third-party/Eigen"] 8 | path = third-party/Eigen 9 | url = https://github.com/eigenteam/eigen-git-mirror.git 10 | -------------------------------------------------------------------------------- /Cuda/containers/safe_call.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * safe_call.hpp 3 | * 4 | * Created on: 1 Sep 2014 5 | * Author: thomas 6 | */ 7 | 8 | #ifndef SAFE_CALL_HPP_ 9 | #define SAFE_CALL_HPP_ 10 | 11 | #include 12 | #include 13 | #include "../containers/initialization.hpp" 14 | 15 | #if defined(__GNUC__) 16 | #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__, __func__) 17 | #else /* defined(__CUDACC__) || defined(__MSVC__) */ 18 | #define cudaSafeCall(expr) ___cudaSafeCall(expr, __FILE__, __LINE__) 19 | #endif 20 | 21 | static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") 22 | { 23 | if 
(cudaSuccess != err) 24 | error(cudaGetErrorString(err), file, line, func); 25 | } 26 | 27 | #endif /* SAFE_CALL_HPP_ */ 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, Thomas Whelan 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/**
 * Frame-to-frame dense ICP odometry on depth images.
 *
 * Keeps a NUM_PYRS-level pyramid of vertex and normal maps for a "previous"
 * (model) frame and a "current" frame, and refines a relative pose between
 * them by repeatedly calling estimateStep().
 *
 * NOTE(review): in this view the template argument lists of the container
 * members below (std::vector / DeviceArray) appear to have been stripped;
 * confirm the element types against the real header before relying on them.
 */
class ICPOdometry {
public:
  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
  /**
   * \param width, height   size of the full-resolution depth image; pyramid
   *                        level i is allocated as (height >> i) x (width >> i)
   * \param cx, cy, fx, fy  camera intrinsics, copied into `intr`
   * \param distThresh      distance threshold forwarded to estimateStep()
   * \param angleThresh     angle threshold forwarded to estimateStep(); the
   *                        default evaluates sinf() of 20 degrees in radians
   *                        NOTE(review): 3.14159254f is not pi to float
   *                        precision (pi = 3.14159265...) - confirm intent
   */
  ICPOdometry(int width, int height, float cx, float cy, float fx, float fy,
              float distThresh = 0.10f,
              float angleThresh = sinf(20.f * 3.14159254f / 180.f));

  virtual ~ICPOdometry();

  /**
   * Uploads `depth` as pyramid level 0, builds the remaining levels with
   * pyrDown(), fills vmaps_curr / nmaps_curr for every level and then calls
   * cudaDeviceSynchronize().
   *
   * \param depth        width*height unsigned short depth samples; the kernels
   *                     divide by 1000 to convert millimetres to metres
   * \param depthCutoff  samples whose converted depth is not below this value
   *                     (metres) are marked invalid in the vertex map
   */
  void initICP(unsigned short *depth, const float depthCutoff = 20.0f);

  /**
   * Same processing as initICP(), but the results are written to
   * vmaps_prev / nmaps_prev (the frame the current one is aligned against).
   */
  void initICPModel(unsigned short *depth, const float depthCutoff = 20.0f);

  /**
   * Refines T_prev_curr in place, coarse to fine: pyramid levels are visited
   * from NUM_PYRS - 1 down to 0 with iterations[level] steps each. Every step
   * solves the 6-DoF system returned by estimateStep() and left-multiplies
   * Sophus::SE3d::exp(update) onto T_prev_curr. lastError and lastInliers are
   * overwritten on every step.
   *
   * \param T_prev_curr  initial guess on entry, refined estimate on return
   * \param threads      forwarded unchanged to estimateStep()
   * \param blocks       forwarded unchanged to estimateStep()
   */
  void getIncrementalTransformation(Sophus::SE3d &T_prev_curr, int threads,
                                    int blocks);

  // sqrt(residual) / inliers as reported by the most recent estimateStep() call.
  float lastError;
  // Inlier value reported by the most recent estimateStep() call
  // (set to width * height by the constructor).
  float lastInliers;

private:
  // Depth pyramid; level 0 holds the uploaded full-resolution image.
  std::vector> depth_tmp;

  // Vertex / normal maps of the model frame, one entry per pyramid level.
  std::vector> vmaps_prev;
  std::vector> nmaps_prev;

  // Vertex / normal maps of the current frame, one entry per pyramid level.
  std::vector> vmaps_curr;
  std::vector> nmaps_curr;

  // Full-resolution intrinsics; intr(i) yields the values scaled for level i.
  Intr intr;

  // Device scratch buffers handed to estimateStep(); created with
  // MAX_THREADS elements and 1 element respectively.
  DeviceArray> sumData;
  DeviceArray> outData;

  // Number of pyramid levels.
  static const int NUM_PYRS = 3;

  // Iteration count per pyramid level, indexed by level; assigned
  // 10 / 5 / 4 for levels 0 / 1 / 2 in getIncrementalTransformation().
  std::vector iterations;

  float dist_thresh;
  float angle_thresh;

  const int width;
  const int height;
  const float cx, cy, fx, fy;
};
"${CMAKE_CURRENT_SOURCE_DIR}/third-party/Sophus" CACHE PATH "Sophus Include directory") 7 | 8 | find_package(Pangolin REQUIRED) 9 | find_package(CUDA REQUIRED) 10 | 11 | include_directories(${Pangolin_INCLUDE_DIRS}) 12 | include_directories(${CUDA_INCLUDE_DIRS}) 13 | include_directories(${EIGEN_INCLUDE_DIRS}) 14 | include_directories(${Sophus_INCLUDE_DIR}) 15 | 16 | file(GLOB srcs *.cpp *.cu *.h *.cuh) 17 | file(GLOB cuda Cuda/*.cu Cuda/*.cpp Cuda/*.cuh Cuda/*.h) 18 | file(GLOB containers Cuda/containers/*.cpp Cuda/containers/*.h Cuda/containers/*.cu Cuda/containers/*.cuh) 19 | 20 | set(CUDA_ARCH_BIN "" CACHE STRING "Specify 'real' GPU arch to build binaries for, BIN(PTX) format is supported. Example: 1.3 2.1(1.3) or 13 21(13)") 21 | set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX arch to build PTX intermediate code for. Example: 1.0 1.2 or 10 12") 22 | 23 | include("CudaDetect.cmake") 24 | detect_installed_gpus(CUDA_NVCC_ARCHS) 25 | foreach(NVCC_ARCH IN LISTS CUDA_NVCC_ARCHS) 26 | list(APPEND CUDA_ARCH_BIN "${NVCC_ARCH} ") 27 | endforeach(NVCC_ARCH) 28 | 29 | include("CudaComputeTargetFlags.cmake") 30 | APPEND_TARGET_ARCH_FLAGS() 31 | 32 | set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler;-fPIC;--expt-relaxed-constexpr") 33 | 34 | CUDA_COMPILE(cuda_objs ${cuda}) 35 | 36 | set(CMAKE_CXX_FLAGS "-O3 -msse2 -msse3 -Wall -std=c++11") 37 | 38 | add_executable(ICP 39 | ${srcs} 40 | ${cuda} 41 | ${cuda_objs} 42 | ${containers} 43 | ) 44 | 45 | target_link_libraries(ICP 46 | ${Pangolin_LIBRARIES} 47 | ${CUDA_LIBRARIES} 48 | ) 49 | 50 | -------------------------------------------------------------------------------- /CudaDetect.cmake: -------------------------------------------------------------------------------- 1 | # Taken from https://github.com/BVLC/caffe/blob/master/cmake/Cuda.cmake 2 | ################################################################################################ 3 | # A function for automatic detection of GPUs installed (if autodetection 
is enabled) 4 | # Usage: 5 | # detect_installed_gpus(out_variable) 6 | function(detect_installed_gpus out_variable) 7 | if(NOT CUDA_gpu_detect_output) 8 | set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu) 9 | 10 | file(WRITE ${__cufile} "" 11 | "#include \n" 12 | "int main()\n" 13 | "{\n" 14 | " int count = 0;\n" 15 | " if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n" 16 | " if (count == 0) return -1;\n" 17 | " for (int device = 0; device < count; ++device)\n" 18 | " {\n" 19 | " cudaDeviceProp prop;\n" 20 | " if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n" 21 | " std::printf(\"%d.%d;\", prop.major, prop.minor);\n" 22 | " }\n" 23 | " return 0;\n" 24 | "}\n") 25 | 26 | execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-Wno-deprecated-gpu-targets" "--run" "${__cufile}" 27 | WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/" 28 | RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out 29 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 30 | 31 | if(__nvcc_res EQUAL 0) 32 | string(REGEX REPLACE "\\." "" __nvcc_out "${__nvcc_out}") 33 | string(REGEX MATCHALL "[0-9;]+" __nvcc_out "${__nvcc_out}") 34 | list(REMOVE_DUPLICATES __nvcc_out) 35 | set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_gpus tool" FORCE) 36 | endif() 37 | endif() 38 | 39 | if(NOT CUDA_gpu_detect_output) 40 | message(STATUS "Automatic GPU detection failed. Is CUDA properly installed? 
# Translates lists of CUDA architectures into nvcc -gencode flags.
#
#   arch_bin                NAME of a variable holding the 'real' architectures
#                           to build binaries for. Each entry is either BIN or
#                           BIN(PTX), dots optional: "3.5", "35", "2.1(1.3)".
#   arch_ptx                NAME of a variable holding the 'virtual'
#                           architectures to embed PTX for: "5.0" or "50".
#   cuda_nvcc_target_flags  NAME of the output variable that receives the
#                           resulting list of -gencode arguments.
#
# This is a MACRO, so the temporaries below are set in the caller's scope.
MACRO(CUDA_COMPUTE_TARGET_FLAGS arch_bin arch_ptx cuda_nvcc_target_flags)
    string(REGEX REPLACE "\\." "" ARCH_BIN_WITHOUT_DOTS "${${arch_bin}}")
    string(REGEX REPLACE "\\." "" ARCH_PTX_WITHOUT_DOTS "${${arch_ptx}}")

    set(cuda_computer_target_flags_temp "")

    # Tell NVCC to add binaries for the specified GPUs
    string(REGEX MATCHALL "[0-9()]+" ARCH_LIST "${ARCH_BIN_WITHOUT_DOTS}")
    foreach(ARCH IN LISTS ARCH_LIST)
        # The parentheses must be escaped so they match literally. The previous
        # pattern used "\$", which CMake turns into the end-of-line anchor "$",
        # so the BIN(PTX) form was never recognised and "21(13)" ended up
        # verbatim inside arch=compute_... / code=sm_...
        if (ARCH MATCHES "^([0-9]+)\\(([0-9]+)\\)$")
            # User explicitly specified PTX for the concrete BIN
            set(cuda_computer_target_flags_temp ${cuda_computer_target_flags_temp} -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
        else()
            # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
            set(cuda_computer_target_flags_temp ${cuda_computer_target_flags_temp} -gencode arch=compute_${ARCH},code=sm_${ARCH})
        endif()
    endforeach()

    # Tell NVCC to add PTX intermediate code for the specified architectures
    string(REGEX MATCHALL "[0-9]+" ARCH_LIST "${ARCH_PTX_WITHOUT_DOTS}")
    foreach(ARCH IN LISTS ARCH_LIST)
        set(cuda_computer_target_flags_temp ${cuda_computer_target_flags_temp} -gencode arch=compute_${ARCH},code=compute_${ARCH})
    endforeach()

    set(${cuda_nvcc_target_flags} ${cuda_computer_target_flags_temp})
ENDMACRO()
CUDA_COMPUTE_TARGET_FLAGS(CUDA_ARCH_BIN CUDA_ARCH_PTX cuda_nvcc_target_flags) 39 | if (cuda_nvcc_target_flags) 40 | message(STATUS "CUDA NVCC target flags: ${cuda_nvcc_target_flags}") 41 | list(APPEND CUDA_NVCC_FLAGS ${cuda_nvcc_target_flags}) 42 | endif() 43 | ENDMACRO() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ICPCUDA 2 | Super fast implementation of ICP in CUDA for devices of compute capability 3.5 or higher. On an NVIDIA GeForce GTX TITAN X it runs at over __750Hz__ (using projective data association). Last tested with Ubuntu 18.04.2, CUDA 10.1 and NVIDIA drivers 418.39. 3 | 4 | Requires CUDA, includes [Pangolin](https://github.com/stevenlovegrove/Pangolin), [Eigen](https://github.com/stevenlovegrove/eigen) and [Sophus](https://github.com/stevenlovegrove/Sophus) third party submodules. I've built it to take in raw TUM RGB-D datasets to do frame-to-frame dense ICP as an example application. 5 | 6 | Install; 7 | 8 | ```bash 9 | sudo apt-get install build-essential cmake libglew-dev libpng-dev 10 | git clone https://github.com/mp3guy/ICPCUDA.git 11 | cd ICPCUDA 12 | git submodule update --init 13 | cd third-party/Pangolin/ 14 | mkdir build 15 | cd build/ 16 | cmake ../ -DEIGEN_INCLUDE_DIR= 17 | make -j12 18 | cd ../../../ 19 | mkdir build 20 | cd build/ 21 | cmake .. 22 | make -j12 23 | ``` 24 | 25 | The particular version of ICP implemented is the one introduced by [KinectFusion](http://homes.cs.washington.edu/~newcombe/papers/newcombe_etal_ismar2011.pdf). This means a three level coarse-to-fine registration pyramid, from 160x120 to 320x240 and finally 640x480 image sizes, with 4, 5 and 10 iterations per level respectively.
26 | 27 | Run like; 28 | 29 | ```bash 30 | ./ICP ~/Desktop/rgbd_dataset_freiburg1_desk/ -v 31 | ``` 32 | 33 | Where ~/Desktop/rgbd\_dataset\_freiburg1\_desk/ contains the depth.txt file, for more information see [here](http://vision.in.tum.de/data/datasets/rgbd-dataset). 34 | 35 | The main idea to getting the best performance is determining the best thread/block sizes to use. I have provided an exhaustive search function to do this, since it varies between GPUs. Simply pass the "-v" switch to the program to activate the search. The code will then first do a search for the best thread/block sizes and then run ICP and output something like this on an nVidia GeForce GTX TITAN X; 36 | 37 | ```bash 38 | GeForce GTX TITAN X 39 | Searching for the best thread/block configuration for your GPU... 40 | Best: 256 threads, 96 blocks (1.3306ms), 100% 41 | ICP: 1.3236ms 42 | ICP speed: 755Hz 43 | ``` 44 | 45 | The code will output one file; output.poses. You can evaluate it on the TUM benchmark by using their tools. I get something like this; 46 | 47 | ```bash 48 | python ~/stuff/Kinect_Logs/Freiburg/evaluate_ate.py ~/Desktop/rgbd_dataset_freiburg1_desk/groundtruth.txt output.poses 49 | 0.144041 50 | ``` 51 | 52 | The difference in values comes down to the fact that each method uses a different reduction scheme and floating point operations are [not associative](https://halshs.archives-ouvertes.fr/hal-00949355v1/document). 53 | 54 | Also, if you're using this code in academic work and it would be suitable to do so, please consider referencing some of my possibly relevant [research](http://www.thomaswhelan.ie/#publications) in your literature review/related work section. 55 | -------------------------------------------------------------------------------- /Cuda/containers/initialization.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 
/** \brief Returns the number of CUDA devices. */
int getCudaEnabledDeviceCount();

/** \brief Sets the active device to work with. */
void setDevice(int device);

/** \brief Returns the device name for the given device. */
std::string getDeviceName(int device);

/** \brief Prints information about the given CUDA device or about all devices.
  * \param device: if < 0, prints info for all devices; otherwise the value is interpreted as a device id.
  */
void printCudaDeviceInfo(int device = -1);

/** \brief Prints information about the given CUDA device or about all devices.
  * NOTE(review): presumably a condensed variant of printCudaDeviceInfo(), judging by the name - confirm against the implementation.
  * \param device: if < 0, prints info for all devices; otherwise the value is interpreted as a device id.
  */
void printShortCudaDeviceInfo(int device = -1);

/** \brief Returns true if the GPU is of a pre-Fermi generation.
  * \param device: device id to check; if < 0, checks the current device.
  */
bool checkIfPreFermiGPU(int device = -1);

/** \brief Error handler. All GPU functions call this to report an error. For internal use only. */
void error(const char *error_string, const char *file, const int line, const char *func = "");
17 | * * Neither the name of Willow Garage, Inc. nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 
33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | #ifndef DEVICE_MEMORY_IMPL_HPP_ 38 | #define DEVICE_MEMORY_IMPL_HPP_ 39 | 40 | ///////////////////// Inline implementations of DeviceMemory //////////////////////////////////////////// 41 | 42 | template inline T* DeviceMemory::ptr() { return ( T*)data_; } 43 | template inline const T* DeviceMemory::ptr() const { return (const T*)data_; } 44 | 45 | template inline DeviceMemory::operator PtrSz() const 46 | { 47 | PtrSz result; 48 | result.data = (U*)ptr(); 49 | result.size = sizeBytes_/sizeof(U); 50 | return result; 51 | } 52 | 53 | ///////////////////// Inline implementations of DeviceMemory2D //////////////////////////////////////////// 54 | 55 | template T* DeviceMemory2D::ptr(int y_arg) { return ( T*)(( char*)data_ + y_arg * step_); } 56 | template const T* DeviceMemory2D::ptr(int y_arg) const { return (const T*)((const char*)data_ + y_arg * step_); } 57 | 58 | template DeviceMemory2D::operator PtrStep() const 59 | { 60 | PtrStep result; 61 | result.data = (U*)ptr(); 62 | result.step = step_; 63 | return result; 64 | } 65 | 66 | template DeviceMemory2D::operator PtrStepSz() const 67 | { 68 | PtrStepSz result; 69 | result.data = (U*)ptr(); 70 | result.step = step_; 71 | result.cols = colsBytes_/sizeof(U); 72 | result.rows = rows_; 73 | return result; 74 | } 75 | 76 | #endif /* DEVICE_MEMORY_IMPL_HPP_ */ 77 | 78 | -------------------------------------------------------------------------------- /ICPOdometry.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * ICPOdometry.cpp 3 | * 4 | * Created on: 17 Sep 2012 5 | * Author: thomas 6 | */ 7 | 8 | #include "ICPOdometry.h" 9 | 10 | ICPOdometry::ICPOdometry(int width, int height, float cx, float cy, float fx, 11 | float fy, float distThresh, float angleThresh) 12 | : lastError(0), lastInliers(width * height), dist_thresh(distThresh), 13 | angle_thresh(angleThresh), 
width(width), height(height), cx(cx), cy(cy), 14 | fx(fx), fy(fy) { 15 | sumData.create(MAX_THREADS); 16 | outData.create(1); 17 | 18 | intr.cx = cx; 19 | intr.cy = cy; 20 | intr.fx = fx; 21 | intr.fy = fy; 22 | 23 | iterations.reserve(NUM_PYRS); 24 | 25 | depth_tmp.resize(NUM_PYRS); 26 | 27 | vmaps_prev.resize(NUM_PYRS); 28 | nmaps_prev.resize(NUM_PYRS); 29 | 30 | vmaps_curr.resize(NUM_PYRS); 31 | nmaps_curr.resize(NUM_PYRS); 32 | 33 | for (int i = 0; i < NUM_PYRS; ++i) { 34 | int pyr_rows = height >> i; 35 | int pyr_cols = width >> i; 36 | 37 | depth_tmp[i].create(pyr_rows, pyr_cols); 38 | 39 | vmaps_prev[i].create(pyr_rows * 3, pyr_cols); 40 | nmaps_prev[i].create(pyr_rows * 3, pyr_cols); 41 | 42 | vmaps_curr[i].create(pyr_rows * 3, pyr_cols); 43 | nmaps_curr[i].create(pyr_rows * 3, pyr_cols); 44 | } 45 | } 46 | 47 | ICPOdometry::~ICPOdometry() {} 48 | 49 | void ICPOdometry::initICP(unsigned short *depth, const float depthCutoff) { 50 | depth_tmp[0].upload(depth, sizeof(unsigned short) * width, height, width); 51 | 52 | for (int i = 1; i < NUM_PYRS; ++i) { 53 | pyrDown(depth_tmp[i - 1], depth_tmp[i]); 54 | } 55 | 56 | for (int i = 0; i < NUM_PYRS; ++i) { 57 | createVMap(intr(i), depth_tmp[i], vmaps_curr[i], depthCutoff); 58 | createNMap(vmaps_curr[i], nmaps_curr[i]); 59 | } 60 | 61 | cudaDeviceSynchronize(); 62 | } 63 | 64 | void ICPOdometry::initICPModel(unsigned short *depth, const float depthCutoff) { 65 | depth_tmp[0].upload(depth, sizeof(unsigned short) * width, height, width); 66 | 67 | for (int i = 1; i < NUM_PYRS; ++i) { 68 | pyrDown(depth_tmp[i - 1], depth_tmp[i]); 69 | } 70 | 71 | for (int i = 0; i < NUM_PYRS; ++i) { 72 | createVMap(intr(i), depth_tmp[i], vmaps_prev[i], depthCutoff); 73 | createNMap(vmaps_prev[i], nmaps_prev[i]); 74 | } 75 | 76 | cudaDeviceSynchronize(); 77 | } 78 | 79 | void ICPOdometry::getIncrementalTransformation(Sophus::SE3d &T_prev_curr, 80 | int threads, int blocks) { 81 | iterations[0] = 10; 82 | iterations[1] = 5; 83 | 
iterations[2] = 4; 84 | 85 | for (int i = NUM_PYRS - 1; i >= 0; i--) { 86 | for (int j = 0; j < iterations[i]; j++) { 87 | float residual_inliers[2]; 88 | Eigen::Matrix A_icp; 89 | Eigen::Matrix b_icp; 90 | 91 | estimateStep(T_prev_curr.rotationMatrix().cast().eval(), 92 | T_prev_curr.translation().cast().eval(), 93 | vmaps_curr[i], nmaps_curr[i], intr(i), vmaps_prev[i], 94 | nmaps_prev[i], dist_thresh, angle_thresh, sumData, outData, 95 | A_icp.data(), b_icp.data(), &residual_inliers[0], threads, 96 | blocks); 97 | 98 | lastError = sqrt(residual_inliers[0]) / residual_inliers[1]; 99 | lastInliers = residual_inliers[1]; 100 | 101 | const Eigen::Matrix update = 102 | A_icp.cast().ldlt().solve(b_icp.cast()); 103 | 104 | T_prev_curr = Sophus::SE3d::exp(update) * T_prev_curr; 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /Cuda/internal.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Point Cloud Library (PCL) - www.pointclouds.org 5 | * Copyright (c) 2011, Willow Garage, Inc. 6 | * 7 | * All rights reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 13 | * * Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * * Redistributions in binary form must reproduce the above 16 | * copyright notice, this list of conditions and the following 17 | * disclaimer in the documentation and/or other materials provided 18 | * with the distribution. 19 | * * Neither the name of Willow Garage, Inc. nor the names of its 20 | * contributors may be used to endorse or promote products derived 21 | * from this software without specific prior written permission. 
/** \brief Pinhole camera intrinsics: focal lengths (fx, fy) and principal
 *  point (cx, cy).
 *
 *  Calling an instance with a pyramid level index returns a copy whose four
 *  values are divided by 2^level_index.
 */
struct Intr {
  float fx, fy, cx, cy;

  /** All-zero intrinsics. */
  Intr() : fx(0), fy(0), cx(0), cy(0) {}

  /** Intrinsics from explicit focal lengths and principal point. */
  Intr(float fx_, float fy_, float cx_, float cy_)
      : fx(fx_), fy(fy_), cx(cx_), cy(cy_) {}

  /** \brief Intrinsics for pyramid level \p level_index (0 = this instance's
   *  own resolution).
   */
  Intr operator()(int level_index) const {
    const int scale = 1 << level_index;

    Intr scaled;
    scaled.fx = fx / scale;
    scaled.fy = fy / scale;
    scaled.cx = cx / scale;
    scaled.cy = cy / scale;
    return scaled;
  }
};
78 | int threads, int blocks); 79 | 80 | void pyrDown(const DeviceArray2D &src, 81 | DeviceArray2D &dst); 82 | 83 | void createVMap(const Intr &intr, const DeviceArray2D &depth, 84 | DeviceArray2D &vmap, const float depthCutoff); 85 | void createNMap(const DeviceArray2D &vmap, DeviceArray2D &nmap); 86 | 87 | #endif /* INTERNAL_HPP_ */ 88 | -------------------------------------------------------------------------------- /Cuda/containers/kernel_containers.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above 14 | * copyright notice, this list of conditions and the following 15 | * disclaimer in the documentation and/or other materials provided 16 | * with the distribution. 17 | * * Neither the name of Willow Garage, Inc. nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | 38 | #ifndef KERNEL_CONTAINERS_HPP_ 39 | #define KERNEL_CONTAINERS_HPP_ 40 | 41 | #include 42 | 43 | #if defined(__CUDACC__) 44 | #define GPU_HOST_DEVICE__ __host__ __device__ __forceinline__ 45 | #else 46 | #define GPU_HOST_DEVICE__ 47 | #endif 48 | 49 | template struct DevPtr 50 | { 51 | typedef T elem_type; 52 | const static size_t elem_size = sizeof(elem_type); 53 | 54 | T* data; 55 | 56 | GPU_HOST_DEVICE__ DevPtr() : data(0) {} 57 | GPU_HOST_DEVICE__ DevPtr(T* data_arg) : data(data_arg) {} 58 | 59 | GPU_HOST_DEVICE__ size_t elemSize() const { return elem_size; } 60 | GPU_HOST_DEVICE__ operator T*() { return data; } 61 | GPU_HOST_DEVICE__ operator const T*() const { return data; } 62 | }; 63 | 64 | template struct PtrSz : public DevPtr 65 | { 66 | GPU_HOST_DEVICE__ PtrSz() : size(0) {} 67 | GPU_HOST_DEVICE__ PtrSz(T* data_arg, size_t size_arg) : DevPtr(data_arg), size(size_arg) {} 68 | 69 | size_t size; 70 | }; 71 | 72 | template struct PtrStep : public DevPtr 73 | { 74 | GPU_HOST_DEVICE__ PtrStep() : step(0) {} 75 | GPU_HOST_DEVICE__ PtrStep(T* data_arg, size_t step_arg) : DevPtr(data_arg), step(step_arg) {} 76 | 77 | /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! 
*/ 78 | size_t step; 79 | 80 | GPU_HOST_DEVICE__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr::data + y * step); } 81 | GPU_HOST_DEVICE__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr::data + y * step); } 82 | }; 83 | 84 | template struct PtrStepSz : public PtrStep 85 | { 86 | GPU_HOST_DEVICE__ PtrStepSz() : cols(0), rows(0) {} 87 | GPU_HOST_DEVICE__ PtrStepSz(int rows_arg, int cols_arg, T* data_arg, size_t step_arg) 88 | : PtrStep(data_arg, step_arg), cols(cols_arg), rows(rows_arg) {} 89 | 90 | int cols; 91 | int rows; 92 | }; 93 | 94 | #endif /* KERNEL_CONTAINERS_HPP_ */ 95 | 96 | -------------------------------------------------------------------------------- /Cuda/pyrdown.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement(BSD License) 3 | * 4 | * Point Cloud Library(PCL) - www.pointclouds.org 5 | * Copyright(c) 2011, Willow Garage, Inc. 6 | * 7 | * All rights reserved. 8 | * 9 | * Redistribution and use in source and binary forms, with or without 10 | * modification, are permitted provided that the following conditions 11 | * are met: 12 | * 13 | * * Redistributions of source code must retain the above copyright 14 | * notice, this list of conditions and the following disclaimer. 15 | * * Redistributions in binary form must reproduce the above 16 | * copyright notice, this list of conditions and the following 17 | * disclaimer in the documentation and/or other materials provided 18 | * with the distribution. 19 | * * Neither the name of Willow Garage, Inc. nor the names of its 20 | * contributors may be used to endorse or promote products derived 21 | * from this software without specific prior written permission. 
22 | * 23 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES(INCLUDING, 29 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 31 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 | * LIABILITY, OR TORT(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 33 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 34 | * POSSIBILITY OF SUCH DAMAGE. 35 | * 36 | */

// NOTE(review): the element types (<unsigned short>, <float>), the Eigen
// vector type, the static_cast target and the <<<grid, block>>> launch
// configurations were missing from this copy and are reconstructed from usage
// (depth is read as integer millimetres, maps are written as float). Confirm
// against internal.h / the upstream file.

#include "containers/safe_call.hpp"
#include "internal.h"

/**
 * Halves a depth image with an edge-preserving 5x5 weighted average.
 *
 * Launch layout: 2D grid, one thread per destination pixel; threads outside
 * dst are discarded by the bounds check.
 *
 * For destination pixel (x, y) the window is centred on source pixel
 * (2x, 2y) and clipped to the source image. Each sample is weighted by
 * weights[|dx|] * weights[|dy|]; samples whose value differs from the centre
 * value by 3 * sigma_color or more are skipped.
 */
__global__ void pyrDownGaussKernel(const PtrStepSz<unsigned short> src,
                                   PtrStepSz<unsigned short> dst,
                                   float sigma_color) {
  const int x = blockIdx.x * blockDim.x + threadIdx.x;
  const int y = blockIdx.y * blockDim.y + threadIdx.y;

  if (x >= dst.cols || y >= dst.rows)
    return;

  const int D = 5; // window diameter

  const int center = src.ptr(2 * y)[2 * x];

  // Window bounds expressed as offsets relative to (2x, 2y), clipped so that
  // 2x + xi and 2y + yi stay inside the source image.
  const int x_mi = max(0, 2 * x - D / 2) - 2 * x;
  const int y_mi = max(0, 2 * y - D / 2) - 2 * y;

  const int x_ma = min(src.cols, 2 * x - D / 2 + D) - 2 * x;
  const int y_ma = min(src.rows, 2 * y - D / 2 + D) - 2 * y;

  float sum = 0;
  float wall = 0; // sum of the weights actually used

  const float weights[] = {0.375f, 0.25f, 0.0625f};

  for (int yi = y_mi; yi < y_ma; ++yi) {
    for (int xi = x_mi; xi < x_ma; ++xi) {
      const int val = src.ptr(2 * y + yi)[2 * x + xi];

      if (abs(val - center) < 3 * sigma_color) {
        const float w = weights[abs(xi)] * weights[abs(yi)];
        sum += val * w;
        wall += w;
      }
    }
  }

  // The centre sample always passes the test when sigma_color > 0, so wall is
  // non-zero in that case.
  dst.ptr(y)[x] = static_cast<int>(sum / wall);
}

/**
 * Host wrapper: (re)allocates dst as (src.rows()/2, src.cols()/2) and runs
 * pyrDownGaussKernel over it with sigma_color = 30.
 *
 * Only the launch itself is checked (cudaGetLastError); the call does not
 * synchronise with the device.
 */
void pyrDown(const DeviceArray2D<unsigned short> &src,
             DeviceArray2D<unsigned short> &dst) {
  dst.create(src.rows() / 2, src.cols() / 2);

  dim3 block(32, 8);
  dim3 grid(divUp(dst.cols(), block.x), divUp(dst.rows(), block.y));

  const float sigma_color = 30;

  pyrDownGaussKernel<<<grid, block>>>(src, dst, sigma_color);
  cudaSafeCall(cudaGetLastError());
}

/**
 * Back-projects a depth image into a vertex map.
 *
 * Launch layout: 2D grid, one thread per depth pixel (u, v).
 *
 * vmap holds three planes stacked vertically: rows [0, rows) are x,
 * [rows, 2*rows) are y and [2*rows, 3*rows) are z. Depth is divided by 1000
 * before use. For a pixel whose depth is zero or not below depthCutoff only
 * the x plane is written (with NaN); the y and z planes are left untouched,
 * so consumers must test the x plane for NaN first.
 */
__global__ void computeVmapKernel(const PtrStepSz<unsigned short> depth,
                                  PtrStep<float> vmap, float fx_inv,
                                  float fy_inv, float cx, float cy,
                                  float depthCutoff) {
  const int u = threadIdx.x + blockIdx.x * blockDim.x;
  const int v = threadIdx.y + blockIdx.y * blockDim.y;

  if (u < depth.cols && v < depth.rows) {
    const float z = depth.ptr(v)[u] / 1000.f; // load and convert: mm -> meters

    if (z != 0 && z < depthCutoff) {
      const float vx = z * (u - cx) * fx_inv;
      const float vy = z * (v - cy) * fy_inv;
      const float vz = z;

      vmap.ptr(v)[u] = vx;
      vmap.ptr(v + depth.rows)[u] = vy;
      vmap.ptr(v + depth.rows * 2)[u] = vz;
    } else {
      vmap.ptr(v)[u] = __int_as_float(0x7fffffff); /*CUDART_NAN_F*/
    }
  }
}

/**
 * Host wrapper: (re)allocates vmap as (3 * depth.rows(), depth.cols()) and
 * launches computeVmapKernel with the inverse focal lengths 1/fx and 1/fy.
 *
 * Only the launch itself is checked; the call does not synchronise.
 */
void createVMap(const Intr &intr, const DeviceArray2D<unsigned short> &depth,
                DeviceArray2D<float> &vmap, const float depthCutoff) {
  vmap.create(depth.rows() * 3, depth.cols());

  dim3 block(32, 8);
  dim3 grid(1, 1, 1);
  grid.x = divUp(depth.cols(), block.x);
  grid.y = divUp(depth.rows(), block.y);

  const float fx = intr.fx, cx = intr.cx;
  const float fy = intr.fy, cy = intr.cy;

  computeVmapKernel<<<grid, block>>>(depth, vmap, 1.f / fx, 1.f / fy, cx, cy,
                                     depthCutoff);
  cudaSafeCall(cudaGetLastError());
}

/**
 * Computes a normal map from a vertex map laid out as three stacked planes.
 *
 * Launch layout: 2D grid, one thread per pixel (u, v) of a rows x cols image.
 *
 * The normal at (u, v) is the normalised cross product of the vectors to the
 * right neighbour (u + 1, v) and the lower neighbour (u, v + 1). Pixels in
 * the last column or last row, and pixels where any of the three vertices has
 * a NaN x component, get NaN in the x plane of nmap (y and z planes are not
 * written for them).
 */
__global__ void computeNmapKernel(int rows, int cols,
                                  const PtrStep<float> vmap,
                                  PtrStep<float> nmap) {
  const int u = threadIdx.x + blockIdx.x * blockDim.x;
  const int v = threadIdx.y + blockIdx.y * blockDim.y;

  if (u >= cols || v >= rows)
    return;

  // No right / lower neighbour available on the last column / row.
  if (u == cols - 1 || v == rows - 1) {
    nmap.ptr(v)[u] = __int_as_float(0x7fffffff); /*CUDART_NAN_F*/
    return;
  }

  Eigen::Matrix<float, 3, 1, Eigen::DontAlign> v00, v01, v10;
  v00(0) = vmap.ptr(v)[u];
  v01(0) = vmap.ptr(v)[u + 1];
  v10(0) = vmap.ptr(v + 1)[u];

  if (!isnan(v00(0)) && !isnan(v01(0)) && !isnan(v10(0))) {
    v00(1) = vmap.ptr(v + rows)[u];
    v01(1) = vmap.ptr(v + rows)[u + 1];
    v10(1) = vmap.ptr(v + 1 + rows)[u];

    v00(2) = vmap.ptr(v + 2 * rows)[u];
    v01(2) = vmap.ptr(v + 2 * rows)[u + 1];
    v10(2) = vmap.ptr(v + 1 + 2 * rows)[u];

    const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> r =
        (v01 - v00).cross(v10 - v00).normalized();

    nmap.ptr(v)[u] = r(0);
    nmap.ptr(v + rows)[u] = r(1);
    nmap.ptr(v + 2 * rows)[u] = r(2);
  } else {
    nmap.ptr(v)[u] = __int_as_float(0x7fffffff); /*CUDART_NAN_F*/
  }
}

/**
 * Host wrapper: (re)allocates nmap with the same shape as vmap and launches
 * computeNmapKernel over an image of vmap.rows()/3 rows and vmap.cols()
 * columns.
 *
 * Only the launch itself is checked; the call does not synchronise.
 */
void createNMap(const DeviceArray2D<float> &vmap, DeviceArray2D<float> &nmap) {
  nmap.create(vmap.rows(), vmap.cols());

  const int rows = vmap.rows() / 3;
  const int cols = vmap.cols();

  dim3 block(32, 8);
  dim3 grid(1, 1, 1);
  grid.x = divUp(cols, block.x);
  grid.y = divUp(rows, block.y);

  computeNmapKernel<<<grid, block>>>(rows, cols, vmap, nmap);
  cudaSafeCall(cudaGetLastError());
}

-------------------------------------------------------------------------------- /Cuda/containers/device_array_impl.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer.
13 | * * Redistributions in binary form must reproduce the above 14 | * copyright notice, this list of conditions and the following 15 | * disclaimer in the documentation and/or other materials provided 16 | * with the distribution. 17 | * * Neither the name of Willow Garage, Inc. nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 
33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | #ifndef DEVICE_ARRAY_IMPL_HPP_ 38 | #define DEVICE_ARRAY_IMPL_HPP_ 39 | 40 | 41 | ///////////////////// Inline implementations of DeviceArray //////////////////////////////////////////// 42 | 43 | template inline DeviceArray::DeviceArray() {} 44 | template inline DeviceArray::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {} 45 | template inline DeviceArray::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {} 46 | template inline DeviceArray::DeviceArray(const DeviceArray& other) : DeviceMemory(other) {} 47 | template inline DeviceArray& DeviceArray::operator=(const DeviceArray& other) 48 | { DeviceMemory::operator=(other); return *this; } 49 | 50 | template inline void DeviceArray::create(size_t size) 51 | { DeviceMemory::create(size * elem_size); } 52 | template inline void DeviceArray::release() 53 | { DeviceMemory::release(); } 54 | 55 | template inline void DeviceArray::copyTo(DeviceArray& other) const 56 | { DeviceMemory::copyTo(other); } 57 | template inline void DeviceArray::upload(const T *host_ptr, size_t size) 58 | { DeviceMemory::upload(host_ptr, size * elem_size); } 59 | template inline void DeviceArray::download(T *host_ptr) const 60 | { DeviceMemory::download( host_ptr ); } 61 | 62 | template void DeviceArray::swap(DeviceArray& other_arg) { DeviceMemory::swap(other_arg); } 63 | 64 | template inline DeviceArray::operator T*() { return ptr(); } 65 | template inline DeviceArray::operator const T*() const { return ptr(); } 66 | template inline size_t DeviceArray::size() const { return sizeBytes() / elem_size; } 67 | 68 | template inline T* DeviceArray::ptr() { return DeviceMemory::ptr(); } 69 | template inline const T* DeviceArray::ptr() const { return DeviceMemory::ptr(); } 70 | 71 | template template inline void DeviceArray::upload(const std::vector& data) { upload(&data[0], data.size()); } 72 | template template 
inline void DeviceArray::download(std::vector& data) const { data.resize(size()); if (!data.empty()) download(&data[0]); } 73 | 74 | ///////////////////// Inline implementations of DeviceArray2D //////////////////////////////////////////// 75 | 76 | template inline DeviceArray2D::DeviceArray2D() {} 77 | template inline DeviceArray2D::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {} 78 | template inline DeviceArray2D::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes) : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {} 79 | template inline DeviceArray2D::DeviceArray2D(const DeviceArray2D& other) : DeviceMemory2D(other) {} 80 | template inline DeviceArray2D& DeviceArray2D::operator=(const DeviceArray2D& other) 81 | { DeviceMemory2D::operator=(other); return *this; } 82 | 83 | template inline void DeviceArray2D::create(int rows, int cols) 84 | { DeviceMemory2D::create(rows, cols * elem_size); } 85 | template inline void DeviceArray2D::release() 86 | { DeviceMemory2D::release(); } 87 | 88 | template inline void DeviceArray2D::copyTo(DeviceArray2D& other) const 89 | { DeviceMemory2D::copyTo(other); } 90 | template inline void DeviceArray2D::upload(const void *host_ptr, size_t host_step, int rows, int cols) 91 | { DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size); } 92 | template inline void DeviceArray2D::download(void *host_ptr, size_t host_step) const 93 | { DeviceMemory2D::download( host_ptr, host_step ); } 94 | 95 | template template inline void DeviceArray2D::upload(const std::vector& data, int cols) 96 | { upload(&data[0], cols * elem_size, data.size()/cols, cols); } 97 | 98 | template template inline void DeviceArray2D::download(std::vector& data, int& elem_step) const 99 | { elem_step = cols(); data.resize(cols() * rows()); if (!data.empty()) download(&data[0], colsBytes()); } 100 | 101 | template void DeviceArray2D::swap(DeviceArray2D& other_arg) { DeviceMemory2D::swap(other_arg); } 102 | 103 
| template inline T* DeviceArray2D::ptr(int y) { return DeviceMemory2D::ptr(y); } 104 | template inline const T* DeviceArray2D::ptr(int y) const { return DeviceMemory2D::ptr(y); } 105 | 106 | template inline DeviceArray2D::operator T*() { return ptr(); } 107 | template inline DeviceArray2D::operator const T*() const { return ptr(); } 108 | 109 | template inline int DeviceArray2D::cols() const { return DeviceMemory2D::colsBytes()/elem_size; } 110 | template inline int DeviceArray2D::rows() const { return DeviceMemory2D::rows(); } 111 | 112 | template inline size_t DeviceArray2D::elem_step() const { return DeviceMemory2D::step()/elem_size; } 113 | 114 | 115 | #endif /* DEVICE_ARRAY_IMPL_HPP_ */ 116 | -------------------------------------------------------------------------------- /ICP.cpp: -------------------------------------------------------------------------------- 1 | #include "ICPOdometry.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | std::ifstream asFile; 9 | std::string directory; 10 | 11 | void tokenize(const std::string &str, std::vector &tokens, 12 | std::string delimiters = " ") { 13 | tokens.clear(); 14 | 15 | std::string::size_type lastPos = str.find_first_not_of(delimiters, 0); 16 | std::string::size_type pos = str.find_first_of(delimiters, lastPos); 17 | 18 | while (std::string::npos != pos || std::string::npos != lastPos) { 19 | tokens.push_back(str.substr(lastPos, pos - lastPos)); 20 | lastPos = str.find_first_not_of(delimiters, pos); 21 | pos = str.find_first_of(delimiters, lastPos); 22 | } 23 | } 24 | 25 | uint64_t loadDepth(pangolin::Image &depth) { 26 | std::string currentLine; 27 | std::vector tokens; 28 | std::vector timeTokens; 29 | 30 | do { 31 | getline(asFile, currentLine); 32 | tokenize(currentLine, tokens); 33 | } while (tokens.size() > 2); 34 | 35 | if (tokens.size() == 0) 36 | return 0; 37 | 38 | std::string depthLoc = directory; 39 | depthLoc.append(tokens[1]); 40 | 41 | pangolin::TypedImage depthRaw = 42 | 
pangolin::LoadImage(depthLoc, pangolin::ImageFileTypePng); 43 | 44 | pangolin::Image depthRaw16( 45 | (unsigned short *)depthRaw.ptr, depthRaw.w, depthRaw.h, 46 | depthRaw.w * sizeof(unsigned short)); 47 | 48 | tokenize(tokens[0], timeTokens, "."); 49 | 50 | std::string timeString = timeTokens[0]; 51 | timeString.append(timeTokens[1]); 52 | 53 | uint64_t time; 54 | std::istringstream(timeString) >> time; 55 | 56 | for (unsigned int i = 0; i < 480; i++) { 57 | for (unsigned int j = 0; j < 640; j++) { 58 | depth.RowPtr(i)[j] = depthRaw16(j, i) / 5; 59 | } 60 | } 61 | 62 | depthRaw.Dealloc(); 63 | 64 | return time; 65 | } 66 | 67 | void outputFreiburg(const std::string filename, const uint64_t ×tamp, 68 | const Eigen::Matrix4f ¤tPose) { 69 | std::ofstream file; 70 | file.open(filename.c_str(), std::fstream::app); 71 | 72 | std::stringstream strs; 73 | 74 | strs << std::setprecision(6) << std::fixed << (double)timestamp / 1000000.0 75 | << " "; 76 | 77 | Eigen::Vector3f trans = currentPose.topRightCorner(3, 1); 78 | Eigen::Matrix3f rot = currentPose.topLeftCorner(3, 3); 79 | 80 | file << strs.str() << trans(0) << " " << trans(1) << " " << trans(2) << " "; 81 | 82 | Eigen::Quaternionf currentCameraRotation(rot); 83 | 84 | file << currentCameraRotation.x() << " " << currentCameraRotation.y() << " " 85 | << currentCameraRotation.z() << " " << currentCameraRotation.w() << "\n"; 86 | 87 | file.close(); 88 | } 89 | 90 | uint64_t getCurrTime() { 91 | return std::chrono::duration_cast( 92 | std::chrono::high_resolution_clock::now().time_since_epoch()) 93 | .count(); 94 | } 95 | 96 | int main(int argc, char *argv[]) { 97 | assert((argc == 2 || argc == 3) && 98 | "Please supply the depth.txt dir as the first argument"); 99 | 100 | directory.append(argv[1]); 101 | 102 | if (directory.at(directory.size() - 1) != '/') { 103 | directory.append("/"); 104 | } 105 | 106 | std::string associationFile = directory; 107 | associationFile.append("depth.txt"); 108 | 109 | 
asFile.open(associationFile.c_str()); 110 | 111 | pangolin::ManagedImage firstData(640, 480); 112 | pangolin::ManagedImage secondData(640, 480); 113 | 114 | pangolin::Image firstRaw(firstData.w, firstData.h, 115 | firstData.pitch, 116 | (unsigned short *)firstData.ptr); 117 | pangolin::Image secondRaw(secondData.w, secondData.h, 118 | secondData.pitch, 119 | (unsigned short *)secondData.ptr); 120 | 121 | ICPOdometry icpOdom(640, 480, 319.5, 239.5, 528, 528); 122 | 123 | assert(!asFile.eof() && asFile.is_open()); 124 | 125 | loadDepth(firstRaw); 126 | uint64_t timestamp = loadDepth(secondRaw); 127 | 128 | Sophus::SE3d T_wc_prev; 129 | Sophus::SE3d T_wc_curr; 130 | 131 | std::ofstream file; 132 | file.open("output.poses", std::fstream::out); 133 | file.close(); 134 | 135 | cudaDeviceProp prop; 136 | 137 | cudaGetDeviceProperties(&prop, 0); 138 | 139 | std::string dev(prop.name); 140 | 141 | std::cout << dev << std::endl; 142 | 143 | float mean = std::numeric_limits::max(); 144 | int count = 0; 145 | 146 | int threads = 224; 147 | int blocks = 96; 148 | 149 | int bestThreads = threads; 150 | int bestBlocks = blocks; 151 | float best = mean; 152 | 153 | if (argc == 3) { 154 | std::string searchArg(argv[2]); 155 | 156 | if (searchArg.compare("-v") == 0) { 157 | std::cout 158 | << "Searching for the best thread/block configuration for your GPU..." 
159 | << std::endl; 160 | std::cout << "Best: " << bestThreads << " threads, " << bestBlocks 161 | << " blocks (" << best << "ms)"; 162 | std::cout.flush(); 163 | 164 | float counter = 0; 165 | 166 | for (threads = 16; threads <= 512; threads += 16) { 167 | for (blocks = 16; blocks <= 512; blocks += 16) { 168 | mean = 0.0f; 169 | count = 0; 170 | 171 | for (int i = 0; i < 5; i++) { 172 | icpOdom.initICPModel(firstRaw.ptr); 173 | icpOdom.initICP(secondRaw.ptr); 174 | 175 | uint64_t tick = getCurrTime(); 176 | 177 | T_wc_prev = T_wc_curr; 178 | 179 | Sophus::SE3d T_prev_curr = T_wc_prev.inverse() * T_wc_curr; 180 | 181 | icpOdom.getIncrementalTransformation(T_prev_curr, threads, blocks); 182 | 183 | T_wc_curr = T_wc_prev * T_prev_curr; 184 | 185 | uint64_t tock = getCurrTime(); 186 | 187 | mean = (float(count) * mean + (tock - tick) / 1000.0f) / 188 | float(count + 1); 189 | count++; 190 | } 191 | 192 | counter++; 193 | 194 | if (mean < best) { 195 | best = mean; 196 | bestThreads = threads; 197 | bestBlocks = blocks; 198 | } 199 | 200 | std::cout << "\rBest: " << bestThreads << " threads, " << bestBlocks 201 | << " blocks (" << best << "ms), " 202 | << int((counter / 1024.f) * 100.f) << "% "; 203 | std::cout.flush(); 204 | } 205 | } 206 | 207 | std::cout << std::endl; 208 | } 209 | } 210 | 211 | threads = bestThreads; 212 | blocks = bestBlocks; 213 | 214 | mean = 0.0f; 215 | count = 0; 216 | 217 | T_wc_prev = Sophus::SE3d(); 218 | T_wc_curr = Sophus::SE3d(); 219 | 220 | while (!asFile.eof()) { 221 | icpOdom.initICPModel(firstRaw.ptr); 222 | icpOdom.initICP(secondRaw.ptr); 223 | 224 | uint64_t tick = getCurrTime(); 225 | 226 | T_wc_prev = T_wc_curr; 227 | 228 | Sophus::SE3d T_prev_curr = T_wc_prev.inverse() * T_wc_curr; 229 | 230 | icpOdom.getIncrementalTransformation(T_prev_curr, threads, blocks); 231 | 232 | T_wc_curr = T_wc_prev * T_prev_curr; 233 | 234 | uint64_t tock = getCurrTime(); 235 | 236 | mean = (float(count) * mean + (tock - tick) / 1000.0f) / 
float(count + 1); 237 | count++; 238 | 239 | std::cout << std::setprecision(4) << std::fixed << "\rICP: " << mean 240 | << "ms"; 241 | std::cout.flush(); 242 | 243 | std::swap(firstRaw, secondRaw); 244 | 245 | outputFreiburg("output.poses", timestamp, T_wc_curr.cast().matrix()); 246 | 247 | timestamp = loadDepth(secondRaw); 248 | } 249 | 250 | std::cout << std::endl; 251 | 252 | std::cout << "ICP speed: " << int(1000.f / mean) << "Hz" << std::endl; 253 | 254 | return 0; 255 | } 256 | -------------------------------------------------------------------------------- /Cuda/estimate.cu: --------------------------------------------------------------------------------

// NOTE(review): the template headers, the Eigen::Matrix<...> argument lists,
// the PtrStep / DeviceArray element types and the estimateKernel launch
// configuration were missing from this copy and are reconstructed from usage
// (29-element accumulators, 3-vectors, float maps). Confirm against
// internal.h / the upstream file.

#include "containers/safe_call.hpp"
#include "internal.h"

/**
 * Sums val across the lanes of the calling warp; the total ends up in lane 0.
 *
 * activeLanes is the number of threads that exist in this warp (32 for a full
 * warp, fewer for the trailing warp of a block whose size is not a multiple
 * of 32). Only those lanes are named in the shuffle mask and only values
 * coming from an existing lane are accumulated, so a partial warp neither
 * names missing threads nor adds undefined data. For a full warp lane 0
 * receives exactly the same sum, in the same order, as an unguarded
 * shuffle-down reduction. Values left in lanes other than 0 are partial sums
 * and must not be used.
 */
template <int D>
__inline__ __device__ void
warpReduceSum(Eigen::Matrix<float, D, 1, Eigen::DontAlign> &val,
              int activeLanes = 32) {
  const unsigned mask =
      activeLanes >= 32 ? 0xFFFFFFFFu : ((1u << activeLanes) - 1u);
  const int lane = threadIdx.x % warpSize;

  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    const bool sourceExists = lane + offset < activeLanes;
#pragma unroll
    for (int i = 0; i < D; i++) {
      // Every active lane must execute the shuffle; the result is only used
      // when the source lane exists.
      const float other = __shfl_down_sync(mask, val[i], offset);
      if (sourceExists) {
        val[i] += other;
      }
    }
  }
}

/**
 * Sums val across all threads of a 1D block; the total ends up in thread 0.
 *
 * Must be called by every thread of the block (it contains __syncthreads()).
 * Uses a static shared buffer of 32 vectors, one slot per warp.
 *
 * The number of warps is rounded up, so a block whose size is not a multiple
 * of warpSize (or is smaller than one warp) still contributes all of its
 * threads; flooring the division would drop the trailing warp's sum.
 */
template <int D>
__inline__ __device__ void
blockReduceSum(Eigen::Matrix<float, D, 1, Eigen::DontAlign> &val) {
  // Allocate shared memory in two steps otherwise NVCC complains about Eigen's
  // non-empty constructor
  static __shared__ unsigned char
      sharedMem[32 * sizeof(Eigen::Matrix<float, D, 1, Eigen::DontAlign>)];

  Eigen::Matrix<float, D, 1, Eigen::DontAlign>(&shared)[32] =
      reinterpret_cast<Eigen::Matrix<float, D, 1, Eigen::DontAlign>(&)[32]>(
          sharedMem);

  const int lane = threadIdx.x % warpSize;
  const int wid = threadIdx.x / warpSize;

  const int numWarps = (blockDim.x + warpSize - 1) / warpSize;
  // Threads that actually exist in this thread's warp.
  const int lanesInWarp =
      min(warpSize, static_cast<int>(blockDim.x) - wid * warpSize);

  warpReduceSum<D>(val, lanesInWarp);

  // write reduced value to shared memory
  if (lane == 0) {
    shared[wid] = val;
  }
  __syncthreads();

  // ensure we only grab a value from shared memory if that warp existed
  val = (static_cast<int>(threadIdx.x) < numWarps)
            ? shared[lane]
            : Eigen::Matrix<float, D, 1, Eigen::DontAlign>::Zero();

  // The first warp combines the per-warp sums.
  if (wid == 0) {
    warpReduceSum<D>(val, lanesInWarp);
  }
}

/**
 * Sums in[0..N) into one vector per block: out[blockIdx.x] receives the sum
 * of the elements visited by that block.
 *
 * Launch layout: 1D grid and 1D block; each thread walks the input with a
 * stride of blockDim.x * gridDim.x.
 */
template <int D>
__global__ void reduceSum(Eigen::Matrix<float, D, 1, Eigen::DontAlign> *in,
                          Eigen::Matrix<float, D, 1, Eigen::DontAlign> *out,
                          int N) {
  Eigen::Matrix<float, D, 1, Eigen::DontAlign> sum =
      Eigen::Matrix<float, D, 1, Eigen::DontAlign>::Zero();

  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += blockDim.x * gridDim.x) {
    sum += in[i];
  }

  blockReduceSum<D>(sum);

  if (threadIdx.x == 0) {
    out[blockIdx.x] = sum;
  }
}

/**
 * Functor run by estimateKernel. For every pixel of the current frame it
 * projects the vertex into the previous frame, tests the correspondence and,
 * for accepted pairs, accumulates the upper triangle of row * row^T where
 * row = [n_prev, v_curr_in_prev x n_prev, n_prev . (v_prev - v_curr_in_prev)].
 *
 * Accumulator layout (29 floats): entries 0..27 are the upper-triangular
 * products of the 7-element row in row-major order, entry 28 counts the
 * accepted pairs. Each block writes its partial sum to out[blockIdx.x].
 */
struct Reduction {
  Eigen::Matrix<float, 3, 3, Eigen::DontAlign> R_prev_curr;
  Eigen::Matrix<float, 3, 1, Eigen::DontAlign> t_prev_curr;

  Intr intr;

  PtrStep<float> vmap_curr;
  PtrStep<float> nmap_curr;

  PtrStep<float> vmap_prev;
  PtrStep<float> nmap_prev;

  float dist_thresh;
  float angle_thresh;

  int cols;
  int rows;
  int N; // cols * rows

  Eigen::Matrix<float, 29, 1, Eigen::DontAlign> *out;

  // And now for some template metaprogramming magic
  //
  // Compile-time unrolled double loop: for outer in [0, end], inner in
  // [outer, end] it stores rows[outer] * rows[inner] at the packed
  // upper-triangular index (end + 1) * outer + inner - outer * (outer + 1) / 2.
  template <int outer, int inner, int end> struct SquareUpperTriangularProduct {
    __device__ __forceinline__ static void
    apply(Eigen::Matrix<float, 29, 1, Eigen::DontAlign> &values,
          const float (&rows)[end + 1]) {
      values[((end + 1) * outer) + inner - (outer * (outer + 1) / 2)] =
          rows[outer] * rows[inner];

      SquareUpperTriangularProduct<outer, inner + 1, end>::apply(values, rows);
    }
  };

  // Inner loop base
  template <int outer, int end>
  struct SquareUpperTriangularProduct<outer, end, end> {
    __device__ __forceinline__ static void
    apply(Eigen::Matrix<float, 29, 1, Eigen::DontAlign> &values,
          const float (&rows)[end + 1]) {
      values[((end + 1) * outer) + end - (outer * (outer + 1) / 2)] =
          rows[outer] * rows[end];

      SquareUpperTriangularProduct<outer + 1, outer + 1, end>::apply(values,
                                                                     rows);
    }
  };

  // Outer loop base
  template <int end> struct SquareUpperTriangularProduct<end, end, end> {
    __device__ __forceinline__ static void
    apply(Eigen::Matrix<float, 29, 1, Eigen::DontAlign> &values,
          const float (&rows)[end + 1]) {
      values[((end + 1) * end) + end - (end * (end + 1) / 2)] =
          rows[end] * rows[end];
    }
  };

  __device__ __forceinline__ void operator()() const {
    Eigen::Matrix<float, 29, 1, Eigen::DontAlign> sum =
        Eigen::Matrix<float, 29, 1, Eigen::DontAlign>::Zero();

    SquareUpperTriangularProduct<0, 0, 6> sutp;

    Eigen::Matrix<float, 29, 1, Eigen::DontAlign> values;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
         i += blockDim.x * gridDim.x) {
      const int y = i / cols;
      const int x = i - (y * cols);

      // Maps are three planes stacked vertically: x, then y, then z.
      const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> v_curr(
          vmap_curr.ptr(y)[x], vmap_curr.ptr(y + rows)[x],
          vmap_curr.ptr(y + 2 * rows)[x]);

      const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> v_curr_in_prev =
          R_prev_curr * v_curr + t_prev_curr;

      // Pixel of the previous frame that the transformed vertex projects to.
      const Eigen::Matrix<int, 2, 1, Eigen::DontAlign> p_curr_in_prev(
          __float2int_rn(v_curr_in_prev(0) * intr.fx / v_curr_in_prev(2) +
                         intr.cx),
          __float2int_rn(v_curr_in_prev(1) * intr.fy / v_curr_in_prev(2) +
                         intr.cy));

      float row[7] = {0, 0, 0, 0, 0, 0, 0};

      values[28] = 0;

      if (p_curr_in_prev(0) >= 0 && p_curr_in_prev(1) >= 0 &&
          p_curr_in_prev(0) < cols && p_curr_in_prev(1) < rows &&
          v_curr(2) > 0 && v_curr_in_prev(2) > 0) {
        const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> v_prev(
            vmap_prev.ptr(p_curr_in_prev(1))[p_curr_in_prev(0)],
            vmap_prev.ptr(p_curr_in_prev(1) + rows)[p_curr_in_prev(0)],
            vmap_prev.ptr(p_curr_in_prev(1) + 2 * rows)[p_curr_in_prev(0)]);

        const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> n_curr(
            nmap_curr.ptr(y)[x], nmap_curr.ptr(y + rows)[x],
            nmap_curr.ptr(y + 2 * rows)[x]);

        const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> n_curr_in_prev =
            R_prev_curr * n_curr;

        const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> n_prev(
            nmap_prev.ptr(p_curr_in_prev(1))[p_curr_in_prev(0)],
            nmap_prev.ptr(p_curr_in_prev(1) + rows)[p_curr_in_prev(0)],
            nmap_prev.ptr(p_curr_in_prev(1) + 2 * rows)[p_curr_in_prev(0)]);

        // Accept the pair only if the normals and the positions agree within
        // the thresholds and both normals are valid (non-NaN x component).
        if (n_curr_in_prev.cross(n_prev).norm() < angle_thresh &&
            (v_prev - v_curr_in_prev).norm() < dist_thresh &&
            !isnan(n_curr(0)) && !isnan(n_prev(0))) {
          *(Eigen::Matrix<float, 3, 1, Eigen::DontAlign> *)&row[0] = n_prev;
          *(Eigen::Matrix<float, 3, 1, Eigen::DontAlign> *)&row[3] =
              v_curr_in_prev.cross(n_prev);
          row[6] = n_prev.dot(v_prev - v_curr_in_prev);

          values[28] = 1;

          sutp.apply(values, row);

          sum += values;
        }
      }
    }

    blockReduceSum<29>(sum);

    if (threadIdx.x == 0) {
      out[blockIdx.x] = sum;
    }
  }
};

/** Kernel entry point: runs the Reduction functor. 1D grid, 1D block. */
__global__ void estimateKernel(const Reduction reduction) { reduction(); }

/**
 * Runs one reduction pass and unpacks the result on the host.
 *
 * Launches estimateKernel with `blocks` blocks of `threads` threads, writing
 * one partial sum per block into `sum`, then reduces those `blocks` partial
 * sums into `out` with a single block of MAX_THREADS threads. After
 * synchronising, the first 29 floats are unpacked into:
 *   - matrixA_host: symmetric 6x6 matrix (36 floats, both triangles filled),
 *   - vectorB_host: 6 floats,
 *   - residual_inliers[0]: entry 27 (squared residual sum),
 *   - residual_inliers[1]: entry 28 (number of accepted pairs).
 *
 * NOTE(review): `sum` is assumed to hold at least `blocks` elements and `out`
 * exactly one; neither is checked here - verify against the caller.
 */
void estimateStep(
    const Eigen::Matrix<float, 3, 3, Eigen::DontAlign> &R_prev_curr,
    const Eigen::Matrix<float, 3, 1, Eigen::DontAlign> &t_prev_curr,
    const DeviceArray2D<float> &vmap_curr,
    const DeviceArray2D<float> &nmap_curr, const Intr &intr,
    const DeviceArray2D<float> &vmap_prev,
    const DeviceArray2D<float> &nmap_prev, float dist_thresh,
    float angle_thresh,
    DeviceArray<Eigen::Matrix<float, 29, 1, Eigen::DontAlign>> &sum,
    DeviceArray<Eigen::Matrix<float, 29, 1, Eigen::DontAlign>> &out,
    float *matrixA_host, float *vectorB_host, float *residual_inliers,
    int threads, int blocks) {
  const int cols = vmap_curr.cols();
  const int rows = vmap_curr.rows() / 3; // three stacked planes

  Reduction reduction;

  reduction.R_prev_curr = R_prev_curr;
  reduction.t_prev_curr = t_prev_curr;

  reduction.vmap_curr = vmap_curr;
  reduction.nmap_curr = nmap_curr;

  reduction.intr = intr;

  reduction.vmap_prev = vmap_prev;
  reduction.nmap_prev = nmap_prev;

  reduction.dist_thresh = dist_thresh;
  reduction.angle_thresh = angle_thresh;

  reduction.cols = cols;
  reduction.rows = rows;

  reduction.N = cols * rows;
  reduction.out = sum;

  estimateKernel<<<blocks, threads>>>(reduction);
  // Report a bad launch configuration against the launch that caused it.
  cudaSafeCall(cudaGetLastError());

  reduceSum<29><<<1, MAX_THREADS>>>(sum, out, blocks);

  cudaSafeCall(cudaGetLastError());
  cudaSafeCall(cudaDeviceSynchronize());

  float host_data[29];
  out.download((Eigen::Matrix<float, 29, 1, Eigen::DontAlign> *)&host_data[0]);

  // Unpack the row-major upper triangle of the 7x7 product: the first six
  // columns of each row go into A (mirrored), the seventh into b.
  int shift = 0;
  for (int i = 0; i < 6; ++i) // rows
  {
    for (int j = i; j < 7; ++j) // cols + b
    {
      const float value = host_data[shift++];
      if (j == 6) // vector b
        vectorB_host[i] = value;
      else
        matrixA_host[j * 6 + i] = matrixA_host[i * 6 + j] = value;
    }
  }

  residual_inliers[0] = host_data[27];
  residual_inliers[1] = host_data[28];
}

-------------------------------------------------------------------------------- /Cuda/containers/device_memory.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above 14 | * copyright notice, this list of conditions and the following 15 | * disclaimer in the documentation and/or other materials provided 16 | * with the distribution. 17 | * * Neither the name of Willow Garage, Inc. nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | #include "device_memory.hpp" 38 | #include "safe_call.hpp" 39 | 40 | #include "assert.h" 41 | #include "cuda_runtime_api.h" 42 | 43 | ////////////////////////// XADD /////////////////////////////// 44 | 45 | #ifdef __GNUC__ 46 | 47 | #if __GNUC__ * 10 + __GNUC_MINOR__ >= 42 48 | 49 | #if !defined WIN32 && \ 50 | (defined __i486__ || defined __i586__ || defined __i686__ || \ 51 | defined __MMX__ || defined __SSE__ || defined __ppc__) 52 | #define CV_XADD __sync_fetch_and_add 53 | #else 54 | #include 55 | #define CV_XADD __gnu_cxx::__exchange_and_add 56 | #endif 57 | #else 58 | #include 59 | #if __GNUC__ * 10 + __GNUC_MINOR__ >= 34 60 | #define CV_XADD __gnu_cxx::__exchange_and_add 61 | #else 62 | #define CV_XADD __exchange_and_add 63 | #endif 64 | #endif 65 | 66 | #elif defined WIN32 || defined _WIN32 67 | #include 68 | #define CV_XADD(addr, delta) \ 69 | _InterlockedExchangeAdd((long volatile *)(addr), (delta)) 70 | #else 71 | 72 | template static inline _Tp CV_XADD(_Tp *addr, _Tp delta) { 73 | int tmp = *addr; 74 | *addr += delta; 75 | return tmp; 76 | } 77 | 78 | #endif 79 | 80 | //////////////////////// DeviceArray ///////////////////////////// 81 | 82 | DeviceMemory::DeviceMemory() : data_(0), sizeBytes_(0), refcount_(0) {} 83 | DeviceMemory::DeviceMemory(void *ptr_arg, size_t sizeBytes_arg) 84 | : 
data_(ptr_arg), sizeBytes_(sizeBytes_arg), refcount_(0) {} 85 | DeviceMemory::DeviceMemory(size_t sizeBtes_arg) 86 | : data_(0), sizeBytes_(0), refcount_(0) { 87 | create(sizeBtes_arg); 88 | } 89 | DeviceMemory::~DeviceMemory() { release(); } 90 | 91 | DeviceMemory::DeviceMemory(const DeviceMemory &other_arg) 92 | : data_(other_arg.data_), sizeBytes_(other_arg.sizeBytes_), 93 | refcount_(other_arg.refcount_) { 94 | if (refcount_) 95 | CV_XADD(refcount_, 1); 96 | } 97 | 98 | DeviceMemory &DeviceMemory::operator=(const DeviceMemory &other_arg) { 99 | if (this != &other_arg) { 100 | if (other_arg.refcount_) 101 | CV_XADD(other_arg.refcount_, 1); 102 | release(); 103 | 104 | data_ = other_arg.data_; 105 | sizeBytes_ = other_arg.sizeBytes_; 106 | refcount_ = other_arg.refcount_; 107 | } 108 | return *this; 109 | } 110 | 111 | void DeviceMemory::create(size_t sizeBytes_arg) { 112 | if (sizeBytes_arg == sizeBytes_) 113 | return; 114 | 115 | if (sizeBytes_arg > 0) { 116 | if (data_) 117 | release(); 118 | 119 | sizeBytes_ = sizeBytes_arg; 120 | 121 | cudaSafeCall(cudaMalloc(&data_, sizeBytes_)); 122 | 123 | // refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_)); 124 | refcount_ = new int; 125 | *refcount_ = 1; 126 | } 127 | } 128 | 129 | void DeviceMemory::copyTo(DeviceMemory &other) const { 130 | if (empty()) 131 | other.release(); 132 | else { 133 | other.create(sizeBytes_); 134 | cudaSafeCall( 135 | cudaMemcpy(other.data_, data_, sizeBytes_, cudaMemcpyDeviceToDevice)); 136 | cudaSafeCall(cudaDeviceSynchronize()); 137 | } 138 | } 139 | 140 | void DeviceMemory::release() { 141 | if (refcount_ && CV_XADD(refcount_, -1) == 1) { 142 | // cv::fastFree(refcount); 143 | delete refcount_; 144 | cudaSafeCall(cudaFree(data_)); 145 | } 146 | data_ = 0; 147 | sizeBytes_ = 0; 148 | refcount_ = 0; 149 | } 150 | 151 | void DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg) { 152 | create(sizeBytes_arg); 153 | cudaSafeCall( 154 | cudaMemcpy(data_, host_ptr_arg, 
sizeBytes_, cudaMemcpyHostToDevice)); 155 | cudaSafeCall(cudaDeviceSynchronize()); 156 | } 157 | 158 | void DeviceMemory::download(void *host_ptr_arg) const { 159 | cudaSafeCall( 160 | cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost)); 161 | cudaSafeCall(cudaDeviceSynchronize()); 162 | } 163 | 164 | void DeviceMemory::swap(DeviceMemory &other_arg) { 165 | std::swap(data_, other_arg.data_); 166 | std::swap(sizeBytes_, other_arg.sizeBytes_); 167 | std::swap(refcount_, other_arg.refcount_); 168 | } 169 | 170 | bool DeviceMemory::empty() const { return !data_; } 171 | size_t DeviceMemory::sizeBytes() const { return sizeBytes_; } 172 | 173 | //////////////////////// DeviceArray2D ///////////////////////////// 174 | 175 | DeviceMemory2D::DeviceMemory2D() 176 | : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) {} 177 | 178 | DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg) 179 | : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) { 180 | create(rows_arg, colsBytes_arg); 181 | } 182 | 183 | DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, 184 | size_t step_arg) 185 | : data_(data_arg), step_(step_arg), colsBytes_(colsBytes_arg), 186 | rows_(rows_arg), refcount_(0) {} 187 | 188 | DeviceMemory2D::~DeviceMemory2D() { release(); } 189 | 190 | DeviceMemory2D::DeviceMemory2D(const DeviceMemory2D &other_arg) 191 | : data_(other_arg.data_), step_(other_arg.step_), 192 | colsBytes_(other_arg.colsBytes_), rows_(other_arg.rows_), 193 | refcount_(other_arg.refcount_) { 194 | if (refcount_) 195 | CV_XADD(refcount_, 1); 196 | } 197 | 198 | DeviceMemory2D &DeviceMemory2D::operator=(const DeviceMemory2D &other_arg) { 199 | if (this != &other_arg) { 200 | if (other_arg.refcount_) 201 | CV_XADD(other_arg.refcount_, 1); 202 | release(); 203 | 204 | colsBytes_ = other_arg.colsBytes_; 205 | rows_ = other_arg.rows_; 206 | data_ = other_arg.data_; 207 | step_ = other_arg.step_; 208 | 209 | refcount_ = 
other_arg.refcount_; 210 | } 211 | return *this; 212 | } 213 | 214 | void DeviceMemory2D::create(int rows_arg, int colsBytes_arg) { 215 | if (colsBytes_ == colsBytes_arg && rows_ == rows_arg) 216 | return; 217 | 218 | if (rows_arg > 0 && colsBytes_arg > 0) { 219 | if (data_) 220 | release(); 221 | 222 | colsBytes_ = colsBytes_arg; 223 | rows_ = rows_arg; 224 | 225 | cudaSafeCall(cudaMallocPitch((void **)&data_, &step_, colsBytes_, rows_)); 226 | 227 | // refcount = (int*)cv::fastMalloc(sizeof(*refcount)); 228 | refcount_ = new int; 229 | *refcount_ = 1; 230 | } 231 | } 232 | 233 | void DeviceMemory2D::release() { 234 | if (refcount_ && CV_XADD(refcount_, -1) == 1) { 235 | // cv::fastFree(refcount); 236 | delete refcount_; 237 | cudaSafeCall(cudaFree(data_)); 238 | } 239 | 240 | colsBytes_ = 0; 241 | rows_ = 0; 242 | data_ = 0; 243 | step_ = 0; 244 | refcount_ = 0; 245 | } 246 | 247 | void DeviceMemory2D::copyTo(DeviceMemory2D &other) const { 248 | if (empty()) 249 | other.release(); 250 | else { 251 | other.create(rows_, colsBytes_); 252 | cudaSafeCall(cudaMemcpy2D(other.data_, other.step_, data_, step_, 253 | colsBytes_, rows_, cudaMemcpyDeviceToDevice)); 254 | cudaSafeCall(cudaDeviceSynchronize()); 255 | } 256 | } 257 | 258 | void DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, 259 | int rows_arg, int colsBytes_arg) { 260 | create(rows_arg, colsBytes_arg); 261 | cudaSafeCall(cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, 262 | colsBytes_, rows_, cudaMemcpyHostToDevice)); 263 | } 264 | 265 | void DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const { 266 | cudaSafeCall(cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, 267 | colsBytes_, rows_, cudaMemcpyDeviceToHost)); 268 | } 269 | 270 | void DeviceMemory2D::swap(DeviceMemory2D &other_arg) { 271 | std::swap(data_, other_arg.data_); 272 | std::swap(step_, other_arg.step_); 273 | 274 | std::swap(colsBytes_, other_arg.colsBytes_); 275 | std::swap(rows_, 
other_arg.rows_); 276 | std::swap(refcount_, other_arg.refcount_); 277 | } 278 | 279 | bool DeviceMemory2D::empty() const { return !data_; } 280 | int DeviceMemory2D::colsBytes() const { return colsBytes_; } 281 | int DeviceMemory2D::rows() const { return rows_; } 282 | size_t DeviceMemory2D::step() const { return step_; } 283 | -------------------------------------------------------------------------------- /Cuda/containers/device_memory.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above 14 | * copyright notice, this list of conditions and the following 15 | * disclaimer in the documentation and/or other materials provided 16 | * with the distribution. 17 | * * Neither the name of Willow Garage, Inc. nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | #ifndef DEVICE_MEMORY_HPP_ 38 | #define DEVICE_MEMORY_HPP_ 39 | 40 | #include "kernel_containers.hpp" 41 | 42 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 43 | /** \brief @b DeviceMemory class 44 | * 45 | * \note This is a BLOB container class with reference counting for GPU memory. 46 | * 47 | * \author Anatoly Baksheev 48 | */ 49 | 50 | class DeviceMemory 51 | { 52 | public: 53 | /** \brief Empty constructor. */ 54 | DeviceMemory(); 55 | 56 | /** \brief Destructor. */ 57 | ~DeviceMemory(); 58 | 59 | /** \brief Allocates internal buffer in GPU memory 60 | * \param sizeBytes_arg: amount of memory to allocate 61 | * */ 62 | DeviceMemory(size_t sizeBytes_arg); 63 | 64 | /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. 65 | * \param ptr_arg: pointer to buffer 66 | * \param sizeBytes_arg: buffer size 67 | * */ 68 | DeviceMemory(void *ptr_arg, size_t sizeBytes_arg); 69 | 70 | /** \brief Copy constructor. Just increments reference counter. */ 71 | DeviceMemory(const DeviceMemory& other_arg); 72 | 73 | /** \brief Assigment operator. Just increments reference counter. */ 74 | DeviceMemory& operator=(const DeviceMemory& other_arg); 75 | 76 | /** \brief Allocates internal buffer in GPU memory. 
If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. 77 | * \param sizeBytes_arg: buffer size 78 | * */ 79 | void create(size_t sizeBytes_arg); 80 | 81 | /** \brief Decrements reference counter and releases internal buffer if needed. */ 82 | void release(); 83 | 84 | /** \brief Performs data copying. If destination size differs it will be reallocated. 85 | * \param other_arg: destination container 86 | * */ 87 | void copyTo(DeviceMemory& other) const; 88 | 89 | /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 90 | * \param host_ptr_arg: pointer to buffer to upload 91 | * \param sizeBytes_arg: buffer size 92 | * */ 93 | void upload(const void *host_ptr_arg, size_t sizeBytes_arg); 94 | 95 | /** \brief Downloads data from internal buffer to CPU memory 96 | * \param host_ptr_arg: pointer to buffer to download 97 | * */ 98 | void download(void *host_ptr_arg) const; 99 | 100 | /** \brief Performs swap of data pointed with another device memory. 101 | * \param other: device memory to swap with 102 | * */ 103 | void swap(DeviceMemory& other_arg); 104 | 105 | /** \brief Returns pointer for internal buffer in GPU memory. */ 106 | template T* ptr(); 107 | 108 | /** \brief Returns constant pointer for internal buffer in GPU memory. */ 109 | template const T* ptr() const; 110 | 111 | /** \brief Conversion to PtrSz for passing to kernel functions. */ 112 | template operator PtrSz() const; 113 | 114 | /** \brief Returns true if unallocated otherwise false. */ 115 | bool empty() const; 116 | 117 | size_t sizeBytes() const; 118 | 119 | private: 120 | /** \brief Device pointer. */ 121 | void *data_; 122 | 123 | /** \brief Allocated size in bytes. */ 124 | size_t sizeBytes_; 125 | 126 | /** \brief Pointer to reference counter in CPU memory. 
*/ 127 | int* refcount_; 128 | }; 129 | 130 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 131 | /** \brief @b DeviceMemory2D class 132 | * 133 | * \note This is a BLOB container class with reference counting for pitched GPU memory. 134 | * 135 | * \author Anatoly Baksheev 136 | */ 137 | 138 | class DeviceMemory2D 139 | { 140 | public: 141 | /** \brief Empty constructor. */ 142 | DeviceMemory2D(); 143 | 144 | /** \brief Destructor. */ 145 | ~DeviceMemory2D(); 146 | 147 | /** \brief Allocates internal buffer in GPU memory 148 | * \param rows_arg: number of rows to allocate 149 | * \param colsBytes_arg: width of the buffer in bytes 150 | * */ 151 | DeviceMemory2D(int rows_arg, int colsBytes_arg); 152 | 153 | 154 | /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. 155 | * \param rows_arg: number of rows 156 | * \param colsBytes_arg: width of the buffer in bytes 157 | * \param data_arg: pointer to buffer 158 | * \param stepBytes_arg: stride between two consecutive rows in bytes 159 | * */ 160 | DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg); 161 | 162 | /** \brief Copy constructor. Just increments reference counter. */ 163 | DeviceMemory2D(const DeviceMemory2D& other_arg); 164 | 165 | /** \brief Assigment operator. Just increments reference counter. */ 166 | DeviceMemory2D& operator=(const DeviceMemory2D& other_arg); 167 | 168 | /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. 169 | * \param ptr_arg: number of rows to allocate 170 | * \param sizeBytes_arg: width of the buffer in bytes 171 | * */ 172 | void create(int rows_arg, int colsBytes_arg); 173 | 174 | /** \brief Decrements reference counter and releases internal buffer if needed. 
*/ 175 | void release(); 176 | 177 | /** \brief Performs data copying. If destination size differs it will be reallocated. 178 | * \param other_arg: destination container 179 | * */ 180 | void copyTo(DeviceMemory2D& other) const; 181 | 182 | /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 183 | * \param host_ptr_arg: pointer to host buffer to upload 184 | * \param host_step_arg: stride between two consecutive rows in bytes for host buffer 185 | * \param rows_arg: number of rows to upload 186 | * \param sizeBytes_arg: width of host buffer in bytes 187 | * */ 188 | void upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg); 189 | 190 | /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. 191 | * \param host_ptr_arg: pointer to host buffer to download 192 | * \param host_step_arg: stride between two consecutive rows in bytes for host buffer 193 | * */ 194 | void download(void *host_ptr_arg, size_t host_step_arg) const; 195 | 196 | /** \brief Performs swap of data pointed with another device memory. 197 | * \param other: device memory to swap with 198 | * */ 199 | void swap(DeviceMemory2D& other_arg); 200 | 201 | /** \brief Returns pointer to given row in internal buffer. 202 | * \param y_arg: row index 203 | * */ 204 | template T* ptr(int y_arg = 0); 205 | 206 | /** \brief Returns constant pointer to given row in internal buffer. 207 | * \param y_arg: row index 208 | * */ 209 | template const T* ptr(int y_arg = 0) const; 210 | 211 | /** \brief Conversion to PtrStep for passing to kernel functions. */ 212 | template operator PtrStep() const; 213 | 214 | /** \brief Conversion to PtrStepSz for passing to kernel functions. */ 215 | template operator PtrStepSz() const; 216 | 217 | /** \brief Returns true if unallocated otherwise false. 
*/ 218 | bool empty() const; 219 | 220 | /** \brief Returns number of bytes in each row. */ 221 | int colsBytes() const; 222 | 223 | /** \brief Returns number of rows. */ 224 | int rows() const; 225 | 226 | /** \brief Returns stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ 227 | size_t step() const; 228 | private: 229 | /** \brief Device pointer. */ 230 | void *data_; 231 | 232 | /** \brief Stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ 233 | size_t step_; 234 | 235 | /** \brief Width of the buffer in bytes. */ 236 | int colsBytes_; 237 | 238 | /** \brief Number of rows. */ 239 | int rows_; 240 | 241 | /** \brief Pointer to reference counter in CPU memory. */ 242 | int* refcount_; 243 | }; 244 | 245 | #include "device_memory_impl.hpp" 246 | 247 | #endif /* DEVICE_MEMORY_HPP_ */ 248 | -------------------------------------------------------------------------------- /Cuda/containers/device_array.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above 14 | * copyright notice, this list of conditions and the following 15 | * disclaimer in the documentation and/or other materials provided 16 | * with the distribution. 17 | * * Neither the name of Willow Garage, Inc. 
nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | #ifndef DEVICE_ARRAY_HPP_ 38 | #define DEVICE_ARRAY_HPP_ 39 | 40 | #include "device_memory.hpp" 41 | 42 | #include 43 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 44 | /** \brief @b DeviceArray class 45 | * 46 | * \note Typed container for GPU memory with reference counting. 47 | * 48 | * \author Anatoly Baksheev 49 | */ 50 | template 51 | class DeviceArray : public DeviceMemory 52 | { 53 | public: 54 | /** \brief Element type. */ 55 | typedef T type; 56 | 57 | /** \brief Element size. */ 58 | enum { elem_size = sizeof(T) }; 59 | 60 | /** \brief Empty constructor. */ 61 | DeviceArray(); 62 | 63 | /** \brief Allocates internal buffer in GPU memory 64 | * \param size_t: number of elements to allocate 65 | * */ 66 | DeviceArray(size_t size); 67 | 68 | /** \brief Initializes with user allocated buffer. 
Reference counting is disabled in this case. 69 | * \param ptr: pointer to buffer 70 | * \param size: elemens number 71 | * */ 72 | DeviceArray(T *ptr, size_t size); 73 | 74 | /** \brief Copy constructor. Just increments reference counter. */ 75 | DeviceArray(const DeviceArray& other); 76 | 77 | /** \brief Assigment operator. Just increments reference counter. */ 78 | DeviceArray& operator = (const DeviceArray& other); 79 | 80 | /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. 81 | * \param size: elemens number 82 | * */ 83 | void create(size_t size); 84 | 85 | /** \brief Decrements reference counter and releases internal buffer if needed. */ 86 | void release(); 87 | 88 | /** \brief Performs data copying. If destination size differs it will be reallocated. 89 | * \param other_arg: destination container 90 | * */ 91 | void copyTo(DeviceArray& other) const; 92 | 93 | /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 94 | * \param host_ptr_arg: pointer to buffer to upload 95 | * \param size: elemens number 96 | * */ 97 | void upload(const T *host_ptr, size_t size); 98 | 99 | /** \brief Downloads data from internal buffer to CPU memory 100 | * \param host_ptr_arg: pointer to buffer to download 101 | * */ 102 | void download(T *host_ptr) const; 103 | 104 | /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 
105 | * \param data: host vector to upload from 106 | * */ 107 | template 108 | void upload(const std::vector& data); 109 | 110 | /** \brief Downloads data from internal buffer to CPU memory 111 | * \param data: host vector to download to 112 | * */ 113 | template 114 | void download(std::vector& data) const; 115 | 116 | /** \brief Performs swap of data pointed with another device array. 117 | * \param other: device array to swap with 118 | * */ 119 | void swap(DeviceArray& other_arg); 120 | 121 | /** \brief Returns pointer for internal buffer in GPU memory. */ 122 | T* ptr(); 123 | 124 | /** \brief Returns const pointer for internal buffer in GPU memory. */ 125 | const T* ptr() const; 126 | 127 | //using DeviceMemory::ptr; 128 | 129 | /** \brief Returns pointer for internal buffer in GPU memory. */ 130 | operator T*(); 131 | 132 | /** \brief Returns const pointer for internal buffer in GPU memory. */ 133 | operator const T*() const; 134 | 135 | /** \brief Returns size in elements. */ 136 | size_t size() const; 137 | }; 138 | 139 | 140 | ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 141 | /** \brief @b DeviceArray2D class 142 | * 143 | * \note Typed container for pitched GPU memory with reference counting. 144 | * 145 | * \author Anatoly Baksheev 146 | */ 147 | template 148 | class DeviceArray2D : public DeviceMemory2D 149 | { 150 | public: 151 | /** \brief Element type. */ 152 | typedef T type; 153 | 154 | /** \brief Element size. */ 155 | enum { elem_size = sizeof(T) }; 156 | 157 | /** \brief Empty constructor. */ 158 | DeviceArray2D(); 159 | 160 | /** \brief Allocates internal buffer in GPU memory 161 | * \param rows: number of rows to allocate 162 | * \param cols: number of elements in each row 163 | * */ 164 | DeviceArray2D(int rows, int cols); 165 | 166 | /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. 
167 | * \param rows: number of rows 168 | * \param cols: number of elements in each row 169 | * \param data: pointer to buffer 170 | * \param stepBytes: stride between two consecutive rows in bytes 171 | * */ 172 | DeviceArray2D(int rows, int cols, void *data, size_t stepBytes); 173 | 174 | /** \brief Copy constructor. Just increments reference counter. */ 175 | DeviceArray2D(const DeviceArray2D& other); 176 | 177 | /** \brief Assigment operator. Just increments reference counter. */ 178 | DeviceArray2D& operator = (const DeviceArray2D& other); 179 | 180 | /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. 181 | * \param rows: number of rows to allocate 182 | * \param cols: number of elements in each row 183 | * */ 184 | void create(int rows, int cols); 185 | 186 | /** \brief Decrements reference counter and releases internal buffer if needed. */ 187 | void release(); 188 | 189 | /** \brief Performs data copying. If destination size differs it will be reallocated. 190 | * \param other: destination container 191 | * */ 192 | void copyTo(DeviceArray2D& other) const; 193 | 194 | /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 195 | * \param host_ptr: pointer to host buffer to upload 196 | * \param host_step: stride between two consecutive rows in bytes for host buffer 197 | * \param rows: number of rows to upload 198 | * \param cols: number of elements in each row 199 | * */ 200 | void upload(const void *host_ptr, size_t host_step, int rows, int cols); 201 | 202 | /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. 
203 | * \param host_ptr: pointer to host buffer to download 204 | * \param host_step: stride between two consecutive rows in bytes for host buffer 205 | * */ 206 | void download(void *host_ptr, size_t host_step) const; 207 | 208 | /** \brief Performs swap of data pointed with another device array. 209 | * \param other: device array to swap with 210 | * */ 211 | void swap(DeviceArray2D& other_arg); 212 | 213 | /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 214 | * \param data: host vector to upload from 215 | * \param cols: stride in elements between two consecutive rows in bytes for host buffer 216 | * */ 217 | template 218 | void upload(const std::vector& data, int cols); 219 | 220 | /** \brief Downloads data from internal buffer to CPU memory 221 | * \param data: host vector to download to 222 | * \param cols: Output stride in elements between two consecutive rows in bytes for host vector. 223 | * */ 224 | template 225 | void download(std::vector& data, int& cols) const; 226 | 227 | /** \brief Returns pointer to given row in internal buffer. 228 | * \param y_arg: row index 229 | * */ 230 | T* ptr(int y = 0); 231 | 232 | /** \brief Returns const pointer to given row in internal buffer. 233 | * \param y_arg: row index 234 | * */ 235 | const T* ptr(int y = 0) const; 236 | 237 | //using DeviceMemory2D::ptr; 238 | 239 | /** \brief Returns pointer for internal buffer in GPU memory. */ 240 | operator T*(); 241 | 242 | /** \brief Returns const pointer for internal buffer in GPU memory. */ 243 | operator const T*() const; 244 | 245 | /** \brief Returns number of elements in each row. */ 246 | int cols() const; 247 | 248 | /** \brief Returns number of rows. */ 249 | int rows() const; 250 | 251 | /** \brief Returns step in elements. 
*/ 252 | size_t elem_step() const; 253 | }; 254 | 255 | #include "device_array_impl.hpp" 256 | 257 | #endif /* DEVICE_ARRAY_HPP_ */ 258 | -------------------------------------------------------------------------------- /Cuda/containers/initialization.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Software License Agreement (BSD License) 3 | * 4 | * Copyright (c) 2011, Willow Garage, Inc. 5 | * All rights reserved. 6 | * 7 | * Redistribution and use in source and binary forms, with or without 8 | * modification, are permitted provided that the following conditions 9 | * are met: 10 | * 11 | * * Redistributions of source code must retain the above copyright 12 | * notice, this list of conditions and the following disclaimer. 13 | * * Redistributions in binary form must reproduce the above 14 | * copyright notice, this list of conditions and the following 15 | * disclaimer in the documentation and/or other materials provided 16 | * with the distribution. 17 | * * Neither the name of Willow Garage, Inc. nor the names of its 18 | * contributors may be used to endorse or promote products derived 19 | * from this software without specific prior written permission. 20 | * 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE 25 | * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 29 | * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 30 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 31 | * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 | * POSSIBILITY OF SUCH DAMAGE. 33 | * 34 | * Author: Anatoly Baskeheev, Itseez Ltd, (myname.mysurname@mycompany.com) 35 | */ 36 | 37 | #include "initialization.hpp" 38 | #include "cuda.h" 39 | #include "safe_call.hpp" 40 | #include 41 | #include 42 | #include 43 | 44 | int getCudaEnabledDeviceCount() { 45 | int count; 46 | cudaError_t error = cudaGetDeviceCount(&count); 47 | 48 | if (error == cudaErrorInsufficientDriver) 49 | return -1; 50 | 51 | if (error == cudaErrorNoDevice) 52 | return 0; 53 | 54 | cudaSafeCall(error); 55 | return count; 56 | } 57 | 58 | void setDevice(int device) { cudaSafeCall(cudaSetDevice(device)); } 59 | 60 | std::string getDeviceName(int device) { 61 | cudaDeviceProp prop; 62 | cudaSafeCall(cudaGetDeviceProperties(&prop, device)); 63 | 64 | return prop.name; 65 | } 66 | 67 | bool checkIfPreFermiGPU(int device) { 68 | if (device < 0) 69 | cudaSafeCall(cudaGetDevice(&device)); 70 | 71 | cudaDeviceProp prop; 72 | cudaSafeCall(cudaGetDeviceProperties(&prop, device)); 73 | return prop.major < 2; // CC == 1.x 74 | } 75 | 76 | void error(const char *error_string, const char *file, const int line, 77 | const char *func) { 78 | std::cout << "Error: " << error_string << "\t" << file << ":" << line 79 | << std::endl; 80 | exit(0); 81 | } 82 | 83 | template 84 | inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, 85 | int device) { 86 | *attribute = T(); 87 | CUresult error1 = CUDA_SUCCESS; // 
= cuDeviceGetAttribute( attribute, 88 | // device_attribute, device ); 89 | if (CUDA_SUCCESS == error1) 90 | return; 91 | 92 | printf("Driver API error = %04d\n", error1); 93 | error("driver API error", __FILE__, __LINE__); 94 | } 95 | 96 | inline int convertSMVer2Cores(int major, int minor) { 97 | // Defines for GPU Architecture types (using the SM version to determine the # 98 | // of cores per SM 99 | typedef struct { 100 | int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM 101 | // minor version 102 | int Cores; 103 | } SMtoCores; 104 | 105 | SMtoCores gpuArchCoresPerSM[] = {{0x10, 8}, {0x11, 8}, {0x12, 8}, {0x13, 8}, 106 | {0x20, 32}, {0x21, 48}, {-1, -1}}; 107 | 108 | int index = 0; 109 | while (gpuArchCoresPerSM[index].SM != -1) { 110 | if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor)) 111 | return gpuArchCoresPerSM[index].Cores; 112 | index++; 113 | } 114 | printf("MapSMtoCores undefined SMversion %d.%d!\n", major, minor); 115 | return -1; 116 | } 117 | 118 | void printCudaDeviceInfo(int device) { 119 | int count = getCudaEnabledDeviceCount(); 120 | bool valid = (device >= 0) && (device < count); 121 | 122 | int beg = valid ? device : 0; 123 | int end = valid ? 
device + 1 : count; 124 | 125 | printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) " 126 | "*** \n\n"); 127 | printf("Device count: %d\n", count); 128 | 129 | int driverVersion = 0, runtimeVersion = 0; 130 | cudaSafeCall(cudaDriverGetVersion(&driverVersion)); 131 | cudaSafeCall(cudaRuntimeGetVersion(&runtimeVersion)); 132 | 133 | const char *computeMode[] = { 134 | "Default (multiple host threads can use ::cudaSetDevice() with device " 135 | "simultaneously)", 136 | "Exclusive (only one host thread in one process is able to use " 137 | "::cudaSetDevice() with this device)", 138 | "Prohibited (no host thread can use ::cudaSetDevice() with this device)", 139 | "Exclusive Process (many threads in one process is able to use " 140 | "::cudaSetDevice() with this device)", 141 | "Unknown", 142 | NULL}; 143 | 144 | for (int dev = beg; dev < end; ++dev) { 145 | cudaDeviceProp prop; 146 | cudaSafeCall(cudaGetDeviceProperties(&prop, dev)); 147 | 148 | printf("\nDevice %d: \"%s\"\n", dev, prop.name); 149 | printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", 150 | driverVersion / 1000, driverVersion % 100, runtimeVersion / 1000, 151 | runtimeVersion % 100); 152 | printf(" CUDA Capability Major/Minor version number: %d.%d\n", 153 | prop.major, prop.minor); 154 | printf(" Total amount of global memory: %.0f MBytes (%llu " 155 | "bytes)\n", 156 | (float)prop.totalGlobalMem / 1048576.0f, 157 | (unsigned long long)prop.totalGlobalMem); 158 | printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", 159 | prop.multiProcessorCount, convertSMVer2Cores(prop.major, prop.minor), 160 | convertSMVer2Cores(prop.major, prop.minor) * 161 | prop.multiProcessorCount); 162 | printf(" GPU Clock Speed: %.2f GHz\n", 163 | prop.clockRate * 1e-6f); 164 | 165 | #if (CUDART_VERSION >= 4000) 166 | // This is not available in the CUDA Runtime API, so we make the necessary 167 | // calls the driver API to support this for output 168 | int memoryClock, 
memBusWidth, L2CacheSize; 169 | getCudaAttribute(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, 170 | dev); 171 | getCudaAttribute(&memBusWidth, 172 | CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev); 173 | getCudaAttribute(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev); 174 | 175 | printf(" Memory Clock rate: %.2f Mhz\n", 176 | memoryClock * 1e-3f); 177 | printf(" Memory Bus Width: %d-bit\n", 178 | memBusWidth); 179 | if (L2CacheSize) 180 | printf(" L2 Cache Size: %d bytes\n", 181 | L2CacheSize); 182 | 183 | printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), " 184 | "2D=(%d,%d), 3D=(%d,%d,%d)\n", 185 | prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], 186 | prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); 187 | printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, " 188 | "2D=(%d,%d) x %d\n", 189 | prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], 190 | prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], 191 | prop.maxTexture2DLayered[2]); 192 | #endif 193 | printf(" Total amount of constant memory: %u bytes\n", 194 | (int)prop.totalConstMem); 195 | printf(" Total amount of shared memory per block: %u bytes\n", 196 | (int)prop.sharedMemPerBlock); 197 | printf(" Total number of registers available per block: %d\n", 198 | prop.regsPerBlock); 199 | printf(" Warp size: %d\n", 200 | prop.warpSize); 201 | printf(" Maximum number of threads per block: %d\n", 202 | prop.maxThreadsPerBlock); 203 | printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", 204 | prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); 205 | printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", 206 | prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); 207 | printf(" Maximum memory pitch: %u bytes\n", 208 | (int)prop.memPitch); 209 | printf(" Texture alignment: %u bytes\n", 210 | (int)prop.textureAlignment); 211 | 212 | #if CUDART_VERSION >= 4000 213 | printf(" Concurrent 
copy and execution: %s with %d copy " 214 | "engine(s)\n", 215 | (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); 216 | #else 217 | printf(" Concurrent copy and execution: %s\n", 218 | prop.deviceOverlap ? "Yes" : "No"); 219 | #endif 220 | printf(" Run time limit on kernels: %s\n", 221 | prop.kernelExecTimeoutEnabled ? "Yes" : "No"); 222 | printf(" Integrated GPU sharing Host Memory: %s\n", 223 | prop.integrated ? "Yes" : "No"); 224 | printf(" Support host page-locked memory mapping: %s\n", 225 | prop.canMapHostMemory ? "Yes" : "No"); 226 | 227 | printf(" Concurrent kernel execution: %s\n", 228 | prop.concurrentKernels ? "Yes" : "No"); 229 | printf(" Alignment requirement for Surfaces: %s\n", 230 | prop.surfaceAlignment ? "Yes" : "No"); 231 | printf(" Device has ECC support enabled: %s\n", 232 | prop.ECCEnabled ? "Yes" : "No"); 233 | printf(" Device is using TCC driver mode: %s\n", 234 | prop.tccDriver ? "Yes" : "No"); 235 | #if CUDART_VERSION >= 4000 236 | printf(" Device supports Unified Addressing (UVA): %s\n", 237 | prop.unifiedAddressing ? "Yes" : "No"); 238 | printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", 239 | prop.pciBusID, prop.pciDeviceID); 240 | #endif 241 | printf(" Compute Mode:\n"); 242 | printf(" %s \n", computeMode[prop.computeMode]); 243 | } 244 | 245 | printf("\n"); 246 | printf("deviceQuery, CUDA Driver = CUDART"); 247 | printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, 248 | driverVersion % 100); 249 | printf(", CUDA Runtime Version = %d.%d", runtimeVersion / 1000, 250 | runtimeVersion % 100); 251 | printf(", NumDevs = %d\n\n", count); 252 | fflush(stdout); 253 | } 254 | 255 | void printShortCudaDeviceInfo(int device) { 256 | int count = getCudaEnabledDeviceCount(); 257 | bool valid = (device >= 0) && (device < count); 258 | 259 | int beg = valid ? device : 0; 260 | int end = valid ? 
device + 1 : count; 261 | 262 | int driverVersion = 0, runtimeVersion = 0; 263 | cudaSafeCall(cudaDriverGetVersion(&driverVersion)); 264 | cudaSafeCall(cudaRuntimeGetVersion(&runtimeVersion)); 265 | 266 | for (int dev = beg; dev < end; ++dev) { 267 | cudaDeviceProp prop; 268 | cudaSafeCall(cudaGetDeviceProperties(&prop, dev)); 269 | 270 | const char *arch_str = prop.major < 2 ? " (not Fermi)" : ""; 271 | printf("Device %d: \"%s\" %.0fMb", dev, prop.name, 272 | (float)prop.totalGlobalMem / 1048576.0f); 273 | printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, 274 | convertSMVer2Cores(prop.major, prop.minor) * 275 | prop.multiProcessorCount); 276 | printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion / 1000, 277 | driverVersion % 100, runtimeVersion / 1000, runtimeVersion % 100); 278 | } 279 | fflush(stdout); 280 | } 281 | --------------------------------------------------------------------------------