├── src
    ├── README.md
    ├── wrappers.h
    ├── cmake
    │   └── FindR.cmake
    ├── tricks.cuh
    ├── CMakeLists.txt
    ├── fp_abstraction.h
    ├── setup.py
    ├── transpose.cu
    ├── test.R
    ├── kmcuda.h
    ├── metric_abstraction.h
    ├── r.cc
    ├── private.h
    ├── knn.cu
    ├── python.cc
    └── kmcuda.cc
├── .github
    └── dco.yml
├── MAINTAINERS
├── img
    ├── sourced.png
    ├── cls_angular.png
    ├── cls_euclidean.png
    └── latex_angular.png
├── doc
    └── Doxyfile
├── .gitignore
├── .travis.yml
├── DCO
├── CONTRIBUTING.md
├── CODE_OF_CONDUCT.md
├── LICENSE.md
└── README.md


/src/README.md:
--------------------------------------------------------------------------------
1 | ../README.md


--------------------------------------------------------------------------------
/.github/dco.yml:
--------------------------------------------------------------------------------
1 | require:
2 |   members: false


--------------------------------------------------------------------------------
/MAINTAINERS:
--------------------------------------------------------------------------------
1 | Vadim Markovtsev <vadim@sourced.tech> (@vmarkovtsev)


--------------------------------------------------------------------------------
/img/sourced.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/kmcuda/HEAD/img/sourced.png


--------------------------------------------------------------------------------
/img/cls_angular.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/kmcuda/HEAD/img/cls_angular.png


--------------------------------------------------------------------------------
/img/cls_euclidean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/kmcuda/HEAD/img/cls_euclidean.png


--------------------------------------------------------------------------------
/img/latex_angular.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/src-d/kmcuda/HEAD/img/latex_angular.png


--------------------------------------------------------------------------------
/doc/Doxyfile:
--------------------------------------------------------------------------------
 1 | INPUT = ../src ../README.md
 2 | FILE_PATTERNS = *.h *.cc *.cu *.cuh *.md
 3 | EXTENSION_MAPPING = cu=C++
 4 | EXTRACT_ALL = YES
 5 | EXTRACT_ANON_NSPACES = YES
 6 | EXCLUDE_PATTERNS = *.py *.R
 7 | DOXYFILE_ENCODING = UTF-8
 8 | PROJECT_NAME = KMeansCUDA
 9 | OUTPUT_LANGUAGE = English
10 | GENERATE_XML = NO
11 | GENERATE_LATEX = NO
12 | GENERATE_HTML = YES
13 | HTML_OUTPUT = doxyhtml/
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea
 2 | cmake-build-*
 3 | **/*.cbp
 4 | **/CMakeCache.txt
 5 | **/CMakeFiles
 6 | **/.DS_Store
 7 | **/Makefile
 8 | **/cmake_install.cmake
 9 | **/dist
10 | **/libKMCUDA.egg-info
11 | # Compiled Object files
12 | *.slo
13 | *.lo
14 | *.o
15 | *.obj
16 | 
17 | # Precompiled Headers
18 | *.gch
19 | *.pch
20 | 
21 | # Compiled Dynamic libraries
22 | *.so
23 | *.dylib
24 | *.dll
25 | 
26 | # Fortran module files
27 | *.mod
28 | *.smod
29 | 
30 | # Compiled Static libraries
31 | *.lai
32 | *.la
33 | *.a
34 | *.lib
35 | 
36 | # Executables
37 | *.exe
38 | *.out
39 | *.app
40 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: cpp
 2 | os:
 3 |   - linux
 4 |   # - osx TODO(vmarkovtsev): osx used to be supported once, but then CUDA updated, Homebrew updated, and I gave up fixing the CI in #63 
 5 | dist: xenial
 6 | sudo: required
 7 | 
 8 | before_install:
 9 |   - rm -rf /opt/python
10 |   - wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/cuda-repo-ubuntu1604_8.0.44-1_amd64.deb
11 |   - sudo dpkg -i cuda-repo-ubuntu1604_8.0.44-1_amd64.deb
12 |   - sudo apt-get update
13 |   - sudo apt-get install -y --no-install-suggests --no-install-recommends python3-dev python3-numpy r-base-core cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
14 | 
15 | before_script:
16 |   - mkdir src/build
17 |   - cd src/build
18 |   - cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0 ..
19 | 
20 | script:
21 |   - make -j2 VERBOSE=1
22 | 
23 | notifications:
24 |   email: false
25 | 


--------------------------------------------------------------------------------
/src/wrappers.h:
--------------------------------------------------------------------------------
 1 | #ifndef KMCUDA_WRAPPERS_H
 2 | #define KMCUDA_WRAPPERS_H
 3 | 
 4 | #include <cuda_runtime_api.h>
 5 | #include <functional>
 6 | #include <memory>
 7 | #include <vector>
 8 | 
 9 | /// Base of unique_devptr: std::unique_ptr with a type-erased deleter.
10 | template <typename T>
11 | using unique_devptr_parent = std::unique_ptr<T, std::function<void(T*)>>;
12 | /// Owning handle for a CUDA device array; the rest of the API is std::unique_ptr's.
13 | /// Non-null pointers are released with cudaFree() unless `fake` is set (no-op).
14 | /// @param T The type of the array element.
15 | template <typename T>
16 | class unique_devptr : public unique_devptr_parent<T> {
17 |   using deleter_fn = void (*)(T*);  // common type of the two lambdas below
18 |  public:
19 |   explicit unique_devptr(T *ptr, bool fake = false) : unique_devptr_parent<T>(
20 |       ptr, fake ? deleter_fn([](T*) {}) : deleter_fn([](T *p) { cudaFree(p); })) {}
21 | };
22 | 
23 | /// A std::vector of unique_devptr<T>. Used to pass device arrays inside .cu
24 | /// wrapping functions.
25 | /// @tparam T The type of the array element.
26 | template <class T>
27 | using udevptrs = std::vector<unique_devptr<T>>;
28 | 
29 | #endif //KMCUDA_WRAPPERS_H
30 | 


--------------------------------------------------------------------------------
/DCO:
--------------------------------------------------------------------------------
 1 | Developer Certificate of Origin
 2 | Version 1.1
 3 | 
 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
 5 | 1 Letterman Drive
 6 | Suite D4700
 7 | San Francisco, CA, 94129
 8 | 
 9 | Everyone is permitted to copy and distribute verbatim copies of this
10 | license document, but changing it is not allowed.
11 | 
12 | 
13 | Developer's Certificate of Origin 1.1
14 | 
15 | By making a contribution to this project, I certify that:
16 | 
17 | (a) The contribution was created in whole or in part by me and I
18 |     have the right to submit it under the open source license
19 |     indicated in the file; or
20 | 
21 | (b) The contribution is based upon previous work that, to the best
22 |     of my knowledge, is covered under an appropriate open source
23 |     license and I have the right under that license to submit that
24 |     work with modifications, whether created in whole or in part
25 |     by me, under the same open source license (unless I am
26 |     permitted to submit under a different license), as indicated
27 |     in the file; or
28 | 
29 | (c) The contribution was provided directly to me by some other
30 |     person who certified (a), (b) or (c) and I have not modified
31 |     it.
32 | 
33 | (d) I understand and agree that this project and the contribution
34 |     are public and that a record of the contribution (including all
35 |     personal information I submit with it, including my sign-off) is
36 |     maintained indefinitely and may be redistributed consistent with
37 |     this project or the open source license(s) involved.


--------------------------------------------------------------------------------
/src/cmake/FindR.cmake:
--------------------------------------------------------------------------------
 1 | # CMake module to find R
 2 | # - Try to find R
 3 | # Once done, this will define
 4 | #
 5 | #  R_FOUND - system has R
 6 | #  R_INCLUDE_DIRS - the R include directories
 7 | #  R_LIBRARIES - link these to use R
 8 | #  R_ROOT_DIR - As reported by R
 9 | # Author: Omar Andres Zapata Mesa 31/05/2013
10 | if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
11 |   set(CMAKE_FIND_APPBUNDLE "LAST")
12 | endif()
13 | find_program(R_EXECUTABLE NAMES R R.exe)
14 | #---searching for the R installation using the R executable
15 | if(R_EXECUTABLE)
16 |   execute_process(COMMAND ${R_EXECUTABLE} RHOME
17 |                   OUTPUT_VARIABLE R_ROOT_DIR
18 |                   OUTPUT_STRIP_TRAILING_WHITESPACE)
19 |   find_path(R_INCLUDE_DIR R.h
20 |             HINTS ${R_ROOT_DIR}
21 |             PATHS /usr/local/lib /usr/local/lib64 /usr/share
22 |             PATH_SUFFIXES include R/include
23 |             DOC "Path to file R.h")
24 |   find_library(R_LIBRARY R
25 |             HINTS ${R_ROOT_DIR}/lib
26 |             DOC "R library (example libR.a, libR.dylib, etc.).")
27 | endif()
28 | #---setting include dirs and libraries
29 | set(R_LIBRARIES ${R_LIBRARY})
30 | set(R_INCLUDE_DIRS ${R_INCLUDE_DIR})
31 | foreach(_cpt ${R_FIND_COMPONENTS})
32 |   execute_process(COMMAND echo "cat(find.package('${_cpt}'))"
33 |                   COMMAND ${R_EXECUTABLE} --vanilla --slave
34 |                   OUTPUT_VARIABLE _cpt_path
35 |                   OUTPUT_STRIP_TRAILING_WHITESPACE)
36 |   find_library(R_${_cpt}_LIBRARY
37 |                lib${_cpt}.so lib${_cpt}.dylib
38 |                HINTS ${_cpt_path}/lib)
39 |   if(R_${_cpt}_LIBRARY)
40 |     mark_as_advanced(R_${_cpt}_LIBRARY)
41 |     list(APPEND R_LIBRARIES ${R_${_cpt}_LIBRARY})
42 |   endif()
43 |   find_path(R_${_cpt}_INCLUDE_DIR ${_cpt}.h HINTS  ${_cpt_path} PATH_SUFFIXES include R/include)
44 |   if(R_${_cpt}_INCLUDE_DIR)
45 |     mark_as_advanced(R_${_cpt}_INCLUDE_DIR)
46 |     list(APPEND R_INCLUDE_DIRS ${R_${_cpt}_INCLUDE_DIR})
47 |   endif()
48 |   if(R_${_cpt}_INCLUDE_DIR AND R_${_cpt}_LIBRARY)
49 |     list(REMOVE_ITEM R_FIND_COMPONENTS ${_cpt})
50 |   endif()
51 | endforeach()
52 | # Handle the QUIETLY and REQUIRED arguments and set R_FOUND to TRUE if all listed variables are TRUE
53 | include(FindPackageHandleStandardArgs)
54 | find_package_handle_standard_args(R DEFAULT_MSG R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)
55 | mark_as_advanced(R_FOUND R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)
56 | 


--------------------------------------------------------------------------------
/src/tricks.cuh:
--------------------------------------------------------------------------------
 1 | #include <cstdint>
 2 | 
 3 | #define warpSize 32
 4 | 
 5 | /// Division rounded up: size / each, plus one when there is a remainder.
 6 | /// Meant for non-negative integers; `each` must not be zero.
 7 | /// @param T Any integer type. Calling dupper() on floating point types is useless.
 8 | template <typename T>
 9 | __device__ __forceinline__ T dupper(T size, T each) {
10 |   const T quotient = size / each;
11 |   // The division is exact if multiplying back restores the dividend.
12 |   const bool exact = quotient * each == size;
13 |   // Otherwise the truncated quotient is one short of the rounded-up value.
14 |   return exact? quotient : quotient + 1;
15 | }
16 | 
17 | /*template <typename T>
18 | __device__ __forceinline__ T dmin(T a, T b) {
19 |   return a <= b? a : b;
20 | }*/
21 | 
22 | /// Warp-aggregated increment: a single atomicAdd() is issued per warp.
23 | /// Each caller receives a distinct slot, like atomicAdd(ctr, 1) would return.
24 | /// https://devblogs.nvidia.com/parallelforall/cuda-pro-tip-optimized-filtering-warp-aggregated-atomics/
25 | __device__ __forceinline__ uint32_t atomicAggInc(uint32_t *ctr) {
26 |   const uint32_t lane = threadIdx.x % warpSize;
27 |   const uint32_t mask = ballot(1);  // presumably the lanes calling us - ballot() is defined elsewhere
28 |   const uint32_t leader = __ffs(mask) - 1;  // mask is never 0: our own bit is set
29 |   uint32_t res = 0;  // only the leader's value is picked up by shfl()
30 |   if (lane == leader) res = atomicAdd(ctr, __popc(mask));
31 |   // Unsigned math: (1 << 31) - 1 overflows a signed int on lane 31.
32 |   return shfl(res, leader) + __popc(mask & ((1u << lane) - 1u));
33 | }
34 | 
35 | /// Sums `val` over the warp by shuffling down at distances 16, 8, 4, 2, 1.
36 | /// NOTE(review): presumably only lane 0 holds the full sum, as in the article - confirm.
37 | /// https://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/
38 | template <typename T>
39 | __device__ __forceinline__ T warpReduceSum(T val) {
40 |   #pragma unroll
41 |   for (int delta = warpSize >> 1; delta >= 1; delta >>= 1)
42 |     val += shfl_down(val, delta);
43 |   return val;
44 | }
45 | 
46 | /// Minimum of `val` over the warp, same shuffle-down scheme as warpReduceSum().
47 | /// NOTE(review): presumably only lane 0 holds the warp-wide minimum - confirm.
48 | template <typename T>
49 | __device__ __forceinline__ T warpReduceMin(T val) {
50 |   #pragma unroll
51 |   for (int delta = warpSize >> 1; delta >= 1; delta >>= 1)
52 |     val = min(val, shfl_down(val, delta));
53 |   return val;
54 | }
55 | 
56 | /// atomicMin() overload for floats, implemented as a compare-and-swap loop.
57 | /// Stores `value` into *address unless the stored number is already <= value.
58 | /// https://github.com/parallel-forall/code-samples/blob/master/posts/cuda-aware-mpi-example/src/Device.cu#L53
59 | __device__ __forceinline__ void atomicMin(
60 |     float *const address, const float value) {
61 |   // Plain read first: skip the atomics when there is nothing to update.
62 |   if (*address <= value) return;
63 |   int32_t *const bits = reinterpret_cast<int32_t*>(address);
64 |   const int32_t desired = __float_as_int(value);
65 |   int32_t seen = *bits;
66 |   // Retry until the stored number is small enough or our swap goes through.
67 |   for (;;) {
68 |     const int32_t expected = seen;
69 |     if (__int_as_float(expected) <= value) return;
70 |     // atomicCAS() returns the previous word; equality means the swap happened.
71 |     seen = atomicCAS(bits, expected, desired);
72 |     if (seen == expected) return;
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | KMCUDA project is [Apache licensed](LICENSE.md) and accepts
 4 | contributions via GitHub pull requests.  This document outlines some of the
 5 | conventions on development workflow, commit message formatting, contact points,
 6 | and other resources to make it easier to get your contribution accepted.
 7 | 
 8 | ## Certificate of Origin
 9 | 
10 | By contributing to this project you agree to the [Developer Certificate of
11 | Origin (DCO)](DCO). This document was created by the Linux Kernel community and is a
12 | simple statement that you, as a contributor, have the legal right to make the
13 | contribution.
14 | 
15 | To show your agreement with the DCO, include the following line at the end of the commit message,
16 | using your real name: `Signed-off-by: John Doe <john.doe@example.com>`.
17 | 
18 | This can be done easily using the [`-s`](https://github.com/git/git/blob/b2c150d3aa82f6583b9aadfecc5f8fa1c74aca09/Documentation/git-commit.txt#L154-L161) flag on the `git commit`.
19 | 
20 | 
21 | ## Support Channels
22 | 
23 | The official support channels, for both users and contributors, are:
24 | 
25 | - GitHub [issues](https://github.com/src-d/kmcuda/issues)*
26 | - Slack: #machine-learning room in the [source{d} Slack](https://join.slack.com/t/sourced-community/shared_invite/enQtMjc4Njk5MzEyNzM2LTFjNzY4NjEwZGEwMzRiNTM4MzRlMzQ4MmIzZjkwZmZlM2NjODUxZmJjNDI1OTcxNDAyMmZlNmFjODZlNTg0YWM)
27 | 
28 | *Before opening a new issue or submitting a new pull request, it's helpful to
29 | search the project - it's likely that another user has already reported the
30 | issue you're facing, or it's a known issue that we're already aware of.
31 | 
32 | 
33 | ## How to Contribute
34 | 
35 | Pull Requests (PRs) are the main and exclusive way to contribute to the official KMCUDA project.
36 | In order for a PR to be accepted it needs to pass a list of requirements:
37 | 
38 | - All the tests pass.
39 | - Python code is formatted according to [![PEP8](https://img.shields.io/badge/code%20style-pep8-orange.svg)](https://www.python.org/dev/peps/pep-0008/).
40 | - If the PR is a bug fix, it has to include a new unit test that fails before the patch is merged.
41 | - If the PR is a new feature, it has to come with a suite of unit tests, that tests the new functionality.
42 | - In any case, all the PRs have to pass the personal evaluation of at least one of the [maintainers](MAINTAINERS).
43 | 
44 | 
45 | ### Format of the commit message
46 | 
47 | The commit summary must start with a capital letter and a verb in the present tense. No period at the end.
48 | 
49 | ```
50 | Add a feature
51 | Remove unused code
52 | Fix a bug
53 | ```
54 | 
55 | The commit details should describe what was changed, in which context and, if applicable, the GitHub issue it relates to.
56 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, gender identity and expression, level of experience,
 9 | education, socio-economic status, nationality, personal appearance, race,
10 | religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at conduct@sourced.tech. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.2)
 2 | project(KMCUDA)
 3 | set(CMAKE_MODULE_PATH ${CMAKE_HOME_DIRECTORY}/cmake)
 4 | find_package(OpenMP REQUIRED)
 5 | if (APPLE AND NOT CUDA_HOST_COMPILER)
 6 |   # https://gitlab.kitware.com/cmake/cmake/issues/13674
 7 |   message(WARNING "CUDA_HOST_COMPILER is set to the default /usr/bin/clang; CUDA 8.0 requires 7.3 or older.")
 8 |   set(CUDA_HOST_COMPILER "/usr/bin/clang" CACHE FILEPATH "" FORCE)
 9 | endif()
10 | find_package(CUDA QUIET REQUIRED)
11 | if (NOT DISABLE_PYTHON)
12 |   if (APPLE)
13 |     # workaround
14 |     # https://github.com/Homebrew/legacy-homebrew/issues/25118
15 |     # https://cmake.org/Bug/view.php?id=14809
16 |     find_program(PYTHON_CONFIG_EXECUTABLE python3-config)
17 |     message("-- Found python3-config: ${PYTHON_CONFIG_EXECUTABLE}")
18 |     execute_process(COMMAND ${PYTHON_CONFIG_EXECUTABLE} --prefix OUTPUT_VARIABLE PYTHON_PREFIX OUTPUT_STRIP_TRAILING_WHITESPACE)
19 |     message("-- Discovered Python 3.x prefix: ${PYTHON_PREFIX}")
20 |     set(PYTHON_EXECUTABLE "${PYTHON_PREFIX}/bin/python3")
21 |   endif()
22 |   find_package(PythonInterp 3 REQUIRED)
23 |   find_package(PythonLibs 3 REQUIRED)
24 |   if (NOT NUMPY_INCLUDES)  # print() appends a newline which must not end up in the include path
25 |     execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.get_include())" OUTPUT_VARIABLE NUMPY_INCLUDES OUTPUT_STRIP_TRAILING_WHITESPACE)
26 |   endif()
27 | endif()
28 | if (NOT DISABLE_R)
29 |   find_package(R)
30 | endif()
31 | if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug")
32 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPROFILE")
33 | endif()
34 | #set(CMAKE_VERBOSE_MAKEFILE on)
35 | if (NOT DEFINED CUDA_ARCH)
36 |   set(CUDA_ARCH "61")
37 | endif()
38 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
39 | set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh
40 |                  metric_abstraction.h kmeans.cu knn.cu transpose.cu)
41 | if (PYTHONLIBS_FOUND)
42 |   list(APPEND SOURCE_FILES python.cc)
43 | endif()
44 | if (R_FOUND)
45 |   list(APPEND SOURCE_FILES r.cc)
46 | endif()
47 | if (CMAKE_BUILD_TYPE STREQUAL "Debug")
48 |   set(NVCC_FLAGS "-G -g")
49 | endif()
50 | set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -arch sm_${CUDA_ARCH} -Xptxas=-v -D_MWAITXINTRIN_H_INCLUDED -D_FORCE_INLINES")
51 | if (CMAKE_VERSION VERSION_LESS 3.3)  # whole-version comparison instead of separate major/minor checks
52 |   # workaround https://github.com/Kitware/CMake/commit/99abebdea01b9ef73e091db5594553f7b1694a1b
53 |   message(STATUS "Applied CUDA C++11 workaround on CMake < 3.3")
54 |   set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} --std c++11")
55 | endif()
56 | if (APPLE)
57 |   message(STATUS "Applied CUDA OpenMP macOS workaround")
58 |   set(CUDA_PROPAGATE_HOST_FLAGS OFF)
59 |   set(CMAKE_SHARED_LIBRARY_CXX_FLAGS_BACKUP "${CMAKE_SHARED_LIBRARY_CXX_FLAGS}")
60 |   set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS} ${CMAKE_CXX_FLAGS} -Wno-unused-function")
61 |   string(REGEX REPLACE "-fopenmp[^ ]*" "" CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS}")
62 | endif()
63 | cuda_add_library(KMCUDA SHARED ${SOURCE_FILES} OPTIONS ${NVCC_FLAGS})
64 | if (APPLE)
65 |   set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS_BACKUP}")
66 | endif()
67 | target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY})
68 | if (PYTHONLIBS_FOUND)
69 |   include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES})
70 |   target_link_libraries(KMCUDA ${PYTHON_LIBRARIES})
71 | endif()
72 | if (R_FOUND)
73 |   include_directories(${R_INCLUDE_DIRS})
74 |   target_link_libraries(KMCUDA ${R_LIBRARIES})
75 | endif()
76 | if (SUFFIX)
77 |   set_target_properties(KMCUDA PROPERTIES SUFFIX ${SUFFIX})
78 | endif()
79 | 


--------------------------------------------------------------------------------
/src/fp_abstraction.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // half, half2 and float functions.
  3 | //
  4 | 
  5 | #ifndef KMCUDA_FP_ABSTRACTION_H
  6 | #define KMCUDA_FP_ABSTRACTION_H
  7 | 
  8 | #include <cfloat>
  9 | 
 10 | #if CUDA_ARCH >= 60
 11 | #undef volatile
 12 | #define volatile volatile
 13 | #include <cuda_fp16.h>
 14 | #undef volatile
 15 | #define volatile
 16 | #endif
 17 | 
 18 | #define FPATTR __device__ __forceinline__
 19 | 
 20 | template <typename F>
 21 | struct HALF;  // maps the storage type F to its scalar type, HALF<F>::type
 22 | /// The scalar type of float is float itself.
 23 | template <>
 24 | struct HALF<float> {
 25 |     using type = float;
 26 | };
 27 | /// The largest finite value of the type HF.
 28 | template <typename HF>
 29 | FPATTR HF _fmax();
 30 | 
 31 | template <>
 32 | FPATTR float _fmax() {
 33 |   return FLT_MAX;
 34 | }
 35 | /// Converts an integer constant to the type F.
 36 | template <typename F>
 37 | FPATTR F _const(int);
 38 | 
 39 | template <>
 40 | FPATTR float _const(int v) {
 41 |   return v;
 42 | }
 43 | /// Identity for float; the half2 overload adds the two packed halves together.
 44 | FPATTR float _fin(float v) {
 45 |   return v;
 46 | }
 47 | /// Identity for float; the half overload duplicates the value into a half2.
 48 | FPATTR float _fout(float v) {
 49 |   return v;
 50 | }
 51 | /// Conversion to float, the identity for a float argument.
 52 | FPATTR float _float(float v) {
 53 |   return v;
 54 | }
 55 | /// Converts a float to HALF<F>::type.
 56 | template <class F>
 57 | FPATTR typename HALF<F>::type _half(float v);
 58 | 
 59 | template <>
 60 | FPATTR float _half<float>(float v) {
 61 |   return v;
 62 | }
 63 | /// Exact comparison, v1 == v2.
 64 | FPATTR bool _eq(float v1, float v2) {
 65 |   return v1 == v2;
 66 | }
 67 | /// Exact comparison, v1 != v2.
 68 | FPATTR bool _neq(float v1, float v2) {
 69 |   return v1 != v2;
 70 | }
 71 | /// v1 + v2.
 72 | FPATTR float _add(float v1, float v2) {
 73 |   return v1 + v2;
 74 | }
 75 | /// v1 - v2.
 76 | FPATTR float _sub(float v1, float v2) {
 77 |   return v1 - v2;
 78 | }
 79 | /// v1 * v2.
 80 | FPATTR float _mul(float v1, float v2) {
 81 |   return v1 * v2;
 82 | }
 83 | /// 1 / v through the __frcp_rn() intrinsic (round-to-nearest-even).
 84 | FPATTR float _reciprocal(float v) {
 85 |   return __frcp_rn(v);
 86 | }
 87 | /// Fused v1 * v2 + acc in round-down mode (__fmaf_rd); the accumulator goes first.
 88 | FPATTR float _fma(float acc, float v1, float v2) {
 89 |   return __fmaf_rd(v1, v2, acc);
 90 | }
 91 | /// Strict comparison, v1 < v2.
 92 | FPATTR bool _lt(float v1, float v2) {
 93 |   return v1 < v2;
 94 | }
 95 | /// Square root through the __fsqrt_rn() intrinsic (round-to-nearest-even).
 96 | FPATTR float _sqrt(float v) {
 97 |   return __fsqrt_rn(v);
 98 | }
 99 | 
100 | #if CUDA_ARCH >= 60  // half and half2 counterparts of the float functions above
101 | template <>
102 | struct HALF<half2> {
103 |   using type = half;
104 | };
105 | /// 65504 is the largest finite half precision number.
106 | template <>
107 | FPATTR half _fmax() {
108 |   return __int2half_rd(65504);
109 | }
110 | /// Integer constant converted to half and copied into both halves of a half2.
111 | template <>
112 | FPATTR half2 _const(int v) {
113 |   return __half2half2(__int2half_rd(v));
114 | }
115 | /// Integer constant converted to half, rounding down.
116 | template <>
117 | FPATTR half _const(int v) {
118 |   return __int2half_rd(v);
119 | }
120 | /// Collapses a half2 into a single half: the sum of the high and low halves.
121 | FPATTR half _fin(half2 v) {
122 |   return __hadd(__high2half(v), __low2half(v));
123 | }
124 | /// Expands a half into a half2 with both halves set to v.
125 | FPATTR half2 _fout(half v) {
126 |   return __half2half2(v);
127 | }
128 | /// Converts a half to float.
129 | FPATTR float _float(half v) {
130 |   return __half2float(v);
131 | }
132 | /// Converts a float to half, the scalar type of half2.
133 | template <>
134 | FPATTR half _half<half2>(float v) {
135 |   return __float2half(v);
136 | }
137 | /// True only if both halves of v1 equal the corresponding halves of v2.
138 | FPATTR bool _eq(half2 v1, half2 v2) {
139 |   return __hbeq2(v1, v2);
140 | }
141 | /// Negation of _eq(): true if at least one pair of halves does not compare equal.
142 | FPATTR bool _neq(half2 v1, half2 v2) {
143 |   return !__hbeq2(v1, v2);
144 | }
145 | /// Element-wise v1 + v2.
146 | FPATTR half2 _add(half2 v1, half2 v2) {
147 |   return __hadd2(v1, v2);
148 | }
149 | /// Element-wise v1 - v2.
150 | FPATTR half2 _sub(half2 v1, half2 v2) {
151 |   return __hsub2(v1, v2);
152 | }
153 | /// Element-wise v1 * v2.
154 | FPATTR half2 _mul(half2 v1, half2 v2) {
155 |   return __hmul2(v1, v2);
156 | }
157 | /// 1 / v for a scalar half.
158 | FPATTR half _reciprocal(half v) {
159 |   return hrcp(v);
160 | }
161 | /// Element-wise 1 / v.
162 | FPATTR half2 _reciprocal(half2 v) {
163 |   return h2rcp(v);
164 | }
165 | /// Element-wise fused v1 * v2 + acc; the accumulator goes first.
166 | FPATTR half2 _fma(half2 acc, half2 v1, half2 v2) {
167 |   return __hfma2(v1, v2, acc);
168 | }
169 | /// Strict comparison, v1 < v2.
170 | FPATTR bool _lt(half v1, half v2) {
171 |   return __hlt(v1, v2);
172 | }
173 | /// Square root of a scalar half.
174 | FPATTR half _sqrt(half v) {
175 |   return hsqrt(v);
176 | }
177 | /// Element-wise square root.
178 | FPATTR half2 _sqrt(half2 v) {
179 |   return h2sqrt(v);
180 | }
181 | 
182 | #endif // CUDA_ARCH >= 60
183 | 
184 | #endif // KMCUDA_FP_ABSTRACTION_H
185 | 


--------------------------------------------------------------------------------
/src/setup.py:
--------------------------------------------------------------------------------
  1 | from multiprocessing import cpu_count
  2 | import os
  3 | from setuptools import setup
  4 | from setuptools.command.build_py import build_py
  5 | from setuptools.command.sdist import sdist
  6 | from setuptools.dist import Distribution
  7 | from shutil import copyfile
  8 | from subprocess import check_call
  9 | from sys import platform
 10 | 
 11 | class SetupConfigurationError(Exception):  # raised by CMakeBuild._build()
 12 |     pass
 13 | 
 14 | 
 15 | class CMakeBuild(build_py):
 16 |     SHLIB = "libKMCUDA.so"
 17 | 
 18 |     def run(self):
 19 |         if not self.dry_run:
 20 |             self._build()
 21 |         super(CMakeBuild, self).run()
 22 | 
 23 |     def get_outputs(self, *args, **kwargs):
 24 |         outputs = super(CMakeBuild, self).get_outputs(*args, **kwargs)
 25 |         outputs.extend(self._shared_lib)
 26 |         return outputs
 27 | 
 28 |     def _build(self, builddir=None):
 29 |         if platform != "darwin":
 30 |             cuda_toolkit_dir = os.getenv("CUDA_TOOLKIT_ROOT_DIR")
 31 |             cuda_arch = os.getenv("CUDA_ARCH", "61")
 32 |             if cuda_toolkit_dir is None:
 33 |                 raise SetupConfigurationError(
 34 |                     "CUDA_TOOLKIT_ROOT_DIR environment variable must be defined")
 35 |             check_call(("cmake", "-DCMAKE_BUILD_TYPE=Release", "-DDISABLE_R=y",
 36 |                         "-DCUDA_TOOLKIT_ROOT_DIR=%s" % cuda_toolkit_dir,
 37 |                         "-DCUDA_ARCH=%s" % cuda_arch,
 38 |                         "."))
 39 |         else:
 40 |             ccbin = os.getenv("CUDA_HOST_COMPILER", "/usr/bin/clang")
 41 |             env = dict(os.environ)
 42 |             env.setdefault("CC", "/usr/local/opt/llvm/bin/clang")
 43 |             env.setdefault("CXX", "/usr/local/opt/llvm/bin/clang++")
 44 |             env.setdefault("LDFLAGS", "-L/usr/local/opt/llvm/lib/")
 45 |             check_call(("cmake", "-DCMAKE_BUILD_TYPE=Release", "-DDISABLE_R=y",
 46 |                         "-DCUDA_HOST_COMPILER=%s" % ccbin, "-DSUFFIX=.so", "."),
 47 |                        env=env)
 48 |         check_call(("make", "-j%d" % cpu_count()))
 49 |         self.mkpath(self.build_lib)
 50 |         dest = os.path.join(self.build_lib, self.SHLIB)
 51 |         copyfile(self.SHLIB, dest)
 52 |         self._shared_lib = [dest]
 53 | 
 54 | 
 55 | class BinaryDistribution(Distribution):
 56 |     """Distribution which always forces a binary package with platform name"""
 57 |     def has_ext_modules(self):  # always claim that there are extension modules
 58 |         return True
 59 | 
 60 |     def is_pure(self):  # never report a pure-Python distribution
 61 |         return False
 62 |         
 63 |         
 64 | class HackedSdist(sdist):
 65 |     def run_command(self, command):
 66 |         super().run_command(command)
 67 |         if command == "egg_info":
 68 |             self.get_finalized_command("egg_info").filelist.extend([
 69 |                 "fp_abstraction.h",
 70 |                 "CMakeLists.txt",
 71 |                 "kmcuda.cc",
 72 |                 "kmcuda.h",
 73 |                 "kmeans.cu",
 74 |                 "knn.cu",
 75 |                 "metric_abstraction.h",
 76 |                 "private.h",
 77 |                 "python.cc",
 78 |                 "test.py",
 79 |                 "transpose.cu",
 80 |                 "tricks.cuh",
 81 |                 "wrappers.h",
 82 |             ])
 83 | 
 84 | 
 85 | setup(  # package metadata and the custom build / sdist hooks
 86 |     name="libKMCUDA",
 87 |     description="Accelerated K-means and K-nn on GPU",
 88 |     version="6.2.3",
 89 |     license="Apache Software License",
 90 |     author="Vadim Markovtsev",
 91 |     author_email="vadim@sourced.tech",
 92 |     url="https://github.com/src-d/kmcuda",
 93 |     download_url="https://github.com/src-d/kmcuda",
 94 |     py_modules=["libKMCUDA"],
 95 |     install_requires=["numpy"],
 96 |     distclass=BinaryDistribution,  # reports ext modules / non-pure to force a binary package
 97 |     cmdclass={'build_py': CMakeBuild, "sdist": HackedSdist},  # build_py runs cmake + make
 98 |     classifiers=[
 99 |         "Development Status :: 5 - Production/Stable",
100 |         "Intended Audience :: Developers",
101 |         "License :: OSI Approved :: Apache Software License",
102 |         "Operating System :: POSIX :: Linux",
103 |         "Topic :: Scientific/Engineering :: Information Analysis",
104 |         "Programming Language :: Python :: 3.4",
105 |         "Programming Language :: Python :: 3.5",
106 |         "Programming Language :: Python :: 3.6",
107 |         "Programming Language :: Python :: 3.7",
108 |     ]
109 | )
110 | 
111 | # python3 setup.py bdist_wheel
112 | # auditwheel repair -w dist dist/*
113 | # twine upload dist/*manylinux*
114 | 


--------------------------------------------------------------------------------
/src/transpose.cu:
--------------------------------------------------------------------------------
  1 | #include "private.h"
  2 | 
  3 | #define TILE_DIM 32
  4 | #define BLOCK_ROWS 8
  5 | 
  6 | __global__ void copy_sample_t(
  7 |     uint32_t index, uint32_t samples_size, uint16_t features_size,
  8 |     const float *__restrict__ samples, float *__restrict__ dest) {
  9 |   const uint32_t feature = blockIdx.x * blockDim.x + threadIdx.x;
 10 |   if (feature < features_size) {  // one thread per feature, surplus threads idle
 11 |     // element `index` of row `feature`; rows are samples_size floats long
 12 |     const uint64_t src = static_cast<uint64_t>(samples_size) * feature + index;
 13 |     dest[feature] = samples[src];
 14 |   }
 15 | }
 15 | 
 16 | template <bool xyswap>  // xyswap exchanges the roles of the x and y block/thread indices
 17 | __global__ void transpose(
 18 |     const float *__restrict__ input, uint32_t rows, uint32_t cols,
 19 |     float *__restrict__ output) {  // input is indexed as rows x cols, output as cols x rows
 20 |   __shared__ float tile[TILE_DIM][TILE_DIM + 1];  // +1 padding column against bank conflicts
 21 |   volatile uint32_t x = xyswap?
 22 |       blockIdx.y * TILE_DIM + threadIdx.y:
 23 |       blockIdx.x * TILE_DIM + threadIdx.x;
 24 |   volatile uint32_t y = xyswap?
 25 |       blockIdx.x * TILE_DIM + threadIdx.x:
 26 |       blockIdx.y * TILE_DIM + threadIdx.y;
 27 |   volatile uint32_t tx = xyswap? threadIdx.y : threadIdx.x;
 28 |   volatile uint32_t ty = xyswap? threadIdx.x : threadIdx.y;
 29 | 
 30 |   if (x < cols && y < rows) {  // load phase: input -> shared tile
 31 |     for (uint32_t j = 0;
 32 |          j < min(static_cast<unsigned int>(TILE_DIM), rows - y);  // keeps y + j < rows
 33 |          j += BLOCK_ROWS) {  // each thread handles every BLOCK_ROWS-th row of the tile
 34 |       tile[ty + j][tx] = input[static_cast<uint64_t>(y + j) * cols + x];
 35 |     }
 36 |   }
 37 | 
 38 |   __syncthreads();  // outside of the branches: every thread of the block reaches it
 39 | 
 40 |   x = xyswap?  // recompute the coordinates, now with the block indices exchanged
 41 |       blockIdx.x * TILE_DIM + threadIdx.y:
 42 |       blockIdx.y * TILE_DIM + threadIdx.x;
 43 |   y = xyswap?
 44 |       blockIdx.y * TILE_DIM + threadIdx.x:
 45 |       blockIdx.x * TILE_DIM + threadIdx.y;
 46 | 
 47 |   if (x < rows && y < cols) {  // store phase: shared tile -> output
 48 |     for (uint32_t j = 0;
 49 |          j < min(static_cast<unsigned int>(TILE_DIM), cols - y);  // keeps y + j < cols
 50 |          j += BLOCK_ROWS) {
 51 |       output[static_cast<uint64_t>(y + j) * rows + x] = tile[tx][ty + j];  // tile read transposed
 52 |     }
 53 |   }
 54 | }
 55 | 
 56 | extern "C" {
 57 | 
 58 | KMCUDAResult cuda_copy_sample_t(  // launches copy_sample_t inside FOR_EACH_DEVI
 59 |     uint32_t index, uint32_t offset, uint32_t samples_size, uint16_t features_size,
 60 |     const std::vector<int> &devs, int verbosity, const udevptrs<float> &samples,
 61 |     udevptrs<float> *dest) {
 62 |   FOR_EACH_DEVI(  // macro defined elsewhere; presumably loops over devs and provides devi
 63 |     dim3 block(min(1024, features_size), 1, 1);  // at most 1024 threads per block
 64 |     dim3 grid(upper(static_cast<unsigned>(features_size), block.x), 1, 1);
 65 |     copy_sample_t<<<grid, block>>>(
 66 |         index, samples_size, features_size, samples[devi].get(),
 67 |         (*dest)[devi].get() + offset);  // the output starts `offset` floats into dest
 68 |   );
 69 |   return kmcudaSuccess;  // NOTE(review): the launches are not checked or synchronized here
 70 | }
 71 | 
 72 | KMCUDAResult cuda_extract_sample_t(  // single launch on the current device, then sync
 73 |     uint32_t index, uint32_t samples_size, uint16_t features_size,
 74 |     int verbosity, const float *samples, float *dest) {
 75 |   dim3 threads(min(1024, features_size), 1, 1);
 76 |   dim3 blocks(upper(static_cast<unsigned>(features_size), threads.x), 1, 1);
 77 |   copy_sample_t<<<blocks, threads>>>(
 78 |       index, samples_size, features_size, samples, dest);
 79 |   CUCH(cudaDeviceSynchronize(), kmcudaRuntimeError);  // wait for the kernel to finish
 80 |   return kmcudaSuccess;
 81 | }
 82 | 
 83 | KMCUDAResult cuda_transpose(  // transposes the samples buffer of every device
 84 |     uint32_t samples_size, uint16_t features_size, bool forward,
 85 |     const std::vector<int> &devs, int verbosity, udevptrs<float> *samples) {
 86 |   INFO("transposing the samples...\n");
 87 |   uint64_t size = static_cast<uint64_t>(samples_size) * features_size * sizeof(float);
 88 |   float *ptr;  // managed staging copy holding the original layout
 89 |   CUCH(cudaMallocManaged(&ptr, size), kmcudaMemoryAllocationFailure);
 90 |   unique_devptr<float> managed(ptr);  // presumably releases ptr on every return path
 91 |   CUCH(cudaSetDevice(devs[0]), kmcudaNoSuchDevice);  // a failed device switch is reported
 92 |   CUCH(cudaMemcpy(ptr, (*samples)[0].get(), size, cudaMemcpyDefault), kmcudaMemoryCopyError);
 93 |   uint32_t cols, rows;
 94 |   if (forward) {  // forward: the input is samples_size x features_size
 95 |     cols = features_size;
 96 |     rows = samples_size;
 97 |   } else {  // backward: the input is features_size x samples_size
 98 |     cols = samples_size;
 99 |     rows = features_size;
100 |   }
101 |   int xdim = upper(cols, static_cast<uint32_t>(TILE_DIM));
102 |   int ydim = upper(rows, static_cast<uint32_t>(TILE_DIM));
103 |   bool xyswap = xdim < ydim;  // the bigger of the two dimensions always goes to grid.x
104 |   dim3 block(xyswap? BLOCK_ROWS : TILE_DIM, xyswap? TILE_DIM : BLOCK_ROWS, 1);
105 |   dim3 grid(max(xdim, ydim), min(xdim, ydim), 1);
106 |   DEBUG("transpose <<<(%d, %d), (%d, %d)>>> %" PRIu32 ", %" PRIu32 "%s\n",
107 |         grid.x, grid.y, block.x, block.y, rows, cols, xyswap? ", xyswap" : "");
108 |   FOR_EACH_DEVI(  // every device reads the staging copy and overwrites its own buffer
109 |     if (xyswap) {
110 |       transpose<true><<<grid, block>>>(ptr, rows, cols, (*samples)[devi].get());
111 |     } else {
112 |       transpose<false><<<grid, block>>>(ptr, rows, cols, (*samples)[devi].get());
113 |     }
114 |   );
115 |   SYNC_ALL_DEVS;  // macro defined elsewhere; presumably waits for the launched kernels
116 |   return kmcudaSuccess;
117 | }
118 | 
119 | }  // extern "C"


--------------------------------------------------------------------------------
/src/test.R:
--------------------------------------------------------------------------------
  1 | library(testthat)
  2 | 
  3 | if (exists("testing")) {
  4 |   setwd(cwd)
  5 |   dyn.load("libKMCUDA.so")
  6 | 
  7 |   context("K-means")
  8 |   test_that("Random",{
  9 |     set.seed(42)
 10 |     samples <- replicate(4, runif(16000))
 11 |     result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
 12 |                        init="random", seed=777, verbosity=2)
 13 |     kmcuda_asses = replicate(1, result$assignments)
 14 |     attach(kmeans(samples, result$centroids, iter.max=1))
 15 |     reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
 16 |     print(sprintf("Reassignments: %f", reasses))
 17 |     expect_lt(reasses, 0.01)
 18 |   })
 19 |   test_that("SingleDeviceKmeans++Lloyd",{
 20 |     set.seed(42)
 21 |     samples <- replicate(4, runif(16000))
 22 |     result = .External("kmeans_cuda", samples, 50, yinyang_t=0, device=1,
 23 |                        init="kmeans++", seed=777, verbosity=2)
 24 |     kmcuda_asses = replicate(1, result$assignments)
 25 |     attach(kmeans(samples, result$centroids, iter.max=1))
 26 |     reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
 27 |     print(sprintf("Reassignments: %f", reasses))
 28 |     expect_lt(reasses, 0.01)
 29 |   })
 30 |   test_that("MultiSamples",{
 31 |     set.seed(42)
 32 |     samples1 <- replicate(4, runif(16000))
 33 |     samples2 <- replicate(4, runif(16000))
 34 |     result = .External("kmeans_cuda", list(samples1, samples2), 50,
 35 |                        init="kmeans++", seed=777, verbosity=2)
 36 |     kmcuda_asses = replicate(1, result$assignments)
 37 |     expect_equal(length(kmcuda_asses), 32000)
 38 |     attach(kmeans(rbind(samples1, samples2), result$centroids, iter.max=1))
 39 |     reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
 40 |     print(sprintf("Reassignments: %f", reasses))
 41 |     expect_lt(reasses, 0.01)
 42 |   })
 43 |   test_that("AFK-MC2",{
 44 |     set.seed(42)
 45 |     samples <- replicate(4, runif(16000))
 46 |     result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
 47 |                        init=c("afkmc2", 100), seed=777, verbosity=2)
 48 |     kmcuda_asses = replicate(1, result$assignments)
 49 |     attach(kmeans(samples, result$centroids, iter.max=1))
 50 |     reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
 51 |     print(sprintf("Reassignments: %f", reasses))
 52 |     expect_lt(reasses, 0.01)
 53 |   })
 54 |   test_that("ImportCentroids",{
 55 |     set.seed(42)
 56 |     samples <- replicate(4, runif(16000))
 57 |     centroids <- replicate(4, runif(50))
 58 |     result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
 59 |                        init=centroids, seed=777, verbosity=2)
 60 |     kmcuda_asses = replicate(1, result$assignments)
 61 |     attach(kmeans(samples, result$centroids, iter.max=1))
 62 |     reasses = length(intersect(kmcuda_asses, cluster)) / length(cluster)
 63 |     print(sprintf("Reassignments: %f", reasses))
 64 |     expect_lt(reasses, 0.01)
 65 |   })
 66 |   test_that("RandomPlusAverageDistance",{
 67 |     set.seed(42)
 68 |     samples <- replicate(4, runif(16000))
 69 |     result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
 70 |                        init="random", seed=777, verbosity=2,
 71 |                        average_distance=TRUE)
 72 |     print(result$average_distance)
 73 |     expect_equal(result$average_distance, 0.2124216, tolerance=0.0000001);
 74 |   })
 75 | 
 76 |   context("K-nn")
 77 |   test_that("Cosine",{  # cosine K-means, then K-nn on top of its clusters
 78 |     set.seed(42)
 79 |     samples <- replicate(4, runif(16000))
 80 |     samples <- samples / sqrt(rowSums(samples^2))  # scale every row to unit L2 norm
 81 |     cls = .External("kmeans_cuda", samples, 50, tolerance=0.01, metric="cos",
 82 |                     seed=777, verbosity=2, yinyang_t=0)
 83 |     lapply(rowSums(cls$centroids^2), function(r) expect_equal(r, 1, 0.0001))
 84 |     result = .External("knn_cuda", 20, samples, cls$centroids,
 85 |                        cls$assignments, metric="cos", verbosity=2)
 86 |     # the result is properly validated in test.py
 87 |     expect_equal(dim(result), c(16000, 20))
 88 |     expect_true(is.matrix(result))  # class() is c("matrix", "array") since R 4.0
 89 |     expect_equal(sum(apply(result, 1, function(r) length(unique(r)))), 16000 * 20)
 90 |   })
 91 | } else {
 92 |   testing <- TRUE
 93 |   cwd <- getwd()
 94 |   thisFile <- function() {  # returns the normalized path of the running script
 95 |     cmdArgs <- commandArgs(trailingOnly=FALSE)
 96 |     needle <- "--file="  # argument prefix which carries the script path, e.g. under Rscript
 97 |     match <- grep(needle, cmdArgs)
 98 |     if (length(match) > 0) {
 99 |       return(normalizePath(sub(needle, "", cmdArgs[match])))  # strip the prefix
100 |     } else {  # no --file= argument: presumably the file was source()d
101 |       return(normalizePath(sys.frames()[[1]]$ofile))
102 |     }
103 |   }
104 |   test_results <- test_dir(dirname(thisFile()), reporter="summary")
105 | }
106 | 


--------------------------------------------------------------------------------
/src/kmcuda.h:
--------------------------------------------------------------------------------
  1 | #ifndef KMCUDA_KMCUDA_H
  2 | #define KMCUDA_KMCUDA_H
  3 | 
  4 | /*! @mainpage KMeansCUDA documentation
  5 |  *
  6 |  * @section s1 Description
  7 |  *
  8 |  * K-means and K-nn on NVIDIA CUDA which are designed for production usage and
  9 |  * simplicity.
 10 |  *
 11 |  * K-means is based on ["Yinyang K-Means: A Drop-In Replacement
 12 |  * of the Classic K-Means with Consistent Speedup"](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf).
 13 |  * While it introduces some overhead and many conditional clauses
 14 |  * which are bad for CUDA, it still shows 1.6-2x speedup against the Lloyd
 15 |  * algorithm. K-nearest neighbors employ the same triangle inequality idea and
 16 |  * require precalculated centroids and cluster assignments, similar to the flattened
 17 |  * ball tree.
 18 |  *
 19 |  * Project: https://github.com/src-d/kmcuda
 20 |  *
 21 |  * README: @ref ignore_this_doxygen_anchor
 22 |  *
 23 |  * @section s2 C/C++ API
 24 |  *
 25 |  * kmcuda.h exports two functions: kmeans_cuda() and knn_cuda(). They are not
 26 |  * thread safe.
 27 |  *
 28 |  * @section s3 Python 3 API
 29 |  *
 30 |  * The shared library exports kmeans_cuda() and knn_cuda() Python wrappers.
 31 |  *
 32 |  * @section s4 R API
 33 |  *
 34 |  * The shared library exports kmeans_cuda() and knn_cuda() R wrappers (.External).
 35 |  *
 36 |  */
 37 | 
 38 | #include <stdint.h>
 39 | 
 40 | /// All possible error codes in public API.
 41 | typedef enum {
 42 |   /// Everything's all right.
 43 |   kmcudaSuccess = 0,
 44 |   /// Arguments which were passed into a function failed the validation.
 45 |   kmcudaInvalidArguments,
 46 |   /// The requested CUDA device does not exist.
 47 |   kmcudaNoSuchDevice,
 48 |   /// Failed to allocate memory on CUDA device. Too big size? Switch off Yinyang?
 49 |   kmcudaMemoryAllocationFailure,
 50 |   /// Something bad and unidentified happened on the CUDA side.
 51 |   kmcudaRuntimeError,
 52 |   /// Failed to copy memory to/from CUDA device.
 53 |   kmcudaMemoryCopyError
 54 | } KMCUDAResult;
 55 | 
 56 | /// Centroid initialization method.
 57 | typedef enum {
 58 |   /// Pick initial centroids randomly.
 59 |   kmcudaInitMethodRandom = 0,
 60 |   /// Use kmeans++ initialization method. Theoretically proven to yield
 61 |   /// better clustering than kmcudaInitMethodRandom. O(n * k) complexity.
 62 |   /// https://en.wikipedia.org/wiki/K-means%2B%2B
 63 |   kmcudaInitMethodPlusPlus,
 64 |   /// AFK-MC2 initialization method. Theoretically proven to yield
 65 |   /// better clustering results than kmcudaInitMethodRandom; matches
 66 |   /// kmcudaInitMethodPlusPlus asymptotically and fast. O(n + k) complexity.
 67 |   /// Use it when kmcudaInitMethodPlusPlus takes too long to finish.
 68 |   /// http://olivierbachem.ch/files/afkmcmc-oral-pdf.pdf
 69 |   kmcudaInitMethodAFKMC2,
 70 |   /// Take user supplied centroids.
 71 |   kmcudaInitMethodImport
 72 | } KMCUDAInitMethod;
 73 | 
 74 | /// Specifies how to calculate the distance between each pair of dots.
 75 | typedef enum {
 76 |   /// Measure the distance between dots using Euclidean distance.
 77 |   kmcudaDistanceMetricL2,
 78 |   /// Measure the distance between dots using the angle between them.
 79 |   /// @note This metric requires all the supplied data to be normalized by L2 to 1.
 80 |   kmcudaDistanceMetricCosine
 81 | } KMCUDADistanceMetric;
 82 | 
 83 | #ifdef __cplusplus
 84 | extern "C" {
 85 | #endif
 86 | 
 87 | /// @brief Performs K-means clustering on GPU / CUDA.
 88 | /// @param init centroids initialization method.
 89 | /// @param init_params pointer to a struct / number with centroid initialization
 90 | ///                    parameters. Ignored unless init == kmcudaInitMethodAFKMC2.
 91 | ///                    In case with kmcudaInitMethodAFKMC2 it is expected to be
 92 | ///                    uint32_t* to m; m == 0 means the default value (200).
 93 | /// @param tolerance if the number of reassignments drops below this ratio, stop.
 94 | /// @param yinyang_t the relative number of cluster groups, usually 0.1.
 95 | /// @param metric the distance metric to use. The default is Euclidean (L2), can be
 96 | ///               changed to cosine to behave as Spherical K-means with the angular
 97 | ///               distance. Please note that samples *must* be normalized in that
 98 | ///               case.
 99 | /// @param samples_size number of samples.
100 | /// @param features_size number of features (vector dimensionality).
101 | /// @param clusters_size number of clusters.
102 | /// @param seed random generator seed passed to srand().
103 | /// @param device used CUDA device mask. E.g., 1 means #0, 2 means #1 and 3 means
104 | ///               #0 and #1. n-th bit corresponds to n-th device.
105 | /// @param device_ptrs If negative, input and output pointers are taken from host;
106 | ///                    otherwise, device number where to load and store data.
107 | /// @param fp16x2 If true, the input is treated as half2 instead of float. In that case,
108 | ///               features_size must be 2 times smaller than the real size.
109 | /// @param verbosity 0 - no output; 1 - progress output; >=2 - debug output.
110 | /// @param samples input array of size samples_size x features_size in row major format.
111 | /// @param centroids output array of centroids of size clusters_size x features_size
112 | ///                  in row major format.
113 | /// @param assignments output array of cluster indices for each sample of size
114 | ///                    samples_size x 1.
115 | /// @param average_distance output mean distance between cluster elements and
116 | ///                         the corresponding centroids. If nullptr, not calculated.
117 | /// @return KMCUDAResult.
118 | KMCUDAResult kmeans_cuda(
119 |     KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
120 |     KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
121 |     uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
122 |     int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
123 |     uint32_t *assignments, float *average_distance);
124 | 
125 | /// @brief Calculates K nearest neighbors for every sample using
126 | ///        the precalculated K-means clusters.
127 | /// @param k the number of neighbors to search for every dot.
128 | /// @param metric the distance metric to use. The default is Euclidean (L2), can be
129 | ///               changed to cosine to behave as Spherical K-means with the angular
130 | ///               distance. Please note that samples *must* be normalized in that
131 | ///               case.
132 | /// @param samples_size number of samples.
133 | /// @param features_size number of features (vector dimensionality).
134 | /// @param clusters_size number of clusters.
135 | /// @param device used CUDA device mask. E.g., 1 means #0, 2 means #1 and 3 means
136 | ///               #0 and #1. n-th bit corresponds to n-th device.
137 | /// @param device_ptrs If negative, input and output pointers are taken from host;
138 | ///                    otherwise, device number where to load and store data.
139 | /// @param fp16x2 If true, the input is treated as half2 instead of float. In that case,
140 | ///               features_size must be 2 times smaller than the real size.
141 | /// @param verbosity 0 - no output; 1 - progress output; >=2 - debug output.
142 | /// @param samples input array of size samples_size x features_size in row major format.
143 | /// @param centroids input array of centroids of size clusters_size x features_size
144 | ///                  in row major format.
145 | /// @param assignments input array of cluster indices for each sample of size
146 | ///                    samples_size x 1.
147 | /// @param neighbors output array with the nearest neighbors of size
148 | ///                  samples_size x k in row major format.
149 | /// @return KMCUDAResult.
150 | KMCUDAResult knn_cuda(
151 |     uint16_t k, KMCUDADistanceMetric metric, uint32_t samples_size,
152 |     uint16_t features_size, uint32_t clusters_size, uint32_t device,
153 |     int32_t device_ptrs, int32_t fp16x2, int32_t verbosity,
154 |     const float *samples, const float *centroids, const uint32_t *assignments,
155 |     uint32_t *neighbors);
156 | 
157 | #ifdef __cplusplus
158 | }  // extern "C"
159 | #endif
160 | 
161 | #ifdef __cplusplus
162 | #include <string>
163 | #include <unordered_map>
164 | 
165 | namespace {  // unnamed: every translation unit including this header gets its own maps
166 | namespace kmcuda {
167 | /// Mapping from strings to KMCUDAInitMethod - useful for wrappers.
168 | const std::unordered_map<std::string, KMCUDAInitMethod> init_methods {  // no entry for kmcudaInitMethodImport
169 |     {"kmeans++", kmcudaInitMethodPlusPlus},
170 |     {"k-means++", kmcudaInitMethodPlusPlus},
171 |     {"afkmc2", kmcudaInitMethodAFKMC2},
172 |     {"afk-mc2", kmcudaInitMethodAFKMC2},
173 |     {"random", kmcudaInitMethodRandom}
174 | };
175 | 
176 | /// Mapping from strings to KMCUDADistanceMetric - useful for wrappers.
177 | const std::unordered_map<std::string, KMCUDADistanceMetric> metrics {  // several aliases per metric
178 |     {"euclidean", kmcudaDistanceMetricL2},
179 |     {"L2", kmcudaDistanceMetricL2},
180 |     {"l2", kmcudaDistanceMetricL2},
181 |     {"cos", kmcudaDistanceMetricCosine},
182 |     {"cosine", kmcudaDistanceMetricCosine},
183 |     {"angular", kmcudaDistanceMetricCosine}
184 | };
185 | 
186 | /// Mapping from KMCUDAResult to strings - useful for wrappers.
187 | const std::unordered_map<int, const char *> statuses {  // keyed by int, not by KMCUDAResult
188 |     {kmcudaSuccess, "Success"},
189 |     {kmcudaInvalidArguments, "InvalidArguments"},
190 |     {kmcudaNoSuchDevice, "NoSuchDevice"},
191 |     {kmcudaMemoryAllocationFailure, "MemoryAllocationFailure"},
192 |     {kmcudaRuntimeError, "RuntimeError"},
193 |     {kmcudaMemoryCopyError, "MemoryCopyError"}
194 | };
195 | }  // namespace kmcuda
196 | }  // namespace
197 | #endif  // __cplusplus
198 | 
199 | #endif //KMCUDA_KMCUDA_H
200 | 


--------------------------------------------------------------------------------
/src/metric_abstraction.h:
--------------------------------------------------------------------------------
  1 | //
  2 | // distance and normalization functions.
  3 | //
  4 | 
  5 | #ifndef KMCUDA_METRIC_ABSTRACTION_H
  6 | #define KMCUDA_METRIC_ABSTRACTION_H
  7 | 
  8 | #include "fp_abstraction.h"
  9 | 
 10 | __constant__ uint16_t d_features_size;
 11 | 
 12 | template <KMCUDADistanceMetric M, typename F>  // M: distance metric, F: element type
 13 | struct METRIC {  // primary template: declarations only, nothing is defined here
 14 |   FPATTR static typename HALF<F>::type distance(F sqr1, F sqr2, F prod);
 15 |   FPATTR static float distance(const F *__restrict__ v1, const F *__restrict__ v2);
 16 |   FPATTR static void normalize(uint32_t count, F *vec);
 17 | };
 18 | 
 19 | template <typename F>
 20 | struct METRIC<kmcudaDistanceMetricL2, F> {
 21 |   FPATTR static F sum_squares(
 22 |       const F *__restrict__ vec, F *__restrict__ cache) {
 23 |     F ssqr = _const<F>(0), corr = _const<F>(0);
 24 |     #pragma unroll 4
 25 |     for (int f = 0; f < d_features_size; f++) {
 26 |       F v = vec[f];
 27 |       if (cache) {
 28 |         cache[f] = v;
 29 |       }
 30 |       F y = _fma(corr, v, v);
 31 |       F t = _add(ssqr, y);
 32 |       corr = _sub(y, _sub(t, ssqr));
 33 |       ssqr = t;
 34 |     }
 35 |     return ssqr;
 36 |   }
 37 | 
 38 |   FPATTR static F sum_squares_t(
 39 |       const F *__restrict__ vec, F *__restrict__ cache, uint64_t size, uint64_t index) {
 40 |     F ssqr = _const<F>(0), corr = _const<F>(0);
 41 |     #pragma unroll 4
 42 |     for (uint64_t f = 0; f < d_features_size; f++) {
 43 |       F v = vec[f * size + index];
 44 |       if (cache) {
 45 |         cache[f] = v;
 46 |       }
 47 |       F y = _fma(corr, v, v);
 48 |       F t = _add(ssqr, y);
 49 |       corr = _sub(y, _sub(t, ssqr));
 50 |       ssqr = t;
 51 |     }
 52 |     return ssqr;
 53 |   }
 54 | 
 55 |   FPATTR static typename HALF<F>::type distance(F sqr1, F sqr2, F prod) {
 56 |     return _fin(_fma(_add(sqr1, sqr2), _const<F>(-2), prod));
 57 |   }
 58 | 
 59 |   FPATTR static float distance(const F *__restrict__ v1, const F *__restrict__ v2) {
 60 |     // Kahan summation with inverted c
 61 |     F dist = _const<F>(0), corr = _const<F>(0);
 62 |     #pragma unroll 4
 63 |     for (int f = 0; f < d_features_size; f++) {
 64 |       F d = _sub(v1[f], v2[f]);
 65 |       F y = _fma(corr, d, d);
 66 |       F t = _add(dist, y);
 67 |       corr = _sub(y, _sub(t, dist));
 68 |       dist = t;
 69 |     }
 70 |     return _sqrt(_float(_fin(dist)));
 71 |   }
 72 | 
 73 |   FPATTR static float distance_t(const F *__restrict__ v1, const F *__restrict__ v2,
 74 |                                  uint64_t v1_size, uint64_t v1_index) {
 75 |     // Kahan summation with inverted c
 76 |     F dist = _const<F>(0), corr = _const<F>(0);
 77 |     #pragma unroll 4
 78 |     for (uint64_t f = 0; f < d_features_size; f++) {
 79 |       F d = _sub(v1[v1_size * f + v1_index], v2[f]);
 80 |       F y = _fma(corr, d, d);
 81 |       F t = _add(dist, y);
 82 |       corr = _sub(y, _sub(t, dist));
 83 |       dist = t;
 84 |     }
 85 |     return _sqrt(_float(_fin(dist)));
 86 |   }
 87 | 
 88 |   FPATTR static float distance_tt(const F *__restrict__ v, uint64_t size,
 89 |                                   uint64_t index1, uint64_t index2) {
 90 |     // Kahan summation with inverted c
 91 |     F dist = _const<F>(0), corr = _const<F>(0);
 92 |     #pragma unroll 4
 93 |     for (uint64_t f = 0; f < d_features_size; f++) {
 94 |       F d = _sub(v[size * f + index1], v[size * f + index2]);
 95 |       F y = _fma(corr, d, d);
 96 |       F t = _add(dist, y);
 97 |       corr = _sub(y, _sub(t, dist));
 98 |       dist = t;
 99 |     }
100 |     return _sqrt(_float(_fin(dist)));
101 |   }
102 | 
103 |   FPATTR static float partial(const F *__restrict__ v1, const F *__restrict__ v2,
104 |                               uint16_t size) {
105 |     // Kahan summation with inverted c
106 |     F dist = _const<F>(0), corr = _const<F>(0);
107 |     #pragma unroll 4
108 |     for (int f = 0; f < size; f++) {
109 |       F d = _sub(v1[f], v2[f]);
110 |       F y = _fma(corr, d, d);
111 |       F t = _add(dist, y);
112 |       corr = _sub(y, _sub(t, dist));
113 |       dist = t;
114 |     }
115 |     return _float(_fin(dist));
116 |   }
117 | 
118 |   FPATTR static float partial_t(
119 |       const F *__restrict__ v1, const F *__restrict__ v2, uint16_t f_size,
120 |       uint64_t v1_size, uint64_t v1_offset, uint64_t v1_index) {
121 |     // Kahan summation with inverted c
122 |     F dist = _const<F>(0), corr = _const<F>(0);
123 |     #pragma unroll 4
124 |     for (int f = 0; f < f_size; f++) {
125 |       F d = _sub(v1[v1_size * (f + v1_offset) + v1_index], v2[f]);
126 |       F y = _fma(corr, d, d);
127 |       F t = _add(dist, y);
128 |       corr = _sub(y, _sub(t, dist));
129 |       dist = t;
130 |     }
131 |     return _float(_fin(dist));
132 |   }
133 | 
134 |   FPATTR static float finalize(float partial) {
135 |     return _sqrt(partial);
136 |   }
137 | 
138 |   FPATTR static void normalize(uint32_t count, F *vec) {
139 |     F rc = _reciprocal(_const<F>(count));
140 |     #pragma unroll 4
141 |     for (int f = 0; f < d_features_size; f++) {
142 |       vec[f] = _mul(vec[f], rc);
143 |     }
144 |   }
145 | };
146 | 
147 | template <typename F>
148 | struct METRIC<kmcudaDistanceMetricCosine, F> {
149 |   FPATTR static F sum_squares(
150 |       const F *__restrict__ vec, F *__restrict__ cache) {
151 |     if (cache) {
152 |       #pragma unroll 4
153 |       for (int f = 0; f < d_features_size; f++) {
154 |         cache[f] = vec[f];
155 |       }
156 |     }
157 |     return _const<F>(1);
158 |   }
159 | 
160 |   FPATTR static F sum_squares_t(
161 |       const F *__restrict__ vec, F *__restrict__ cache, uint64_t size, uint64_t index) {
162 |     if (cache) {
163 |       #pragma unroll 4
164 |       for (uint64_t f = 0; f < d_features_size; f++) {
165 |         cache[f] = vec[f * size + index];
166 |       }
167 |     }
168 |     return _const<F>(1);
169 |   }
170 | 
171 |   FPATTR static typename HALF<F>::type distance(
172 |       F sqr1 __attribute__((unused)), F sqr2 __attribute__((unused)), F prod) {
173 |     float fp = _float(_fin(prod));
174 |     if (fp >= 1.f) return _half<F>(0.f);
175 |     if (fp <= -1.f) return _half<F>(M_PI);
176 |     return _half<F>(acos(fp));
177 |   }
178 | 
179 |   FPATTR static float distance(const F *__restrict__ v1, const F *__restrict__ v2) {
180 |     // Kahan summation with inverted c
181 |     F prod = _const<F>(0), corr = _const<F>(0);
182 |     #pragma unroll 4
183 |     for (int f = 0; f < d_features_size; f++) {
184 |       F yprod = _fma(corr, v1[f], v2[f]);
185 |       F tprod = _add(prod, yprod);
186 |       corr = _sub(yprod, _sub(tprod, prod));
187 |       prod = tprod;
188 |     }
189 |     return _float(distance(_const<F>(1), _const<F>(1), prod));
190 |   }
191 | 
192 |   FPATTR static float distance_t(const F *__restrict__ v1, const F *__restrict__ v2,
193 |                                  uint64_t v1_size, uint64_t v1_index) {
194 |     // Kahan summation with inverted c
195 |     F prod = _const<F>(0), corr = _const<F>(0);
196 |     #pragma unroll 4
197 |     for (uint64_t f = 0; f < d_features_size; f++) {
198 |       F yprod = _fma(corr, v1[v1_size * f + v1_index], v2[f]);
199 |       F tprod = _add(prod, yprod);
200 |       corr = _sub(yprod, _sub(tprod, prod));
201 |       prod = tprod;
202 |     }
203 |     return _float(distance(_const<F>(1), _const<F>(1), prod));
204 |   }
205 | 
206 |   FPATTR static float distance_tt(const F *__restrict__ v, uint64_t size,
207 |                                   uint64_t index1, uint64_t index2) {
208 |     // Kahan summation with inverted c
209 |     F prod = _const<F>(0), corr = _const<F>(0);
210 |     #pragma unroll 4
211 |     for (uint64_t f = 0; f < d_features_size; f++) {
212 |       F yprod = _fma(corr, v[size * f + index1], v[size * f + index2]);
213 |       F tprod = _add(prod, yprod);
214 |       corr = _sub(yprod, _sub(tprod, prod));
215 |       prod = tprod;
216 |     }
217 |     return _float(distance(_const<F>(1), _const<F>(1), prod));
218 |   }
219 | 
220 |   FPATTR static float partial(const F *__restrict__ v1, const F *__restrict__ v2,
221 |                               uint16_t size) {
222 |     // Kahan summation with inverted c
223 |     F prod = _const<F>(0), corr = _const<F>(0);
224 |     #pragma unroll 4
225 |     for (int f = 0; f < size; f++) {
226 |       F yprod = _fma(corr, v1[f], v2[f]);
227 |       F tprod = _add(prod, yprod);
228 |       corr = _sub(yprod, _sub(tprod, prod));
229 |       prod = tprod;
230 |     }
231 |     return _float(_fin(prod));
232 |   }
233 | 
234 |   FPATTR static float partial_t(
235 |       const F *__restrict__ v1, const F *__restrict__ v2, uint16_t f_size,
236 |       uint64_t v1_size, uint64_t v1_offset, uint64_t v1_index) {
237 |     // Kahan summation with inverted c
238 |     F prod = _const<F>(0), corr = _const<F>(0);
239 |     #pragma unroll 4
240 |     for (int f = 0; f < f_size; f++) {
241 |       F yprod = _fma(corr, v1[v1_size * (f + v1_offset) + v1_index], v2[f]);
242 |       F tprod = _add(prod, yprod);
243 |       corr = _sub(yprod, _sub(tprod, prod));
244 |       prod = tprod;
245 |     }
246 |     return _float(_fin(prod));
247 |   }
248 | 
249 |   FPATTR static float finalize(float partial) {
250 |     if (partial >= 1.f) return 0.f;
251 |     if (partial <= -1.f) return M_PI;
252 |     return acos(partial);
253 |   }
254 | 
255 |   FPATTR static void normalize(uint32_t count __attribute__((unused)), float *vec) {
256 |     // Kahan summation with inverted c
257 |     float norm = 0, corr = 0;
258 |     #pragma unroll 4
259 |     for (int f = 0; f < d_features_size; f++) {
260 |       float v = vec[f];
261 |       float y = _fma(corr, v, v);
262 |       float t = norm + y;
263 |       corr = y - (t - norm);
264 |       norm = t;
265 |     }
266 |     norm = _reciprocal(_sqrt(norm));
267 | 
268 |     #pragma unroll 4
269 |     for (int f = 0; f < d_features_size; f++) {
270 |       vec[f] = vec[f] * norm;
271 |     }
272 |   }
273 | 
274 |   #if CUDA_ARCH >= 60
275 |   FPATTR static void normalize(uint32_t count __attribute__((unused)), half2 *vec) {
276 |     // We really have to calculate the norm in 32-bit floats because the maximum
277 |     // finite value which a 16-bit float can represent is 65504 (just under 2^16).
278 |     float norm = 0, corr = 0;
279 |     #pragma unroll 4
280 |     for (int f = 0; f < d_features_size; f++) {
281 |       half2 v = vec[f];
282 |       float v1 = _float(__high2half(v));
283 |       float v2 = _float(__low2half(v));
284 | 
285 |       float y = _fma(corr, v1, v1);
286 |       float t = norm + y;
287 |       corr = y - (t - norm);
288 |       norm = t;
289 | 
290 |       y = _fma(corr, v2, v2);
291 |       t = norm + y;
292 |       corr = y - (t - norm);
293 |       norm = t;
294 |     }
295 |     norm = _reciprocal(_sqrt(norm));
296 |     half2 norm2 = _fout(_half<half2>(norm));
297 |     #pragma unroll 4
298 |     for (int f = 0; f < d_features_size; f++) {
299 |       vec[f] = _mul(vec[f], norm2);
300 |     }
301 |   }
302 |   #endif
303 | };
304 | 
305 | #endif //KMCUDA_METRIC_ABSTRACTION_H
306 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 | Apache License
  2 | ==============
  3 | 
  4 | _Version 2.0, January 2004_  
  5 | _&lt;<http://www.apache.org/licenses/>&gt;_
  6 | 
  7 | ### Terms and Conditions for use, reproduction, and distribution
  8 | 
  9 | #### 1. Definitions
 10 | 
 11 | “License” shall mean the terms and conditions for use, reproduction, and
 12 | distribution as defined by Sections 1 through 9 of this document.
 13 | 
 14 | “Licensor” shall mean the copyright owner or entity authorized by the copyright
 15 | owner that is granting the License.
 16 | 
 17 | “Legal Entity” shall mean the union of the acting entity and all other entities
 18 | that control, are controlled by, or are under common control with that entity.
 19 | For the purposes of this definition, “control” means **(i)** the power, direct or
 20 | indirect, to cause the direction or management of such entity, whether by
 21 | contract or otherwise, or **(ii)** ownership of fifty percent (50%) or more of the
 22 | outstanding shares, or **(iii)** beneficial ownership of such entity.
 23 | 
 24 | “You” (or “Your”) shall mean an individual or Legal Entity exercising
 25 | permissions granted by this License.
 26 | 
 27 | “Source” form shall mean the preferred form for making modifications, including
 28 | but not limited to software source code, documentation source, and configuration
 29 | files.
 30 | 
 31 | “Object” form shall mean any form resulting from mechanical transformation or
 32 | translation of a Source form, including but not limited to compiled object code,
 33 | generated documentation, and conversions to other media types.
 34 | 
 35 | “Work” shall mean the work of authorship, whether in Source or Object form, made
 36 | available under the License, as indicated by a copyright notice that is included
 37 | in or attached to the work (an example is provided in the Appendix below).
 38 | 
 39 | “Derivative Works” shall mean any work, whether in Source or Object form, that
 40 | is based on (or derived from) the Work and for which the editorial revisions,
 41 | annotations, elaborations, or other modifications represent, as a whole, an
 42 | original work of authorship. For the purposes of this License, Derivative Works
 43 | shall not include works that remain separable from, or merely link (or bind by
 44 | name) to the interfaces of, the Work and Derivative Works thereof.
 45 | 
 46 | “Contribution” shall mean any work of authorship, including the original version
 47 | of the Work and any modifications or additions to that Work or Derivative Works
 48 | thereof, that is intentionally submitted to Licensor for inclusion in the Work
 49 | by the copyright owner or by an individual or Legal Entity authorized to submit
 50 | on behalf of the copyright owner. For the purposes of this definition,
 51 | “submitted” means any form of electronic, verbal, or written communication sent
 52 | to the Licensor or its representatives, including but not limited to
 53 | communication on electronic mailing lists, source code control systems, and
 54 | issue tracking systems that are managed by, or on behalf of, the Licensor for
 55 | the purpose of discussing and improving the Work, but excluding communication
 56 | that is conspicuously marked or otherwise designated in writing by the copyright
 57 | owner as “Not a Contribution.”
 58 | 
 59 | “Contributor” shall mean Licensor and any individual or Legal Entity on behalf
 60 | of whom a Contribution has been received by Licensor and subsequently
 61 | incorporated within the Work.
 62 | 
 63 | #### 2. Grant of Copyright License
 64 | 
 65 | Subject to the terms and conditions of this License, each Contributor hereby
 66 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
 67 | irrevocable copyright license to reproduce, prepare Derivative Works of,
 68 | publicly display, publicly perform, sublicense, and distribute the Work and such
 69 | Derivative Works in Source or Object form.
 70 | 
 71 | #### 3. Grant of Patent License
 72 | 
 73 | Subject to the terms and conditions of this License, each Contributor hereby
 74 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free,
 75 | irrevocable (except as stated in this section) patent license to make, have
 76 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where
 77 | such license applies only to those patent claims licensable by such Contributor
 78 | that are necessarily infringed by their Contribution(s) alone or by combination
 79 | of their Contribution(s) with the Work to which such Contribution(s) was
 80 | submitted. If You institute patent litigation against any entity (including a
 81 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a
 82 | Contribution incorporated within the Work constitutes direct or contributory
 83 | patent infringement, then any patent licenses granted to You under this License
 84 | for that Work shall terminate as of the date such litigation is filed.
 85 | 
 86 | #### 4. Redistribution
 87 | 
 88 | You may reproduce and distribute copies of the Work or Derivative Works thereof
 89 | in any medium, with or without modifications, and in Source or Object form,
 90 | provided that You meet the following conditions:
 91 | 
 92 | * **(a)** You must give any other recipients of the Work or Derivative Works a copy of
 93 | this License; and
 94 | * **(b)** You must cause any modified files to carry prominent notices stating that You
 95 | changed the files; and
 96 | * **(c)** You must retain, in the Source form of any Derivative Works that You distribute,
 97 | all copyright, patent, trademark, and attribution notices from the Source form
 98 | of the Work, excluding those notices that do not pertain to any part of the
 99 | Derivative Works; and
100 | * **(d)** If the Work includes a “NOTICE” text file as part of its distribution, then any
101 | Derivative Works that You distribute must include a readable copy of the
102 | attribution notices contained within such NOTICE file, excluding those notices
103 | that do not pertain to any part of the Derivative Works, in at least one of the
104 | following places: within a NOTICE text file distributed as part of the
105 | Derivative Works; within the Source form or documentation, if provided along
106 | with the Derivative Works; or, within a display generated by the Derivative
107 | Works, if and wherever such third-party notices normally appear. The contents of
108 | the NOTICE file are for informational purposes only and do not modify the
109 | License. You may add Your own attribution notices within Derivative Works that
110 | You distribute, alongside or as an addendum to the NOTICE text from the Work,
111 | provided that such additional attribution notices cannot be construed as
112 | modifying the License.
113 | 
114 | You may add Your own copyright statement to Your modifications and may provide
115 | additional or different license terms and conditions for use, reproduction, or
116 | distribution of Your modifications, or for any such Derivative Works as a whole,
117 | provided Your use, reproduction, and distribution of the Work otherwise complies
118 | with the conditions stated in this License.
119 | 
120 | #### 5. Submission of Contributions
121 | 
122 | Unless You explicitly state otherwise, any Contribution intentionally submitted
123 | for inclusion in the Work by You to the Licensor shall be under the terms and
124 | conditions of this License, without any additional terms or conditions.
125 | Notwithstanding the above, nothing herein shall supersede or modify the terms of
126 | any separate license agreement you may have executed with Licensor regarding
127 | such Contributions.
128 | 
129 | #### 6. Trademarks
130 | 
131 | This License does not grant permission to use the trade names, trademarks,
132 | service marks, or product names of the Licensor, except as required for
133 | reasonable and customary use in describing the origin of the Work and
134 | reproducing the content of the NOTICE file.
135 | 
136 | #### 7. Disclaimer of Warranty
137 | 
138 | Unless required by applicable law or agreed to in writing, Licensor provides the
139 | Work (and each Contributor provides its Contributions) on an “AS IS” BASIS,
140 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied,
141 | including, without limitation, any warranties or conditions of TITLE,
142 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are
143 | solely responsible for determining the appropriateness of using or
144 | redistributing the Work and assume any risks associated with Your exercise of
145 | permissions under this License.
146 | 
147 | #### 8. Limitation of Liability
148 | 
149 | In no event and under no legal theory, whether in tort (including negligence),
150 | contract, or otherwise, unless required by applicable law (such as deliberate
151 | and grossly negligent acts) or agreed to in writing, shall any Contributor be
152 | liable to You for damages, including any direct, indirect, special, incidental,
153 | or consequential damages of any character arising as a result of this License or
154 | out of the use or inability to use the Work (including but not limited to
155 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or
156 | any and all other commercial damages or losses), even if such Contributor has
157 | been advised of the possibility of such damages.
158 | 
159 | #### 9. Accepting Warranty or Additional Liability
160 | 
161 | While redistributing the Work or Derivative Works thereof, You may choose to
162 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or
163 | other liability obligations and/or rights consistent with this License. However,
164 | in accepting such obligations, You may act only on Your own behalf and on Your
165 | sole responsibility, not on behalf of any other Contributor, and only if You
166 | agree to indemnify, defend, and hold each Contributor harmless for any liability
167 | incurred by, or claims asserted against, such Contributor by reason of your
168 | accepting any such warranty or additional liability.
169 | 
170 | _END OF TERMS AND CONDITIONS_
171 | 
172 | ### APPENDIX: How to apply the Apache License to your work
173 | 
174 | To apply the Apache License to your work, attach the following boilerplate
175 | notice, with the fields enclosed by brackets `[]` replaced with your own
176 | identifying information. (Don't include the brackets!) The text should be
177 | enclosed in the appropriate comment syntax for the file format. We also
178 | recommend that a file or class name and description of purpose be included on
179 | the same “printed page” as the copyright notice for easier identification within
180 | third-party archives.
181 | 
182 |     Copyright [yyyy] [name of copyright owner]
183 |     
184 |     Licensed under the Apache License, Version 2.0 (the "License");
185 |     you may not use this file except in compliance with the License.
186 |     You may obtain a copy of the License at
187 |     
188 |       http://www.apache.org/licenses/LICENSE-2.0
189 |     
190 |     Unless required by applicable law or agreed to in writing, software
191 |     distributed under the License is distributed on an "AS IS" BASIS,
192 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
193 |     See the License for the specific language governing permissions and
194 |     limitations under the License.
195 | 


--------------------------------------------------------------------------------
/src/r.cc:
--------------------------------------------------------------------------------
  1 | #include <memory>
  2 | #include <string>
  3 | #include <unordered_map>
  4 | #include <unordered_set>
  5 | 
  6 | #include <R.h>
  7 | #include <Rinternals.h>
  8 | #include <R_ext/Rdynload.h>
  9 | #include "kmcuda.h"
 10 | 
 11 | namespace {
 12 |   /// Maps the .External() pairlist to name -> value: leading untagged values
 13 |   /// take the "required" names in order, tagged ones must be in "allowed".
 14 |   /// Violations are reported through error().
 15 |   std::unordered_map<std::string, SEXP> parse_args(
 16 |       const std::unordered_set<std::string> &allowed,
 17 |       std::initializer_list<std::string> required, SEXP args) {
 18 |     std::unordered_map<std::string, SEXP> result;
 19 |     bool pure = true;  // no tagged (keyword) argument has been seen yet
 20 |     args = CDR(args);  // skip the head node, it is not an argument
 21 |     for (size_t i = 0; args != R_NilValue; i++, args = CDR(args)) {
 22 |       if (!isNull(TAG(args))) {
 23 |         pure = false;
 24 |         const char *name = CHAR(PRINTNAME(TAG(args)));
 25 |         if (!allowed.count(name)) error("got an unexpected keyword argument \"%s\"", name);
 26 |         result.emplace(name, CAR(args));
 27 |       } else if (!pure || i != result.size()) {
 28 |         error("positional argument follows keyword argument");
 29 |       } else if (i >= required.size()) {  // required.begin()[i] would overrun
 30 |         error("too many positional arguments (at most %d)", (int) required.size());
 31 |       } else {
 32 |         result.emplace(required.begin()[i], CAR(args));
 33 |       }
 34 |     }
 35 |     return result;
 36 |   }
 37 | 
 38 |   /// Looks up the R string "name" in "dict" and returns the mapped value;
 39 |   /// "arg" only labels the error() raised for a non-string or an unknown key.
 40 |   template<typename R>
 41 |   R parse_dict(const std::unordered_map<std::string, R>& dict, const char *arg,
 42 |                SEXP name) {
 43 |     if (!isString(name)) {
 44 |       error("\"%s\" name must be a string", arg);
 45 |     }
 46 |     const char *key = CHAR(asChar(name));
 47 |     const auto found = dict.find(key);
 48 |     if (found == dict.end()) error("\"%s\" = \"%s\" is not supported", arg, key);
 49 |     return found->second;
 50 |   }
 51 | 
 52 |   /// Reads the first element of an R vector as int. An empty vector or a value
 53 |   /// which asInteger() cannot convert yields NA_INTEGER instead of a stray read.
 54 |   int parse_int(SEXP value) {
 55 |     // asInteger() checks the length and dispatches on the storage type itself.
 56 |     return asInteger(value);
 57 |   }
 58 | 
 59 |   /// Fetches the optional integer argument "name" from kwargs: "def" is
 60 |   /// returned when it is absent, error() is raised when it is not numeric.
 61 |   int parse_int(const std::unordered_map<std::string, SEXP> &kwargs,
 62 |                 const std::string &name, int def) {
 63 |     const auto entry = kwargs.find(name);
 64 |     if (entry == kwargs.end()) return def;
 65 |     if (!isNumeric(entry->second)) {
 66 |       error("\"%s\" must be an integer", name.c_str());
 67 |     }
 68 |     return parse_int(entry->second);
 69 |   }
 70 | 
 71 |   /// Converts "samples" from kwargs (a real matrix or a list of real matrices)
 72 |   /// into one row-major float array, stacking the chunks by rows. Outputs the
 73 |   /// buffer, the total row count and the shared column count; error() on misuse.
 74 |   void parse_samples(
 75 |       const std::unordered_map<std::string, SEXP> &kwargs,
 76 |       std::unique_ptr<float[]> *samples, uint32_t *samples_size,
 77 |       uint16_t *features_size) {
 78 |     auto samples_iter = kwargs.find("samples");
 79 |     if (samples_iter == kwargs.end()) {
 80 |       error("\"samples\" must be defined");
 81 |     }
 82 |     SEXP samples_obj = samples_iter->second;
 83 |     std::unique_ptr<SEXP[]> samples_chunks;
 84 |     int chunks_size = 0;
 85 |     if (isReal(samples_obj)) {
 86 |       chunks_size = 1;
 87 |       samples_chunks.reset(new SEXP[1]);
 88 |       samples_chunks[0] = samples_obj;
 89 |     } else if (TYPEOF(samples_obj) == VECSXP && length(samples_obj) > 0) {
 90 |       chunks_size = length(samples_obj);
 91 |       samples_chunks.reset(new SEXP[chunks_size]);
 92 |       for (int i = 0; i < chunks_size; i++) {
 93 |         samples_chunks[i] = VECTOR_ELT(samples_obj, i);
 94 |       }
 95 |     } else {
 96 |       // An empty list lands here as well: it would leave *features_size unset.
 97 |       error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices");
 98 |     }
 99 |     *samples_size = 0;
100 |     for (int i = 0; i < chunks_size; i++) {
101 |       if (!isReal(samples_chunks[i])) {
102 |         error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices");
103 |       }
104 |       SEXP dims = getAttrib(samples_chunks[i], R_DimSymbol);
105 |       if (length(dims) != 2) {
106 |         error("\"samples\" must be a 2D real matrix or a vector of 2D real matrices");
107 |       }
108 |       int samples_size_i = INTEGER(dims)[0];
109 |       if (static_cast<int64_t>(*samples_size) + samples_size_i > INT32_MAX) {
110 |         error("too many samples (>INT32_MAX)");
111 |       }
112 |       *samples_size += samples_size_i;
113 |       int features_size_i = INTEGER(dims)[1];
114 |       if (features_size_i > UINT16_MAX) {
115 |         error("too many features (>UINT16_MAX)");
116 |       }
117 |       if (i == 0) {
118 |         *features_size = features_size_i;
119 |       } else if (*features_size != features_size_i) {
120 |         error("\"samples\" vector contains matrices with different number of columns");
121 |       }
122 |     }
123 |     samples->reset(new float[static_cast<uint64_t>(*samples_size) * *features_size]);
124 |     float *samples_float = samples->get();
125 |     // 64-bit element offset: rows * columns may exceed the 32-bit range.
126 |     uint64_t offset = 0;
127 |     for (int i = 0; i < chunks_size; i++) {
128 |       double *samples_double = REAL(samples_chunks[i]);
129 |       uint64_t fsize = *features_size;
130 |       uint64_t ssize = INTEGER(getAttrib(samples_chunks[i], R_DimSymbol))[0];
131 |       // The chunk is indexed column by column, the output row by row: transpose.
132 |       #pragma omp parallel for
133 |       for (uint64_t f = 0; f < fsize; f++) {
134 |         for (uint64_t s = 0; s < ssize; s++) {
135 |           samples_float[offset + s * fsize + f] = samples_double[f * ssize + s];
136 |         }
137 |       }
138 |       offset += ssize * fsize;
139 |     }
140 |   }
141 | 
142 |   /// Resolves the optional "metric" argument through kmcuda::metrics;
143 |   /// parse_dict() raises error() for a non-string or an unsupported name.
144 |   KMCUDADistanceMetric parse_metric(
145 |       const std::unordered_map<std::string, SEXP> &kwargs) {
146 |     const auto found = kwargs.find("metric");
147 |     // An absent argument selects the default metric.
148 |     if (found == kwargs.end()) return kmcudaDistanceMetricL2;
149 |     return parse_dict(kmcuda::metrics, "metric", found->second);
150 |   }
151 | 
152 |   int parse_device(
153 |       const std::unordered_map<std::string, SEXP> &kwargs) {
154 |     int device = parse_int(kwargs, "device", 0);
155 |     if (device < 0) {
156 |       error("\"device\" may not be negative");
157 |     }
158 |     return device;
159 |   }
160 |   /// Argument names which r_kmeans_cuda() hands to parse_args() as "allowed".
161 |   static const std::unordered_set<std::string> kmeans_kwargs {
162 |       "samples", "clusters", "tolerance", "init", "yinyang_t", "metric",
163 |       "average_distance", "seed", "device", "verbosity"
164 |   };
165 |   /// Argument names which r_knn_cuda() hands to parse_args() as "allowed".
166 |   static const std::unordered_set<std::string> knn_kwargs {
167 |       "k", "samples", "centroids", "assignments", "metric", "device",
168 |       "verbosity"
169 |   };
170 | }
171 | 
172 | extern "C" {
173 | 
174 | // Entry points; both are defined further down in this file.
175 | static SEXP r_kmeans_cuda(SEXP args);
176 | static SEXP r_knn_cuda(SEXP args);
177 | // Null-terminated table of the routines registered in R_init_libKMCUDA().
178 | static R_ExternalMethodDef externalMethods[] = {
179 |    {"kmeans_cuda", reinterpret_cast<DL_FUNC>(&r_kmeans_cuda), -1},
180 |    {"knn_cuda", reinterpret_cast<DL_FUNC>(&r_knn_cuda), -1},
181 |    {nullptr, nullptr, 0}};
182 | /// Registers externalMethods as the only (.External) routines of this library.
183 | void R_init_libKMCUDA(DllInfo *info) {
184 |   R_registerRoutines(info, nullptr, nullptr, nullptr, externalMethods);
185 | }
186 | 
187 | /// R entry point of K-means. Validates the arguments, calls kmeans_cuda() and
188 | /// returns a named list: "centroids" (clusters x features real matrix),
189 | /// "assignments" (integers shifted by +1) and, on request, "average_distance".
190 | static SEXP r_kmeans_cuda(SEXP args) {
191 |   auto kwargs = parse_args(kmeans_kwargs, {"samples", "clusters"}, args);
192 |   std::unique_ptr<float[]> samples;
193 |   uint32_t samples_size;
194 |   uint16_t features_size;
195 |   parse_samples(kwargs, &samples, &samples_size, &features_size);
196 |   // find(), not operator[]: the latter inserts a null SEXP for a missing key.
197 |   auto clusters_iter = kwargs.find("clusters");
198 |   SEXP clusters_obj = (clusters_iter != kwargs.end())? clusters_iter->second : R_NilValue;
199 |   int clusters_size = isNumeric(clusters_obj)? parse_int(clusters_obj) : 0;
200 |   if (clusters_size <= 0) {
201 |     error("\"clusters\" must be a positive integer");
202 |   }
203 |   if (static_cast<uint64_t>(clusters_size) * features_size > INT32_MAX
204 |       || static_cast<uint32_t>(clusters_size) >= samples_size) {
205 |     error("\"clusters\" is too big");
206 |   }
207 |   auto centroids = std::unique_ptr<float[]>(
208 |       new float[clusters_size * features_size]);
209 |   KMCUDAInitMethod init = kmcudaInitMethodPlusPlus;
210 |   int afkmc2_m = 0;
211 |   auto init_iter = kwargs.find("init");
212 |   if (init_iter != kwargs.end()) {
213 |     SEXP init_obj = init_iter->second;
214 |     if (isString(init_obj)) {
215 |       init = parse_dict(kmcuda::init_methods, "init", init_obj);
216 |     } else if (TYPEOF(init_obj) == VECSXP) {
217 |       if (length(init_obj) == 0) {
218 |         error("\"init\" may not be an empty list");
219 |       }
220 |       // A VECSXP is indexed with VECTOR_ELT(); CAR()/CAAR() are for pairlists.
221 |       SEXP init_name = VECTOR_ELT(init_obj, 0);
222 |       init = parse_dict(kmcuda::init_methods, "init", init_name);
223 |       if (init == kmcudaInitMethodAFKMC2 && length(init_obj) > 1) {
224 |         SEXP afkmc2_m_obj = VECTOR_ELT(init_obj, 1);
225 |         afkmc2_m = isNumeric(afkmc2_m_obj)? parse_int(afkmc2_m_obj) : 0;
226 |         if (afkmc2_m <= 0) {
227 |           error("\"init\" = %s: parameter must be a positive integer",
228 |                 CHAR(asChar(init_name)));
229 |         }
230 |       } else if (length(init_obj) != 1) {
231 |         error("\"init\" has wrong number of parameters");
232 |       }
233 |     } else if (isReal(init_obj)) {
234 |       init = kmcudaInitMethodImport;
235 |       SEXP dims = getAttrib(init_obj, R_DimSymbol);
236 |       if (length(dims) != 2
237 |           || INTEGER(dims)[0] != clusters_size
238 |           || INTEGER(dims)[1] != features_size) {
239 |         error("invalid centroids dimensions in \"init\"");
240 |       }
241 |       // The matrix is read column by column and stored row by row as floats.
242 |       double *centroids_double = REAL(init_obj);
243 |       #pragma omp parallel for
244 |       for (uint64_t f = 0; f < features_size; f++) {
245 |         for (int64_t c = 0; c < clusters_size; c++) {
246 |           centroids[c * features_size + f] = centroids_double[f * clusters_size + c];
247 |         }
248 |       }
249 |     } else {
250 |       error("\"init\" must be either a string or a list or a 2D matrix");
251 |     }
252 |   }
253 |   float tolerance = 0.01;
254 |   auto tolerance_iter = kwargs.find("tolerance");
255 |   if (tolerance_iter != kwargs.end()) {
256 |     if (!isReal(tolerance_iter->second)) {
257 |       error("\"tolerance\" must be a real value");
258 |     }
259 |     tolerance = REAL(tolerance_iter->second)[0];
260 |     if (tolerance < 0 || tolerance > 1) {
261 |       error("\"tolerance\" must be in [0, 1]");
262 |     }
263 |   }
264 |   float yinyang_t = 0.1;
265 |   auto yinyang_t_iter = kwargs.find("yinyang_t");
266 |   if (yinyang_t_iter != kwargs.end()) {
267 |     if (!isReal(yinyang_t_iter->second)) {
268 |       error("\"yinyang_t\" must be a real value");
269 |     }
270 |     yinyang_t = REAL(yinyang_t_iter->second)[0];
271 |     if (yinyang_t < 0 || yinyang_t > 0.5) {
272 |       error("\"yinyang_t\" must be in [0, 0.5]");
273 |     }
274 |   }
275 |   KMCUDADistanceMetric metric = parse_metric(kwargs);
276 |   uint32_t seed = parse_int(kwargs, "seed", time(NULL));
277 |   int device = parse_device(kwargs);
278 |   int verbosity = parse_int(kwargs, "verbosity", 0);
279 |   float average_distance, *average_distance_ptr = nullptr;
280 |   auto average_distance_iter = kwargs.find("average_distance");
281 |   if (average_distance_iter != kwargs.end()
282 |       && LOGICAL(average_distance_iter->second)[0]) {
283 |     average_distance_ptr = &average_distance;
284 |   }
285 |   auto assignments = std::unique_ptr<uint32_t[]>(new uint32_t[samples_size]);
286 |   auto result = kmeans_cuda(
287 |     init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size, features_size,
288 |     clusters_size, seed, device, -1, 0, verbosity, samples.get(),
289 |     centroids.get(), assignments.get(), average_distance_ptr);
290 |   if (result != kmcudaSuccess) {
291 |     error("kmeans_cuda error %d %s%s", result,
292 |           kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; "
293 |             "\"verbosity\" = 2 would reveal the details");
294 |   }
295 |   SEXP centroids2 = PROTECT(allocMatrix(REALSXP, clusters_size, features_size));
296 |   double *centroids_double = REAL(centroids2);
297 |   #pragma omp parallel for
298 |   for (uint64_t f = 0; f < features_size; f++) {
299 |     for (int64_t c = 0; c < clusters_size; c++) {
300 |       centroids_double[f * clusters_size + c] = centroids[c * features_size + f];
301 |     }
302 |   }
303 |   SEXP assignments2 = PROTECT(allocVector(INTSXP, samples_size));
304 |   uint32_t *assignments_ptr = assignments.get();
305 |   int *assignments2_ptr = INTEGER(assignments2);
306 |   #ifndef __APPLE__
307 |   #pragma omp parallel for simd
308 |   for (uint32_t i = 0; i < samples_size; i++) {
309 |     assignments2_ptr[i] = assignments_ptr[i] + 1;
310 |   }
311 |   #else
312 |   #pragma omp simd
313 |   for (uint32_t i = 0; i < samples_size; i++) {
314 |     assignments2_ptr[i] = assignments_ptr[i] + 1;
315 |   }
316 |   #endif
317 |   SEXP tuple = PROTECT(allocVector(VECSXP, 2 + (average_distance_ptr != nullptr)));
318 |   SET_VECTOR_ELT(tuple, 0, centroids2);
319 |   SET_VECTOR_ELT(tuple, 1, assignments2);
320 |   SEXP names = PROTECT(allocVector(
321 |       STRSXP, 2 + (average_distance_ptr != nullptr)));
322 |   SET_STRING_ELT(names, 0, mkChar("centroids"));
323 |   SET_STRING_ELT(names, 1, mkChar("assignments"));
324 |   if (average_distance_ptr != nullptr) {
325 |     SEXP average_distance2 = PROTECT(allocVector(REALSXP, 1));
326 |     REAL(average_distance2)[0] = average_distance;
327 |     SET_VECTOR_ELT(tuple, 2, average_distance2);
328 |     SET_STRING_ELT(names, 2, mkChar("average_distance"));
329 |   }
330 |   setAttrib(tuple, R_NamesSymbol, names);
331 |   UNPROTECT(4 + (average_distance_ptr != nullptr));
332 |   return tuple;
333 | }
334 | 
335 | /// R entry point of K-nn. Validates the arguments, calls knn_cuda() and returns
336 | /// a samples x k integer matrix holding the neighbor indices shifted by +1.
337 | static SEXP r_knn_cuda(SEXP args) {
338 |   auto kwargs = parse_args(knn_kwargs, {"k", "samples", "centroids", "assignments"}, args);
339 |   int k = parse_int(kwargs, "k", 0);
340 |   if (k <= 0) {
341 |     error("\"k\" must be positive");
342 |   }
343 |   std::unique_ptr<float[]> samples;
344 |   uint32_t samples_size;
345 |   uint16_t features_size;
346 |   parse_samples(kwargs, &samples, &samples_size, &features_size);
347 |   if (static_cast<uint64_t>(samples_size) * k > INT32_MAX) {
348 |     error("too big \"k\": dim(samples)[0] * k > INT32_MAX");
349 |   }
350 |   auto centroids_iter = kwargs.find("centroids");
351 |   if (centroids_iter == kwargs.end()) {
352 |     error("\"centroids\" must be specified");
353 |   }
354 |   if (!isReal(centroids_iter->second)) {
355 |     error("\"centroids\" must be a 2D real matrix");
356 |   }
357 |   SEXP dims = getAttrib(centroids_iter->second, R_DimSymbol);
358 |   if (length(dims) != 2 || INTEGER(dims)[1] != features_size) {
359 |     error("invalid \"centroids\"'s dimensions");
360 |   }
361 |   int clusters_size = INTEGER(dims)[0];
362 |   // The element count is computed in 64 bits: the int product may overflow.
363 |   std::unique_ptr<float[]> centroids(new float[static_cast<uint64_t>(clusters_size) * features_size]);
364 |   double *centroids_double = REAL(centroids_iter->second);
365 |   #pragma omp parallel for
366 |   for (uint64_t f = 0; f < features_size; f++) {
367 |     for (int64_t c = 0; c < clusters_size; c++) {
368 |       centroids[c * features_size + f] = centroids_double[f * clusters_size + c];
369 |     }
370 |   }
371 |   auto assignments_iter = kwargs.find("assignments");
372 |   if (assignments_iter == kwargs.end()) {
373 |     error("\"assignments\" must be specified");
374 |   }
375 |   if (!isInteger(assignments_iter->second)) {
376 |     error("\"assignments\" must be an integer vector");
377 |   }
378 |   if (static_cast<uint32_t>(length(assignments_iter->second)) != samples_size) {
379 |     error("invalid \"assignments\"'s length");
380 |   }
381 |   std::unique_ptr<uint32_t[]> assignments(new uint32_t[samples_size]);
382 |   int *assignments_obj_ptr = INTEGER(assignments_iter->second);
383 |   uint32_t *assignments_ptr = assignments.get();
384 |   #ifndef __APPLE__
385 |   #pragma omp parallel for simd
386 |   for (uint32_t i = 0; i < samples_size; i++) {
387 |     assignments_ptr[i] = assignments_obj_ptr[i] - 1;
388 |   }
389 |   #else
390 |   #pragma omp simd
391 |   for (uint32_t i = 0; i < samples_size; i++) {
392 |     assignments_ptr[i] = assignments_obj_ptr[i] - 1;
393 |   }
394 |   #endif
395 |   KMCUDADistanceMetric metric = parse_metric(kwargs);
396 |   int device = parse_device(kwargs);
397 |   int verbosity = parse_int(kwargs, "verbosity", 0);
398 |   std::unique_ptr<uint32_t[]> neighbors(new uint32_t[samples_size * k]);
399 |   auto result = knn_cuda(
400 |       k, metric, samples_size, features_size, clusters_size, device, -1, 0,
401 |       verbosity, samples.get(), centroids.get(), assignments_ptr, neighbors.get());
402 |   if (result != kmcudaSuccess) {
403 |     error("knn_cuda error %d %s%s", result,
404 |           kmcuda::statuses.find(result)->second, (verbosity > 0)? "" : "; "
405 |             "\"verbosity\" = 2 would reveal the details");
406 |   }
407 |   SEXP neighbors_obj = PROTECT(allocMatrix(INTSXP, samples_size, k));
408 |   int *neighbors_obj_ptr = INTEGER(neighbors_obj);
409 |   #pragma omp parallel for
410 |   for (int i = 0; i < k; i++) {
411 |     for (uint32_t s = 0; s < samples_size; s++) {
412 |       neighbors_obj_ptr[i * samples_size + s] = neighbors[s * k + i] + 1;
413 |     }
414 |   }
415 |   UNPROTECT(1);
416 |   return neighbors_obj;
417 | }
418 | 
419 | }  // extern "C"
420 | 


--------------------------------------------------------------------------------
/src/private.h:
--------------------------------------------------------------------------------
  1 | #ifndef KMCUDA_PRIVATE_H
  2 | #define KMCUDA_PRIVATE_H
  3 | 
  4 | #include "kmcuda.h"
  5 | #include <cinttypes>
  6 | #include <tuple>
  7 | #include "wrappers.h"
  8 | /// Float type for atomics - presumably double atomicAdd() needs SM60+ (confirm).
  9 | #if CUDA_ARCH >= 60
 10 | typedef double atomic_float;
 11 | #else
 12 | typedef float atomic_float;
 13 | #endif
 14 | 
 15 | /// Warp intrinsics: CUDA >= 9 maps them to the *_sync forms with a full mask.
 16 | #if CUDART_VERSION >= 9000
 17 | #define shfl(...) __shfl_sync(0xFFFFFFFF, __VA_ARGS__)
 18 | #define ballot(...) __ballot_sync(0xFFFFFFFF, __VA_ARGS__)
 19 | #define shfl_down(...) __shfl_down_sync(0xFFFFFFFF, __VA_ARGS__)
 20 | // Erases every "volatile" that follows; it was a register usage trick for CUDA 8
 21 | #define volatile
 22 | #else
 23 | #define shfl __shfl
 24 | #define ballot __ballot
 25 | #define shfl_down __shfl_down
 26 | #endif
 27 | 
 28 | /// printf() at the INFO level: active when the in-scope "verbosity" is > 0.
 29 | #define INFO(...) do { if (verbosity > 0) { printf(__VA_ARGS__); } } while (false)
 30 | /// printf() at the DEBUG level: active when the in-scope "verbosity" is > 1.
 31 | #define DEBUG(...) do { if (verbosity > 1) { printf(__VA_ARGS__); } } while (false)
 32 | /// printf() at the TRACE level: active when the in-scope "verbosity" is > 2.
 33 | #define TRACE(...) do { if (verbosity > 2) { printf(__VA_ARGS__); } } while (false)
 34 | /// Text of the last CUDA error; note that cudaGetLastError() also clears it.
 35 | #define CUERRSTR() cudaGetErrorString(cudaGetLastError())
 36 | 
 37 | /// Runs cuda_call; on failure logs the call (DEBUG) and the error text (INFO),
 38 | /// executes the extra statements and returns "ret" - hence a macro, not a function.
 39 | #define CUCH(cuda_call, ret, ...) \
 40 | do { \
 41 |   auto __res = cuda_call; \
 42 |   if (__res != cudaSuccess) { \
 43 |     DEBUG("%s\n", #cuda_call); \
 44 |     INFO("%s:%d -> %s\n", __FILE__, __LINE__, cudaGetErrorString(__res)); \
 45 |     __VA_ARGS__; \
 46 |     return ret; \
 47 |   } \
 48 | } while (false)
 49 | 
/// Checks whether the call returns 0; if not, executes arbitrary code and returns.
/// "return" forces this to be a macro.
/// @param call Expression evaluated exactly once.
/// @param ... Optional statements executed before returning.
/// The non-zero result of `call` itself becomes the return value of the
/// enclosing function.
#define RETERR(call, ...) \
do { \
  auto __res = call; \
  if (__res != 0) { \
    __VA_ARGS__; \
    return __res; \
  } \
} while (false)
 60 | 
/// Executes arbitrary code for every CUDA device.
/// Expects a `devs` container of device numbers in scope. Each iteration
/// selects the device with cudaSetDevice() first and exposes its number as
/// `dev`. The return value of cudaSetDevice() is not checked.
/// The code runs inside a for loop, so `continue` skips to the next device.
#define FOR_EACH_DEV(...) do { for (int dev : devs) { \
  cudaSetDevice(dev); \
  __VA_ARGS__; \
} } while(false)

/// Executes arbitrary code for every CUDA device and supplies the device index
/// into the scope.
/// The index is available as `devi` (position inside `devs`), the device
/// number itself is devs[devi]. The return value of cudaSetDevice() is not
/// checked.
#define FOR_EACH_DEVI(...) do { for (size_t devi = 0; devi < devs.size(); devi++) { \
  cudaSetDevice(devs[devi]); \
  __VA_ARGS__; \
} } while(false)
 73 | 
/// Invokes cudaDeviceSynchronize() on every CUDA device.
/// Does nothing when `devs` holds a single device (or is empty). On failure
/// the enclosing function returns kmcudaRuntimeError (through CUCH).
#define SYNC_ALL_DEVS do { \
if (devs.size() > 1) { \
FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), kmcudaRuntimeError)); \
} } while (false)
 79 | 
 80 | /// Copies memory from device to host asynchronously across all the CUDA devices.
 81 | #define CUMEMCPY_D2H_ASYNC(dst, dst_stride, src, src_offset, size) do { \
 82 |   FOR_EACH_DEVI(CUCH(cudaMemcpyAsync( \
 83 |       dst + dst_stride * devi, (src)[devi].get() + src_offset, \
 84 |       (size) * sizeof(typename std::remove_reference<decltype(src)>::type::value_type \
 85 |           ::element_type), \
 86 |       cudaMemcpyDeviceToHost), \
 87 |                      kmcudaMemoryCopyError)); \
 88 | } while(false)
 89 | 
 90 | /// Copies memory from device to host synchronously across all the CUDA devices.
 91 | #define CUMEMCPY_D2H(dst, src, size) do { \
 92 |   CUMEMCPY_D2H_ASYNC(dst, src, size); \
 93 |   FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), kmcudaMemoryCopyError)); \
 94 | } while(false)
 95 | 
 96 | /// Copies memory from host to device asynchronously across all the CUDA devices.
 97 | #define CUMEMCPY_H2D_ASYNC(dst, dst_offset, src, size) do { \
 98 |   FOR_EACH_DEVI(CUCH(cudaMemcpyAsync( \
 99 |       (dst)[devi].get() + dst_offset, src, \
100 |       (size) * sizeof(typename std::remove_reference<decltype(dst)>::type::value_type \
101 |           ::element_type), \
102 |       cudaMemcpyHostToDevice), \
103 |                      kmcudaMemoryCopyError)); \
104 | } while(false)
105 | 
106 | /// Copies memory from host to device synchronously across all the CUDA devices.
107 | #define CUMEMCPY_H2D(dst, src, size) do { \
108 |   CUMEMCPY_H2D_ASYNC(dst, src, size); \
109 |   FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), kmcudaMemoryCopyError)); \
110 | } while(false)
111 | 
/// Copies memory from device to device asynchronously across all the CUDA devices.
/// On every device `size` elements are copied from element `src_offset` of
/// (src)[devi] to element `dst_offset` of (dst)[devi], i.e. both buffers
/// belong to the same device.
/// @param dst Container of per-device destination smart pointers.
/// @param dst_offset Offset in elements inside each destination buffer.
/// @param src Container of per-device source smart pointers.
/// @param src_offset Offset in elements inside each source buffer.
/// @param size Number of elements to copy on each device; the element size
///             is taken from the element type of dst.
/// Both offsets are parenthesized in the expansion so that compound
/// expressions keep their meaning.
/// On failure the enclosing function returns kmcudaMemoryCopyError.
#define CUMEMCPY_D2D_ASYNC(dst, dst_offset, src, src_offset, size) do { \
  FOR_EACH_DEVI(CUCH(cudaMemcpyAsync( \
      (dst)[devi].get() + (dst_offset), (src)[devi].get() + (src_offset), \
      (size) * sizeof(typename std::remove_reference<decltype(dst)>::type::value_type \
          ::element_type), \
      cudaMemcpyDeviceToDevice), \
                     kmcudaMemoryCopyError)); \
} while(false)

/// Copies memory from device to device synchronously across all the CUDA devices.
/// Same as CUMEMCPY_D2D_ASYNC followed by cudaDeviceSynchronize() on every
/// device.
#define CUMEMCPY_D2D(dst, dst_offset, src, src_offset, size) do { \
  CUMEMCPY_D2D_ASYNC(dst, dst_offset, src, src_offset, size); \
  FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), kmcudaMemoryCopyError)); \
} while(false)
127 | 
/// Allocates memory on CUDA device and adds the created pointer to the list.
/// @param dest Container of smart pointers; the new pointer is appended with
///             emplace_back(). The element type of the smart pointer defines
///             the element size.
/// @param size Number of elements to allocate.
/// @param name String literal used in the log messages (it is concatenated
///             with the format strings, so it must be a literal).
/// @param dev Device number, used for the TRACE message only - the allocation
///            happens on whichever device is current.
/// On failure logs at INFO level and makes the enclosing function return
/// kmcudaMemoryAllocationFailure.
#define CUMALLOC_ONEN(dest, size, name, dev) do { \
  void *__ptr; \
  size_t __size = (size) * \
      sizeof(typename std::remove_reference<decltype(dest)>::type::value_type::element_type); \
  CUCH(cudaMalloc(&__ptr, __size), kmcudaMemoryAllocationFailure, \
       INFO("failed to allocate %zu bytes for " name "\n", __size)); \
  (dest).emplace_back(reinterpret_cast<typename std::remove_reference<decltype(dest)> \
      ::type::value_type::element_type *>(__ptr)); \
  TRACE("[%d] " name ": %p - %p (%zu)\n", dev, __ptr, \
        reinterpret_cast<char *>(__ptr) + __size, __size); \
} while(false)

/// Shortcut for CUMALLOC_ONEN which defines the log name.
/// The name is the stringified `dest` argument.
#define CUMALLOC_ONE(dest, size, dev) CUMALLOC_ONEN(dest, size, #dest, dev)

/// Allocates memory on all CUDA devices.
/// Runs CUMALLOC_ONEN once per device in `devs` (through FOR_EACH_DEV), so
/// `dest` grows by one pointer per device, in the order of `devs`.
#define CUMALLOCN(dest, size, name) do { \
  FOR_EACH_DEV(CUMALLOC_ONEN(dest, size, name, dev)); \
} while(false)

/// Allocates memory on all CUDA devices. Does not require the log name, infers
/// it from dest.
#define CUMALLOC(dest, size) CUMALLOCN(dest, size, #dest)
152 | 
/// Invokes cudaMemsetAsync() on all CUDA devices.
/// @param dst Container of per-device smart pointers (indexed by devi).
/// @param val Byte value passed to cudaMemsetAsync().
/// @param size Number of elements (not bytes) to fill on each device; the
///             element size is taken from the element type of dst. The
///             argument is parenthesized in the expansion, otherwise
///             a compound expression such as `a + b` would have only its last
///             term multiplied by the element size.
/// On failure the enclosing function returns kmcudaRuntimeError.
#define CUMEMSET_ASYNC(dst, val, size) do { \
  FOR_EACH_DEVI(CUCH(cudaMemsetAsync( \
      (dst)[devi].get(), val, \
      (size) * sizeof(typename std::remove_reference<decltype(dst)>::type::value_type::element_type)), \
                     kmcudaRuntimeError)); \
} while(false)

/// Invokes cudaMemset() on all CUDA devices.
/// Implemented as CUMEMSET_ASYNC followed by cudaDeviceSynchronize() on every
/// device.
#define CUMEMSET(dst, val, size) do { \
  CUMEMSET_ASYNC(dst, val, size); \
  FOR_EACH_DEV(CUCH(cudaDeviceSynchronize(), kmcudaRuntimeError)); \
} while(false)
166 | 
/// Executes the specified code on all devices except the given one - number devi.
/// Expects `devs` and `devi` in scope (e.g. inside FOR_EACH_DEVI) and exposes
/// the index of the other device as `odevi`.
/// Unlike FOR_EACH_DEV, it does not call cudaSetDevice().
#define FOR_OTHER_DEVS(...) do { \
  for (size_t odevi = 0; odevi < devs.size(); odevi++) { \
    if (odevi == devi) { \
      continue; \
    } \
    __VA_ARGS__; \
  } } while(false)
175 | 
/// Copies memory peer to peer (from one device to another device).
/// Expects `devs`, `devi` (source index) and `odevi` (destination index) in
/// scope, i.e. it is meant to be used inside FOR_OTHER_DEVS.
/// @param what Pointer to the container of per-device smart pointers.
/// @param offset Offset in elements, the same in the source and in the
///               destination buffers.
/// @param size Number of elements to copy.
/// The copy is issued with cudaMemcpyPeerAsync(); on failure the enclosing
/// function returns kmcudaMemoryCopyError.
#define CUP2P(what, offset, size) do { \
  CUCH(cudaMemcpyPeerAsync( \
      (*what)[odevi].get() + offset, devs[odevi], (*what)[devi].get() + offset, \
      devs[devi], (size) * sizeof(typename std::remove_reference<decltype(*what)>::type \
      ::value_type::element_type)), \
       kmcudaMemoryCopyError); \
} while(false)
184 | 
// KERNEL_SWITCH(f, args) instantiates the template `f` with the distance
// metric taken from the runtime variable `metric` and with the element type
// F, then appends `args` verbatim (e.g. a <<<grid, block>>>(...) launch).
// The alias F is declared right before the call, so `args` may refer to it.
// With CUDA_ARCH >= 60 the runtime flag `fp16x2` selects between float and
// half2; otherwise F is always float and `fp16x2` is not consulted.
// There is no default case: a metric value other than the two listed ones
// results in no call at all.
#if CUDA_ARCH >= 60
/// Bridges the code from single branch to multiple template branches.
#define KERNEL_SWITCH(f, ...) do { switch (metric) { \
  case kmcudaDistanceMetricL2: \
    if (!fp16x2) { \
        using F = float; \
        f<kmcudaDistanceMetricL2, float>__VA_ARGS__; \
    } else { \
        using F = half2; \
        f<kmcudaDistanceMetricL2, half2>__VA_ARGS__; \
    } \
    break; \
  case kmcudaDistanceMetricCosine: \
    if (!fp16x2) { \
        using F = float; \
        f<kmcudaDistanceMetricCosine, float>__VA_ARGS__; \
    } else { \
        using F = half2; \
        f<kmcudaDistanceMetricCosine, half2>__VA_ARGS__; \
    } \
    break; \
} } while(false)
#else
#define KERNEL_SWITCH(f, ...) do { switch (metric) { \
  case kmcudaDistanceMetricL2: \
    using F = float; \
    f<kmcudaDistanceMetricL2, float>__VA_ARGS__; \
    break; \
  case kmcudaDistanceMetricCosine: \
    using F = float; \
    f<kmcudaDistanceMetricCosine, float>__VA_ARGS__; \
    break; \
} } while(false)
#endif
219 | 
/// Host-side alternative to dupper(): divides size by each and rounds the
/// quotient up when the division leaves a remainder.
template <typename T>
inline T upper(T size, T each) {
  const T quotient = size / each;
  return (quotient * each == size) ? quotient : quotient + 1;
}
229 | 
/// Work split across the devices: one (offset, size) pair per device.
using plan_t = std::vector<std::tuple<uint32_t, uint32_t>>;

/// @brief Builds the split of the work across CUDA devices as (offset, size)
/// pairs. Every chunk except the last one is a multiple of the smallest
/// element count whose byte size is a multiple of 512, so chunk borders stay
/// 512-byte aligned and never cut an element.
/// @param amount The total work size - array size in elements.
/// @param size_each Element size in bytes. Thus the total memory size is
///                  amount * size_each.
/// @param devs The list with device numbers.
/// @return The list with offset-size pairs, measured in elements: empty for
/// no devices, the single pair (0, amount) for one device.
inline plan_t distribute(
    uint32_t amount, uint32_t size_each, const std::vector<int> &devs) {
  const size_t devs_count = devs.size();
  if (devs_count == 0) {
    return {};
  }
  if (devs_count == 1) {
    return {std::make_tuple(0, amount)};
  }
  const uint32_t alignment = 512;
  // Euclid's algorithm for gcd(size_each, alignment); a zero size_each
  // yields the alignment itself.
  uint32_t x = size_each, y = alignment;
  while (x != 0) {
    y %= x;
    if (y == 0) {
      break;
    }
    x %= y;
  }
  const uint32_t gcd = (x == 0) ? y : x;
  // The number of elements which occupy a whole multiple of the alignment.
  const uint32_t stride = alignment / gcd;
  plan_t plan;
  uint32_t offset = 0;
  // Every device but the last takes an equal share of what is left, rounded
  // to the nearest multiple of stride.
  for (size_t i = 0; i + 1 < devs_count; i++) {
    const float step = (amount - offset + .0f) / (devs_count - i);
    const uint32_t len = roundf(step / stride) * stride;
    plan.emplace_back(offset, len);
    offset += len;
  }
  // The last device takes the remainder.
  plan.emplace_back(offset, amount - offset);
  return plan;
}
274 | 
275 | /// Extracts the maximum split length from the device distribution plan.
276 | /// It calls distribute() and finds the maximum size.
277 | inline uint32_t max_distribute_length(
278 |     uint32_t amount, uint32_t size_each, const std::vector<int> &devs) {
279 |   auto plan = distribute(amount, size_each, devs);
280 |   uint32_t max_length = 0;
281 |   for (auto& p : plan) {
282 |     uint32_t length = std::get<1>(p);
283 |     if (length > max_length) {
284 |       max_length = length;
285 |     }
286 |   }
287 |   return max_length;
288 | }
289 | 
290 | /// Dumps the device split distribution to stdout.
291 | inline void print_plan(const char *name, const plan_t& plan) {
292 |   printf("%s: [", name);
293 |   bool first = true;
294 |   for (auto& p : plan) {
295 |     if (!first) {
296 |       printf(", ");
297 |     }
298 |     first = false;
299 |     printf("(%" PRIu32 ", %" PRIu32 ")", std::get<0>(p), std::get<1>(p));
300 |   }
301 |   printf("]\n");
302 | }
303 | 
// Declarations of the internal entry points which are implemented in the
// other translation units (see the "Defined in" notes). Every function which
// returns KMCUDAResult reports failures through that value.
// Common parameters: `devs` is the list of CUDA device numbers, `verbosity`
// is the log level consumed by INFO/DEBUG/TRACE, and udevptrs<T> arguments
// hold one device pointer per entry of `devs`.
// NOTE(review): `fp16x2` presumably switches the kernels to half2 arithmetic
// (see KERNEL_SWITCH) - confirm against the definitions.
extern "C" {

/// Copies the single sample within the same device. Defined in transpose.cu.
KMCUDAResult cuda_copy_sample_t(
    uint32_t index, uint32_t offset, uint32_t samples_size, uint16_t features_size,
    const std::vector<int> &devs, int verbosity, const udevptrs<float> &samples,
    udevptrs<float> *dest);

/// Copies the single sample from device to host. Defined in transpose.cu.
KMCUDAResult cuda_extract_sample_t(
    uint32_t index, uint32_t samples_size, uint16_t features_size,
    int verbosity, const float *samples, float *dest);

/// Transposes the samples matrix. Defined in transpose.cu.
KMCUDAResult cuda_transpose(
    uint32_t samples_size, uint16_t features_size, bool forward,
    const std::vector<int> &devs, int verbosity, udevptrs<float> *samples);

/// Invokes kmeans++ kernel. Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_plus_plus(
    uint32_t samples_size, uint32_t features_size, uint32_t cc,
    KMCUDADistanceMetric metric, const std::vector<int> &devs, int fp16x2,
    int verbosity, const udevptrs<float> &samples, udevptrs<float> *centroids,
    udevptrs<float> *dists, float *host_dists, atomic_float *dists_sum);

/// Invokes afk-mc2 kernel "calc_q". Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_afkmc2_calc_q(
    uint32_t samples_size, uint32_t features_size, uint32_t firstc,
    KMCUDADistanceMetric metric, const std::vector<int> &devs, int fp16x2,
    int verbosity, const udevptrs<float> &samples, udevptrs<float> *d_q,
    float *h_q);

/// Invokes afk-mc2 kernel "random_step". Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_afkmc2_random_step(
    uint32_t k, uint32_t m, uint64_t seed, int verbosity, const float *q,
    uint32_t *d_choices, uint32_t *h_choices, float *d_samples, float *h_samples);

/// Invokes afk-mc2 kernel "min_dist". Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_afkmc2_min_dist(
    uint32_t k, uint32_t m, KMCUDADistanceMetric metric, int fp16x2,
    int32_t verbosity, const float *samples, const uint32_t *choices,
    const float *centroids, float *d_min_dists, float *h_min_dists);

/// Initializes the CUDA environment, e.g. assigns values to symbols.
/// Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_setup(
    uint32_t samples_size, uint16_t features_size, uint32_t clusters_size,
    uint32_t yy_groups_size, const std::vector<int> &devs, int32_t verbosity);

/// Performs the centroids initialization. Defined in kmcuda.cc.
KMCUDAResult kmeans_init_centroids(
    KMCUDAInitMethod method, const void *init_params, uint32_t samples_size,
    uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric,
    uint32_t seed, const std::vector<int> &devs, int device_ptrs, int fp16x2,
    int32_t verbosity, const float *host_centroids,  const udevptrs<float> &samples,
    udevptrs<float> *dists, udevptrs<float> *aux, udevptrs<float> *centroids);

/// Complementing implementation of kmeans_cuda() which requires nvcc.
/// Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_yy(
    float tolerance, uint32_t yy_groups_size, uint32_t samples_size,
    uint32_t clusters_size, uint16_t features_size, KMCUDADistanceMetric metric,
    const std::vector<int> &devs, int fp16x2, int32_t verbosity,
    const udevptrs<float> &samples, udevptrs<float> *centroids,
    udevptrs<uint32_t> *ccounts, udevptrs<uint32_t> *assignments_prev,
    udevptrs<uint32_t> *assignments, udevptrs<uint32_t> *assignments_yy,
    udevptrs<float> *centroids_yy, udevptrs<float> *bounds_yy,
    udevptrs<float> *drifts_yy, udevptrs<uint32_t> *passed_yy);

/// Calculates the average distance between cluster members and the corresponding
/// centroid. Defined in kmeans.cu.
KMCUDAResult kmeans_cuda_calc_average_distance(
    uint32_t samples_size, uint16_t features_size,
    KMCUDADistanceMetric metric, const std::vector<int> &devs, int fp16x2,
    int32_t verbosity, const udevptrs<float> &samples,
    const udevptrs<float> &centroids, const udevptrs<uint32_t> &assignments,
    float *average_distance);

/// Prepares the CUDA environment for K-nn calculation, e.g., assigns values to
/// symbols. Defined in knn.cu.
KMCUDAResult knn_cuda_setup(
    uint32_t samples_size, uint16_t features_size, uint32_t clusters_size,
    const std::vector<int> &devs, int32_t verbosity);

/// Complementing implementation of knn_cuda() which requires nvcc.
/// Defined in knn.cu.
KMCUDAResult knn_cuda_calc(
    uint16_t k, uint32_t h_samples_size, uint32_t h_clusters_size,
    uint16_t h_features_size, KMCUDADistanceMetric metric,
    const std::vector<int> &devs, int fp16x2, int verbosity,
    const udevptrs<float> &samples, const udevptrs<float> &centroids,
    const udevptrs<uint32_t> &assignments, const udevptrs<uint32_t> &inv_asses,
    const udevptrs<uint32_t> &inv_asses_offsets, udevptrs<float> *distances,
    udevptrs<float>* sample_dists, udevptrs<float> *radiuses,
    udevptrs<uint32_t> *neighbors);

/// Looks at the amount of available shared memory and decides on the
/// performance critical property of knn_cuda_calc() - which of the two variants
/// to follow.
/// Defined in knn.cu; returns 1 or 2.
int knn_cuda_neighbors_mem_multiplier(uint16_t k, int dev, int verbosity);
}  // extern "C"
405 | 
406 | #endif  // KMCUDA_PRIVATE_H
407 | 


--------------------------------------------------------------------------------
/src/knn.cu:
--------------------------------------------------------------------------------
  1 | #include <cfloat>
  2 | 
  3 | #include "private.h"
  4 | #include "metric_abstraction.h"
  5 | #include "tricks.cuh"
  6 | 
// Tile edge used by knn_calc_cluster_distances() and
// knn_mirror_cluster_distances() (their local constant `bs`).
#define CLUSTER_DISTANCES_BLOCK_SIZE 512
// Length of the shared array in knn_calc_cluster_distances().
#define CLUSTER_DISTANCES_SHMEM 12288  // in float-s
// Block size of the knn_calc_cluster_radiuses() launch.
#define CLUSTER_RADIUSES_BLOCK_SIZE 512
// Length of the shared array in knn_calc_cluster_radiuses().
#define CLUSTER_RADIUSES_SHMEM 8192  // in float-s
// Block size assumed by knn_cuda_neighbors_mem_multiplier() when it estimates
// the shared memory demand of knn_assign_shmem().
#define KNN_BLOCK_SIZE_SHMEM 512
// NOTE(review): presumably the block size of the knn_assign_gmem() launch -
// the launch site is not a part of this excerpt.
#define KNN_BLOCK_SIZE_GMEM 1024

// The total number of samples; written by knn_cuda_setup().
__constant__ uint32_t d_samples_size;
// The number of clusters; written by knn_cuda_setup().
__constant__ uint32_t d_clusters_size;
// Counter which the knn_assign_* kernels atomically increase by the size of
// every cluster they scan; reset to zero by knn_cuda_setup().
__device__ unsigned long long int d_dists_calced;
 17 | 
/// Calculates the radius of every cluster in [offset, offset + length): the
/// maximum finalized distance from the centroid to the samples listed for the
/// cluster. One thread handles one cluster.
/// sample_dists must be zero-ed!
/// @param offset Index of the first cluster to process.
/// @param length Number of clusters to process; threads with a bigger global
///               index exit at once.
/// @param inv_asses Sample indices grouped by cluster.
/// @param inv_asses_offsets inv_asses[inv_asses_offsets[c]] ..
///                          inv_asses[inv_asses_offsets[c + 1] - 1] are the
///                          samples of cluster c.
/// @param centroids Cluster centers, d_features_size values per cluster.
/// @param samples Samples in the layout expected by METRIC::partial_t.
/// @param sample_dists Per-sample accumulator of the partial distances.
/// @param radiuses Output, one value per cluster; NAN if no distance above -1
///                 was found (e.g. the cluster has no samples).
template <KMCUDADistanceMetric M, typename F>
__global__ void knn_calc_cluster_radiuses(
    uint32_t offset, uint32_t length, const uint32_t *__restrict__ inv_asses,
    const uint32_t *__restrict__ inv_asses_offsets,
    const F *__restrict__ centroids, const F *__restrict__ samples,
    float *__restrict__ sample_dists, float *__restrict__ radiuses) {
  volatile uint32_t ci = blockIdx.x * blockDim.x + threadIdx.x;
  if (ci >= length) {
    return;
  }
  ci += offset;

  // stage 1 - accumulate partial distances for every sample
  __shared__ F shcents[CLUSTER_RADIUSES_SHMEM];
  // Each thread owns a private slice of cent_step elements of the shared
  // array, thus the centroid is processed in chunks of cent_step features.
  volatile const int cent_step = min(
      CLUSTER_RADIUSES_SHMEM / blockDim.x, static_cast<unsigned>(d_features_size));
  F *volatile const my_cent = shcents + cent_step * threadIdx.x;
  for (int cfi = 0; cfi < d_features_size; cfi += cent_step) {
    const int fsize = min(cent_step, d_features_size - cfi);
    // load the current chunk of this thread's centroid
    for (int f = 0; f < fsize; f++) {
      my_cent[f] = centroids[ci * d_features_size + cfi + f];
    }
    // add the contribution of the chunk to every sample of the cluster
    for (uint32_t ass = inv_asses_offsets[ci]; ass < inv_asses_offsets[ci + 1];
         ass++) {
       uint64_t sample = inv_asses[ass];  // uint64_t!
       sample_dists[sample] += METRIC<M, F>::partial_t(
           samples, my_cent, fsize, d_samples_size, cfi, sample);
    }
  }
  // stage 2 - find the maximum distance
  float max_dist = -1;
  for (uint32_t ass = inv_asses_offsets[ci]; ass < inv_asses_offsets[ci + 1];
       ass++) {
    float dist = METRIC<M, F>::finalize(sample_dists[inv_asses[ass]]);
    if (dist > max_dist) {
      max_dist = dist;
    }
  }
  // -1 is the "nothing found" marker
  radiuses[ci] = max_dist > -1? max_dist : NAN;
}
 59 | 
/// Calculates the distances between the centroids tile by tile; a tile is
/// CLUSTER_DISTANCES_BLOCK_SIZE x CLUSTER_DISTANCES_BLOCK_SIZE and each block
/// handles one tile (x, y) derived from blockIdx.x + offset.
/// distances must be zero-ed!
/// NOTE(review): the (x, y) formula looks like the enumeration of one triangle
/// of the tile grid, with knn_mirror_cluster_distances() filling the other
/// one - confirm. The code indexes threadIdx.x against tiles of `bs` elements,
/// so the launch presumably uses blockDim.x == bs - confirm at the launch site.
/// @param offset Added to blockIdx.x to get the linear tile index.
/// @param centroids Cluster centers, d_features_size values per cluster.
/// @param distances d_clusters_size x d_clusters_size matrix; the element at
///                  row * d_clusters_size + column is accumulated in place and
///                  then replaced with METRIC::finalize() of it.
template <KMCUDADistanceMetric M, typename F>
__global__ void knn_calc_cluster_distances(
    uint32_t offset, const F *__restrict__ centroids, float *distances) {
  volatile const uint32_t bi = blockIdx.x + offset;
  const uint32_t bs = CLUSTER_DISTANCES_BLOCK_SIZE;
  uint32_t x, y;
  const uint32_t n = dupper(d_clusters_size, bs);
  {
    // convert the linear tile index bi to the tile coordinates (x, y)
    float tmp = n + 0.5;
    float d = _sqrt(tmp * tmp - 2 * bi);
    y = tmp - d;
    x = bi + y + (n - y) * (n - y + 1) / 2 - n * (n + 1) / 2;
  }
  __shared__ F shcents[CLUSTER_DISTANCES_SHMEM];
  // the number of features per centroid which fit into the shared array at once
  const uint32_t fstep = CLUSTER_DISTANCES_SHMEM / bs;
  F *volatile my_cent = shcents + fstep * threadIdx.x;

  // stage 1 - accumulate distances
  for (uint16_t fpos = 0; fpos < d_features_size; fpos += fstep) {
    // the previous chunk must be fully consumed before it is overwritten
    __syncthreads();
    const uint16_t fsize = min(
        fstep, static_cast<uint32_t>(d_features_size - fpos));
    // every thread loads the chunk of one centroid of the tile column x
    uint32_t cbase = x * bs + threadIdx.x;
    if (cbase < d_clusters_size) {
      for (uint16_t f = 0; f < fsize; f++) {
        my_cent[f] = centroids[cbase * d_features_size + fpos + f];
      }
    }
    // the loaded chunks must be visible to the whole block
    __syncthreads();
    for (uint32_t ti = 0; ti < bs; ti++) {
      if ((y * bs + threadIdx.x) < d_clusters_size
          && (x * bs + ti) < d_clusters_size) {
        // the row centroid is read from the shared array when all the
        // clusters fit into a single tile, from the global memory otherwise
        auto other_cent = d_clusters_size <= bs?
            shcents + (y * bs + threadIdx.x) * fstep
            :
            centroids + (y * bs + threadIdx.x) * d_features_size + fpos;
        distances[(y * bs + threadIdx.x) * d_clusters_size + x * bs + ti] +=
            METRIC<M, F>::partial(other_cent, shcents + ti * fstep, fsize);
      }
    }
  }

  // stage 2 - finalize the distances
  for (uint32_t ti = 0; ti < bs; ti++) {
    if ((y * bs + threadIdx.x) < d_clusters_size
        && (x * bs + ti) < d_clusters_size) {
      uint32_t di = (y * bs + threadIdx.x) * d_clusters_size + x * bs + ti;
      float dist = distances[di];
      dist = METRIC<M, F>::finalize(dist);
      distances[di] = dist;
    }
  }
}
114 | 
115 | __global__ void knn_mirror_cluster_distances(float *__restrict__ distances) {
116 |   const uint32_t bs = CLUSTER_DISTANCES_BLOCK_SIZE;
117 |   uint32_t x, y;
118 |   const uint32_t n = dupper(d_clusters_size, bs);
119 |   {
120 |     float tmp = n + 0.5;
121 |     float d = _sqrt(tmp * tmp - 2 * blockIdx.x);
122 |     y = tmp - d;
123 |     x = blockIdx.x + y + (n - y) * (n - y + 1) / 2 - n * (n + 1) / 2;
124 |   }
125 |   for (uint32_t ti = 0; ti < bs; ti++) {
126 |     if ((y * bs + threadIdx.x) < d_clusters_size && (x * bs + ti) < d_clusters_size) {
127 |       distances[(x * bs + ti) * d_clusters_size + y * bs + threadIdx.x] =
128 |           distances[(y * bs + threadIdx.x) * d_clusters_size + x * bs + ti];
129 |     }
130 |   }
131 | }
132 | 
133 | FPATTR void push_sample(uint16_t k, float dist, uint32_t index, float *heap) {
134 |   uint16_t pos = 0;
135 |   while (true) {
136 |     float left, right;
137 |     bool left_le, right_le;
138 |     if ((2 * pos + 1) < k) {
139 |       left = heap[4 * pos + 2];
140 |       left_le = dist >= left;
141 |     } else {
142 |       left_le = true;
143 |     }
144 |     if ((2 * pos + 2) < k) {
145 |       right = heap[4 * pos + 4];
146 |       right_le = dist >= right;
147 |     } else {
148 |       right_le = true;
149 |     }
150 |     if (left_le && right_le) {
151 |       heap[2 * pos] = dist;
152 |       *reinterpret_cast<uint32_t *>(heap + 2 * pos + 1) = index;
153 |       break;
154 |     }
155 |     if (!left_le && !right_le) {
156 |       if (left <= right) {
157 |         heap[2 * pos] = right;
158 |         heap[2 * pos + 1] = heap[4 * pos + 5];
159 |         pos = 2 * pos + 2;
160 |       } else {
161 |         heap[2 * pos] = left;
162 |         heap[2 * pos + 1] = heap[4 * pos + 3];
163 |         pos = 2 * pos + 1;
164 |       }
165 |     } else if (left_le) {
166 |       heap[2 * pos] = right;
167 |       heap[2 * pos + 1] = heap[4 * pos + 5];
168 |       pos = 2 * pos + 2;
169 |     } else {
170 |       heap[2 * pos] = left;
171 |       heap[2 * pos + 1] = heap[4 * pos + 3];
172 |       pos = 2 * pos + 1;
173 |     }
174 |   }
175 | }
176 | 
/// Finds the k nearest neighbors of every sample in [offset, offset + length),
/// one thread per sample. The candidates are kept in a per-thread max-heap
/// which lives in the dynamic shared memory (2 * k floats per thread).
/// @param offset Index of the first sample to process.
/// @param length Number of samples to process.
/// @param k Number of neighbors to find.
/// @param cluster_distances d_clusters_size x d_clusters_size matrix of
///                          centroid distances; NaN entries are skipped.
/// @param cluster_radiuses Radius of each cluster.
/// @param samples Samples in the layout expected by METRIC::distance_t /
///                METRIC::distance_tt.
/// @param centroids Cluster centers, d_features_size values per cluster.
/// @param assignments Cluster index of each sample.
/// @param inv_asses Sample indices grouped by cluster.
/// @param inv_asses_offsets Borders of the clusters inside inv_asses.
/// @param neighbors Output, k indices per sample, addressed relative to
///                  `offset`.
template <KMCUDADistanceMetric M, typename F>
__global__ void knn_assign_shmem(
    uint32_t offset, uint32_t length, uint16_t k,
    const float *__restrict__ cluster_distances,
    const float *__restrict__ cluster_radiuses,
    const F *__restrict__ samples, const F *__restrict__ centroids,
    const uint32_t *assignments, const uint32_t *inv_asses,
    const uint32_t *inv_asses_offsets, uint32_t *neighbors) {
  volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x;
  if (sample >= length) {
    return;
  }
  sample += offset;
  volatile uint32_t mycls = assignments[sample];
  // the distance from the sample to the centroid of its own cluster
  volatile float mydist = METRIC<M, F>::distance_t(
      samples, centroids + mycls * d_features_size, d_samples_size, sample);
  extern __shared__ float buffer[];
  float *volatile mynearest = buffer + k * 2 * threadIdx.x;
  // the biggest distance in the heap, i.e. the current acceptance threshold
  volatile float mndist = FLT_MAX;
  for (int i = 0; i < static_cast<int>(k); i++) {
    mynearest[i * 2] = FLT_MAX;
  }
  // pass 1 - the samples of the own cluster
  uint32_t pos_start = inv_asses_offsets[mycls];
  uint32_t pos_finish = inv_asses_offsets[mycls + 1];
  atomicAdd(&d_dists_calced, pos_finish - pos_start);
  for (uint32_t pos = pos_start; pos < pos_finish; pos++) {
    uint64_t other_sample = inv_asses[pos];
    if (sample == other_sample) {
      continue;
    }
    float dist = METRIC<M, F>::distance_tt(
        samples, d_samples_size, sample, other_sample);
    if (dist <= mndist) {
      push_sample(k, dist, other_sample, mynearest);
      mndist = mynearest[0];
    }
  }
  // pass 2 - the other clusters, unless they can be pruned
  for (uint32_t cls = 0; cls < d_clusters_size; cls++) {
    if (cls == mycls) {
      continue;
    }
    float cdist = cluster_distances[cls * d_clusters_size + mycls];
    // NaN check
    if (cdist != cdist) {
      continue;
    }
    // skip the cluster if this value exceeds the current threshold
    float dist = cdist - mydist - cluster_radiuses[cls];
    if (dist > mndist) {
      continue;
    }
    uint32_t pos_start = inv_asses_offsets[cls];
    uint32_t pos_finish = inv_asses_offsets[cls + 1];
    atomicAdd(&d_dists_calced, pos_finish - pos_start);
    for (uint32_t pos = pos_start; pos < pos_finish; pos++) {
      uint64_t other_sample = inv_asses[pos];
      dist = METRIC<M, F>::distance_tt(
          samples, d_samples_size, sample, other_sample);
      if (dist <= mndist) {
        push_sample(k, dist, other_sample, mynearest);
        mndist = mynearest[0];
      }
    }
  }
  // Drain the heap: the root (the biggest distance) goes to the last free
  // output slot and is replaced with a (-1, UINT32_MAX) sentinel.
  for (int i = k - 1; i >= 0; i--) {
    neighbors[(sample - offset) * k + i] = reinterpret_cast<uint32_t*>(mynearest)[1];
    push_sample(k, -1, UINT32_MAX, mynearest);
  }
}
244 | 
/// Same search as knn_assign_shmem(), but the per-sample heap is stored
/// directly in the `neighbors` buffer, which therefore must provide 2 * k
/// 32-bit slots per sample. After the search each sample's k neighbor indices
/// are compacted to the first k slots of its 2 * k region.
/// NOTE(review): presumably followed by knn_assign_gmem_deinterleave1/2 to
/// remove the gaps between the samples - confirm at the launch site.
/// @param offset Index of the first sample to process.
/// @param length Number of samples to process.
/// @param k Number of neighbors to find.
/// @param cluster_distances d_clusters_size x d_clusters_size matrix of
///                          centroid distances; NaN entries are skipped.
/// @param cluster_radiuses Radius of each cluster.
/// @param samples Samples in the layout expected by METRIC::distance_t /
///                METRIC::distance_tt.
/// @param centroids Cluster centers, d_features_size values per cluster.
/// @param assignments Cluster index of each sample.
/// @param inv_asses Sample indices grouped by cluster.
/// @param inv_asses_offsets Borders of the clusters inside inv_asses.
/// @param neighbors Heap storage and output at the same time, addressed
///                  relative to `offset`.
template <KMCUDADistanceMetric M, typename F>
__global__ void knn_assign_gmem(
    uint32_t offset, uint32_t length, uint16_t k,
    const float *__restrict__ cluster_distances,
    const float *__restrict__ cluster_radiuses,
    const F *__restrict__ samples, const F *__restrict__ centroids,
    const uint32_t *assignments, const uint32_t *inv_asses,
    const uint32_t *inv_asses_offsets, uint32_t *neighbors) {
  volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x;
  if (sample >= length) {
    return;
  }
  sample += offset;
  volatile uint32_t mycls = assignments[sample];
  // the distance from the sample to the centroid of its own cluster
  volatile float mydist = METRIC<M, F>::distance_t(
      samples, centroids + mycls * d_features_size, d_samples_size, sample);
  float *volatile mynearest =
      reinterpret_cast<float*>(neighbors) + (sample - offset) * k * 2;
  // the biggest distance in the heap, i.e. the current acceptance threshold
  volatile float mndist = FLT_MAX;
  for (int i = 0; i < static_cast<int>(k); i++) {
    mynearest[i * 2] = FLT_MAX;
  }
  // pass 1 - the samples of the own cluster
  uint32_t pos_start = inv_asses_offsets[mycls];
  uint32_t pos_finish = inv_asses_offsets[mycls + 1];
  atomicAdd(&d_dists_calced, pos_finish - pos_start);
  for (uint32_t pos = pos_start; pos < pos_finish; pos++) {
    uint64_t other_sample = inv_asses[pos];
    if (sample == other_sample) {
      continue;
    }
    float dist = METRIC<M, F>::distance_tt(
        samples, d_samples_size, sample, other_sample);
    if (dist <= mndist) {
      push_sample(k, dist, other_sample, mynearest);
      mndist = mynearest[0];
    }
  }
  // pass 2 - the other clusters, unless they can be pruned
  for (uint32_t cls = 0; cls < d_clusters_size; cls++) {
    if (cls == mycls) {
      continue;
    }
    float cdist = cluster_distances[cls * d_clusters_size + mycls];
    // NaN check
    if (cdist != cdist) {
      continue;
    }
    // skip the cluster if this value exceeds the current threshold
    float dist = cdist - mydist - cluster_radiuses[cls];
    if (dist > mndist) {
      continue;
    }
    pos_start = inv_asses_offsets[cls];
    pos_finish = inv_asses_offsets[cls + 1];
    atomicAdd(&d_dists_calced, pos_finish - pos_start);
    for (uint32_t pos = pos_start; pos < pos_finish; pos++) {
      uint64_t other_sample = inv_asses[pos];
      dist = METRIC<M, F>::distance_tt(
          samples, d_samples_size, sample, other_sample);
      if (dist <= mndist) {
        push_sample(k, dist, other_sample, mynearest);
        mndist = mynearest[0];
      }
    }
  }
  // In-place heap sort: on every step the index of the root is remembered,
  // the last node of the shrinking heap is re-inserted from the root and the
  // remembered index lands in the index slot of the vacated last node.
  for (int i = 0; i < k; i++) {
    uint32_t imax = reinterpret_cast<uint32_t*>(mynearest)[1];
    push_sample(k - i - 1, mynearest[2 * k - 2 * i - 2],
                reinterpret_cast<uint32_t*>(mynearest)[2 * k - 2 * i - 1],
                mynearest);
    reinterpret_cast<uint32_t*>(mynearest)[2 * k - 2 * i - 1] = imax;
  }
  // drop the distances: move the indices from the odd slots to the first k slots
  for (int i = 0; i < k; i++) {
    reinterpret_cast<uint32_t*>(mynearest)[i] =
        reinterpret_cast<uint32_t*>(mynearest)[2 * i + 1];
  }
}
319 | 
320 | __global__ void knn_assign_gmem_deinterleave1(
321 |     uint32_t length, uint16_t k, uint32_t *neighbors) {
322 |   volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x;
323 |   if (sample >= length) {
324 |     return;
325 |   }
326 |   if (sample % 2 == 1) {
327 |     for (int i = 0; i < k; i++) {
328 |       neighbors[sample * k + i] = neighbors[sample * 2 * k + i];
329 |     }
330 |   } else {
331 |     for (int i = 0; i < k; i++) {
332 |       neighbors[(length + sample) * k + k + i] = neighbors[sample * 2 * k + i];
333 |     }
334 |   }
335 | }
336 | 
337 | __global__ void knn_assign_gmem_deinterleave2(
338 |     uint32_t length, uint16_t k, uint32_t *neighbors) {
339 |   volatile uint64_t sample = blockIdx.x * blockDim.x + threadIdx.x;
340 |   sample *= 2;
341 |   if (sample >= length) {
342 |     return;
343 |   }
344 |   for (int i = 0; i < k; i++) {
345 |     neighbors[sample * k + i] = neighbors[(length + sample) * k + k + i];
346 |   }
347 | }
348 | 
349 | extern "C" {
350 | 
351 | KMCUDAResult knn_cuda_setup(
352 |     uint32_t h_samples_size, uint16_t h_features_size, uint32_t h_clusters_size,
353 |     const std::vector<int> &devs, int32_t verbosity) {
354 |   FOR_EACH_DEV(
355 |     CUCH(cudaMemcpyToSymbol(d_samples_size, &h_samples_size, sizeof(h_samples_size)),
356 |          kmcudaMemoryCopyError);
357 |     CUCH(cudaMemcpyToSymbol(d_features_size, &h_features_size, sizeof(h_features_size)),
358 |          kmcudaMemoryCopyError);
359 |     CUCH(cudaMemcpyToSymbol(d_clusters_size, &h_clusters_size, sizeof(h_clusters_size)),
360 |          kmcudaMemoryCopyError);
361 |     uint64_t zero = 0;
362 |     CUCH(cudaMemcpyToSymbol(d_dists_calced, &zero, sizeof(d_dists_calced)),
363 |          kmcudaMemoryCopyError);
364 |   );
365 |   return kmcudaSuccess;
366 | }
367 | 
368 | int knn_cuda_neighbors_mem_multiplier(uint16_t k, int dev, int verbosity) {
369 |   cudaDeviceProp props;
370 |   cudaGetDeviceProperties(&props, dev);
371 |   int shmem_size = static_cast<int>(props.sharedMemPerBlock);
372 |   int needed_shmem_size = KNN_BLOCK_SIZE_SHMEM * 2 * k * sizeof(uint32_t);
373 |   if (needed_shmem_size > shmem_size) {
374 |     INFO("device #%d: needed shmem size %d > %d => using global memory\n",
375 |          dev, needed_shmem_size, shmem_size);
376 |     return 2;
377 |   }
378 |   return 1;
379 | }
380 | 
/// Runs the device side of the kNN computation on every device in `devs`.
///
/// The stages, in order:
///   1. knn_calc_cluster_radiuses over a per-device slice of the clusters,
///      followed by an exchange of the radiuses slices between devices.
///   2. knn_calc_cluster_distances over a per-device slice of distance
///      blocks, an exchange of the produced rows, then
///      knn_mirror_cluster_distances on every device.
///   3. The neighbor search: knn_assign_gmem plus the two deinterleave
///      kernels when knn_cuda_neighbors_mem_multiplier() returns 2,
///      knn_assign_shmem otherwise.
///   4. Reading d_dists_calced back from every device and logging the ratio
///      of calculated distances to h_samples_size squared.
///
/// @param k                 number of neighbors per sample.
/// @param h_samples_size    number of samples.
/// @param h_clusters_size   number of clusters (centroids).
/// @param h_features_size   number of features; used here only to weight the
///                          work distribution.
/// @param metric            not referenced directly; presumably consumed by
///                          KERNEL_SWITCH - confirm in the macro definition.
/// @param devs              devices to run on.
/// @param fp16x2            not referenced directly; presumably consumed by
///                          KERNEL_SWITCH to select the element type F.
/// @param verbosity         logging level; values above 1 print the plans.
/// @param samples, centroids, assignments, inv_asses, inv_asses_offsets
///                          per-device input buffers passed to the kernels.
/// @param distances, sample_dists, radiuses, neighbors
///                          per-device working/output buffers.
/// @return kmcudaSuccess when the end of the function is reached; the macros
///         presumably return an error code earlier on failure.
KMCUDAResult knn_cuda_calc(
    uint16_t k, uint32_t h_samples_size, uint32_t h_clusters_size,
    uint16_t h_features_size, KMCUDADistanceMetric metric,
    const std::vector<int> &devs, int fp16x2, int verbosity,
    const udevptrs<float> &samples, const udevptrs<float> &centroids,
    const udevptrs<uint32_t> &assignments, const udevptrs<uint32_t> &inv_asses,
    const udevptrs<uint32_t> &inv_asses_offsets, udevptrs<float> *distances,
    udevptrs<float>* sample_dists, udevptrs<float> *radiuses,
    udevptrs<uint32_t> *neighbors) {
  // plan[devi] is an (offset, length) pair: the slice of clusters handled by
  // device devi.
  auto plan = distribute(h_clusters_size, h_features_size * sizeof(float), devs);
  if (verbosity > 1) {
    print_plan("plan_calc_radiuses", plan);
  }
  INFO("calculating the cluster radiuses...\n");
  FOR_EACH_DEVI(
    uint32_t offset, length;
    std::tie(offset, length) = plan[devi];
    if (length == 0) {
      continue;
    }
    dim3 block(CLUSTER_RADIUSES_BLOCK_SIZE, 1, 1);
    // The grid covers all the clusters, not only this device's slice; the
    // slice is passed to the kernel as (offset, length).
    dim3 grid(upper(h_clusters_size, block.x), 1, 1);
    // Scratch buffer for the kernel: `distances` when it holds at least
    // h_samples_size elements' worth (clusters^2 >= samples), otherwise the
    // dedicated `sample_dists` buffer.
    // NOTE(review): the product is evaluated in 32 bits and wraps for
    // h_clusters_size >= 65536 - confirm it matches the allocation logic.
    float *dsd;
    if (h_clusters_size * h_clusters_size >= h_samples_size) {
      dsd = (*distances)[devi].get();
    } else {
      dsd = (*sample_dists)[devi].get();
    }
    KERNEL_SWITCH(knn_calc_cluster_radiuses, <<<grid, block>>>(
        offset, length, inv_asses[devi].get(), inv_asses_offsets[devi].get(),
        reinterpret_cast<const F*>(centroids[devi].get()),
        reinterpret_cast<const F*>(samples[devi].get()),
        dsd, (*radiuses)[devi].get()));
  );
  // Share each device's slice of the radiuses with the other devices
  // (presumably a peer-to-peer copy - see CUP2P).
  FOR_EACH_DEVI(
    uint32_t offset, length;
    std::tie(offset, length) = plan[devi];
    FOR_OTHER_DEVS(
      CUP2P(radiuses, offset, length);
    );
  );
  // `distances` was used as scratch space above in this case, so it is
  // cleared before the distance matrix is written.
  if (h_clusters_size * h_clusters_size >= h_samples_size) {
    CUMEMSET_ASYNC(*distances, 0, h_samples_size);
  }
  // Number of CLUSTER_DISTANCES_BLOCK_SIZE-wide blocks along one side of the
  // cluster distance matrix.
  uint32_t dist_blocks_dim = upper(
      h_clusters_size, static_cast<uint32_t>(CLUSTER_DISTANCES_BLOCK_SIZE));
  // In integer arithmetic this equals dist_blocks_dim * (dist_blocks_dim + 1) / 2,
  // i.e. the number of blocks in one triangle of the block grid including
  // the diagonal.
  uint32_t dist_blocks_n = (2 * dist_blocks_dim + 1) * (2 * dist_blocks_dim + 1) / 8;
  plan = distribute(dist_blocks_n, 512, devs);
  {  // align across CLUSTER_DISTANCES_BLOCK_SIZE horizontal boundaries
    // `align` carries the number of blocks appended to the previous slice so
    // that the next slice starts right after them.
    uint32_t align = 0;
    for (auto& p : plan) {
      uint32_t offset, length;
      std::tie(offset, length) = p;
      offset += align;
      std::get<0>(p) = offset;
      uint32_t n = dist_blocks_dim;
      // y is the block row which contains the end of the slice, obtained by
      // inverting the triangular numbering; x is the position of that end
      // inside row y.
      float tmp = n + 0.5;
      float d = sqrt(tmp * tmp - 2 * (offset + length));
      uint32_t y = tmp - d;
      uint32_t x = offset + length + (n - y) * (n - y + 1) / 2 - n * (n + 1) / 2;
      if (x > 0) {
        // The slice ends in the middle of a row: extend it to the row's end.
        align = n - y - x;
        std::get<1>(p) += align;
      }
    }
  }
  if (verbosity > 1) {
    print_plan("plan_calc_cluster_distances", plan);
  }
  INFO("calculating the centroid distance matrix...\n");
  FOR_EACH_DEVI(
    uint32_t offset, length;
    std::tie(offset, length) = plan[devi];
    if (length == 0) {
      continue;
    }
    dim3 block(CLUSTER_DISTANCES_BLOCK_SIZE, 1, 1);
    // One thread block per distance block of this device's slice.
    dim3 grid(length, 1, 1);
    KERNEL_SWITCH(knn_calc_cluster_distances, <<<grid, block>>>(
        offset, reinterpret_cast<const F*>(centroids[devi].get()),
        (*distances)[devi].get()));
  );
  // Exchange the matrix rows produced by each device.
  FOR_EACH_DEVI(
    uint32_t y_start, y_finish;
    {
      // Convert the slice boundaries from block numbers to block rows with
      // the same inverse formula as in the alignment step.
      uint32_t offset, length;
      std::tie(offset, length) = plan[devi];
      float tmp = dist_blocks_dim + 0.5;
      float d = sqrt(tmp * tmp - 2 * offset);
      y_start = tmp - d;
      d = sqrt(tmp * tmp - 2 * (offset + length));
      y_finish = tmp - d;
    }
    if (y_finish == y_start) {
      continue;
    }
    // Element range covered by block rows [y_start, y_finish), clamped to the
    // size of the clusters x clusters matrix.
    uint32_t p_offset = y_start * h_clusters_size * CLUSTER_DISTANCES_BLOCK_SIZE;
    uint32_t p_size = (y_finish - y_start) * h_clusters_size * CLUSTER_DISTANCES_BLOCK_SIZE;
    p_size = std::min(p_size, h_clusters_size * h_clusters_size - p_offset);
    FOR_OTHER_DEVS(
      CUP2P(distances, p_offset, p_size);
    );
  );
  // Every device processes all the dist_blocks_n blocks of its own copy
  // (presumably filling in the other triangle - see the kernel).
  FOR_EACH_DEVI(
    dim3 block(CLUSTER_DISTANCES_BLOCK_SIZE, 1, 1);
    dim3 grid(dist_blocks_n, 1, 1);
    knn_mirror_cluster_distances<<<grid, block>>>((*distances)[devi].get());
  );
  // From here on the plan splits the samples between the devices.
  plan = distribute(h_samples_size, h_features_size * sizeof(float), devs);
  INFO("searching for the nearest neighbors...\n");
  FOR_EACH_DEVI(
    uint32_t offset, length;
    std::tie(offset, length) = plan[devi];
    // A multiplier of 2 means that the shared memory of this device is too
    // small for k, so the global memory kernels are used.
    if (knn_cuda_neighbors_mem_multiplier(k, devs[devi], 1) == 2) {
      dim3 block(KNN_BLOCK_SIZE_GMEM, 1, 1);
      dim3 grid(upper(h_samples_size, block.x), 1, 1);
      KERNEL_SWITCH(knn_assign_gmem, <<<grid, block>>>(
          offset, length, k, (*distances)[devi].get(), (*radiuses)[devi].get(),
          reinterpret_cast<const F*>(samples[devi].get()),
          reinterpret_cast<const F*>(centroids[devi].get()),
          assignments[devi].get(), inv_asses[devi].get(),
          inv_asses_offsets[devi].get(), (*neighbors)[devi].get()));
      knn_assign_gmem_deinterleave1<<<grid, block>>>(
          length, k, (*neighbors)[devi].get());
      // The second pass handles two samples per thread index, hence the
      // grid is half the size.
      dim3 grid2(upper(h_samples_size, 2 * block.x), 1, 1);
      knn_assign_gmem_deinterleave2<<<grid2, block>>>(
          length, k, (*neighbors)[devi].get());
    } else {
      dim3 block(KNN_BLOCK_SIZE_SHMEM, 1, 1);
      dim3 grid(upper(h_samples_size, block.x), 1, 1);
      // The dynamic shared memory size matches the requirement checked in
      // knn_cuda_neighbors_mem_multiplier().
      KERNEL_SWITCH(
          knn_assign_shmem,
          <<<grid, block, KNN_BLOCK_SIZE_SHMEM * 2 * k * sizeof(uint32_t)>>>(
              offset, length, k, (*distances)[devi].get(), (*radiuses)[devi].get(),
              reinterpret_cast<const F*>(samples[devi].get()),
              reinterpret_cast<const F*>(centroids[devi].get()),
              assignments[devi].get(), inv_asses[devi].get(),
              inv_asses_offsets[devi].get(), (*neighbors)[devi].get()));
    }
  );
  // Sum the per-device counters of calculated distances for the statistics.
  uint64_t dists_calced = 0;
  FOR_EACH_DEV(
    uint64_t h_dists_calced = 0;
    CUCH(cudaMemcpyFromSymbol(&h_dists_calced, d_dists_calced, sizeof(h_dists_calced)),
         kmcudaMemoryCopyError);
    DEBUG("#%d dists_calced: %" PRIu64 "\n", dev, h_dists_calced);
    dists_calced += h_dists_calced;
  );
  // The reference value is the all-pairs count, computed in 64 bits.
  uint64_t max_dists_calced = static_cast<uint64_t>(h_samples_size) * h_samples_size;
  INFO("calculated %f of all the distances\n", (dists_calced + .0) / max_dists_calced);
  return kmcudaSuccess;
}
533 | 
534 | }  // extern "C"
535 | 


--------------------------------------------------------------------------------
/src/python.cc:
--------------------------------------------------------------------------------
  1 | /// avoid spurious trailing ‘%’ in format error
  2 | /// see https://stackoverflow.com/questions/8132399/how-to-printf-uint64-t-fails-with-spurious-trailing-in-format
  3 | #define __STDC_FORMAT_MACROS
  4 | #include <functional>
  5 | #include <memory>
  6 | #include <string>
  7 | #include <unordered_map>
  8 | #include <Python.h>
  9 | #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 10 | #include <numpy/arrayobject.h>
 11 | #include <cuda_runtime_api.h>
 12 | #include "kmcuda.h"
 13 | 
 14 | static char module_docstring[] =
 15 |     "This module provides fast K-means implementation which uses CUDA.";
 16 | static char kmeans_cuda_docstring[] =
 17 |     "Assigns cluster label to each sample and calculates cluster centers.";
 18 | static char knn_cuda_docstring[] =
 19 |     "Finds the nearest neighbors for every sample.";
 20 | 
 21 | static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs);
 22 | static PyObject *py_knn_cuda(PyObject *self, PyObject *args, PyObject *kwargs);
 23 | 
 24 | static PyMethodDef module_functions[] = {
 25 |   {"kmeans_cuda", reinterpret_cast<PyCFunction>(py_kmeans_cuda),
 26 |    METH_VARARGS | METH_KEYWORDS, kmeans_cuda_docstring},
 27 |   {"knn_cuda", reinterpret_cast<PyCFunction>(py_knn_cuda),
 28 |    METH_VARARGS | METH_KEYWORDS, knn_cuda_docstring},
 29 |   {NULL, NULL, 0, NULL}
 30 | };
 31 | 
 32 | extern "C" {
 33 | PyMODINIT_FUNC PyInit_libKMCUDA(void) {
 34 |   static struct PyModuleDef moduledef = {
 35 |       PyModuleDef_HEAD_INIT,
 36 |       "libKMCUDA",         /* m_name */
 37 |       module_docstring,    /* m_doc */
 38 |       -1,                  /* m_size */
 39 |       module_functions,    /* m_methods */
 40 |       NULL,                /* m_reload */
 41 |       NULL,                /* m_traverse */
 42 |       NULL,                /* m_clear */
 43 |       NULL,                /* m_free */
 44 |   };
 45 |   PyObject *m = PyModule_Create(&moduledef);
 46 |   if (m == NULL) {
 47 |     PyErr_SetString(PyExc_RuntimeError, "PyModule_Create() failed");
 48 |     return NULL;
 49 |   }
 50 |   // numpy
 51 |   import_array();
 52 |   PyObject_SetAttrString(m, "supports_fp16", CUDA_ARCH >= 60? Py_True : Py_False);
 53 |   return m;
 54 | }
 55 | }
 56 | 
 57 | template <typename O>
 58 | using pyobj_parent = std::unique_ptr<O, std::function<void(O*)>>;
 59 | 
 60 | template <typename O>
 61 | class _pyobj : public pyobj_parent<O> {
 62 |  public:
 63 |   _pyobj() : pyobj_parent<O>(
 64 |       nullptr, [](O *p){ if (p) Py_DECREF(p); }) {}
 65 |   explicit _pyobj(PyObject *ptr) : pyobj_parent<O>(
 66 |       reinterpret_cast<O *>(ptr), [](O *p){ if(p) Py_DECREF(p); }) {}
 67 |   void reset(PyObject *p) noexcept {
 68 |     pyobj_parent<O>::reset(reinterpret_cast<O*>(p));
 69 |   }
 70 | };
 71 | 
 72 | using pyobj = _pyobj<PyObject>;
 73 | using pyarray = _pyobj<PyArrayObject>;
 74 | 
 75 | static void set_cuda_malloc_error() {
 76 |   PyErr_SetString(PyExc_MemoryError, "Failed to allocate memory on GPU");
 77 | }
 78 | 
 79 | static void set_cuda_device_error() {
 80 |   PyErr_SetString(PyExc_ValueError, "No such CUDA device exists");
 81 | }
 82 | 
 83 | static void set_cuda_memcpy_error() {
 84 |   PyErr_SetString(PyExc_RuntimeError, "cudaMemcpy failed");
 85 | }
 86 | 
 87 | 
 88 | static bool get_metric(PyObject *metric_obj, KMCUDADistanceMetric *metric) {
 89 |   if (metric_obj == Py_None) {
 90 |     *metric = kmcudaDistanceMetricL2;
 91 |   } else if (!PyUnicode_Check(metric_obj)) {
 92 |     PyErr_SetString(
 93 |         PyExc_TypeError, "\"metric\" must be either None or string.");
 94 |     return false;
 95 |   } else {
 96 |     pyobj bytes(PyUnicode_AsASCIIString(metric_obj));
 97 |     auto immetric = kmcuda::metrics.find(PyBytes_AsString(bytes.get()));
 98 |     if (immetric == kmcuda::metrics.end()) {
 99 |       PyErr_SetString(
100 |           PyExc_ValueError,
101 |           "Unknown metric. Supported values are \"L2\" and \"cos\".");
102 |       return false;
103 |     }
104 |     *metric = immetric->second;
105 |   }
106 |   return true;
107 | }
108 | 
109 | static bool validate_features_size(uint32_t features_size) {
110 |   if (features_size > UINT16_MAX) {
111 |     char msg[128];
112 |     sprintf(msg, "\"samples\": more than %" PRIu32 " features is not supported",
113 |             features_size);
114 |     PyErr_SetString(PyExc_ValueError, msg);
115 |     return false;
116 |   }
117 |   return true;
118 | }
119 | 
120 | static bool get_samples(
121 |     PyObject *samples_obj, pyarray *samples_array, float **samples,
122 |     bool *fp16x2, uint32_t *samples_size, uint32_t *features_size) {
123 |   samples_array->reset(PyArray_FROM_OTF(
124 |       samples_obj, NPY_FLOAT16, NPY_ARRAY_IN_ARRAY));
125 |   if (!*samples_array) {
126 |     PyErr_Clear();
127 |     samples_array->reset(PyArray_FROM_OTF(
128 |         samples_obj, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY));
129 |     if (!*samples_array) {
130 |       PyErr_SetString(PyExc_TypeError,
131 |                       "\"samples\" must be a 2D float32 or float16 numpy array");
132 |       return false;
133 |     }
134 |   } else {
135 |     *fp16x2 = true;
136 |   }
137 |   auto ndims = PyArray_NDIM(samples_array->get());
138 |   if (ndims != 2) {
139 |     PyErr_SetString(PyExc_ValueError, "\"samples\" must be a 2D numpy array");
140 |     return false;
141 |   }
142 |   auto dims = PyArray_DIMS(samples_array->get());
143 |   *samples_size = static_cast<uint32_t>(dims[0]);
144 |   *features_size = static_cast<uint32_t>(dims[1]);
145 |   if (*fp16x2 && PyArray_TYPE(samples_array->get()) == NPY_FLOAT16) {
146 |     if (*features_size % 2 != 0) {
147 |       PyErr_SetString(PyExc_ValueError,
148 |                       "the number of features must be even in fp16 mode");
149 |       return false;
150 |     }
151 |     *features_size /= 2;
152 |   }
153 | 
154 |   *samples = reinterpret_cast<float *>(PyArray_DATA(
155 |       samples_array->get()));
156 |   return true;
157 | }
158 | 
159 | static PyObject *py_kmeans_cuda(PyObject *self, PyObject *args, PyObject *kwargs) {
160 |   uint32_t clusters_size = 0,
161 |            afkmc2_m = 0,
162 |            seed = static_cast<uint32_t>(time(NULL)),
163 |            device = 0;
164 |   int32_t verbosity = 0;
165 |   bool fp16x2 = false;
166 |   int adflag = 0;
167 |   float tolerance = .01, yinyang_t = .1;
168 |   PyObject *samples_obj, *init_obj = Py_None, *metric_obj = Py_None;
169 |   static const char *kwlist[] = {
170 |       "samples", "clusters", "tolerance", "init", "yinyang_t", "metric",
171 |       "average_distance", "seed", "device", "verbosity", NULL};
172 | 
173 |   /* Parse the input tuple */
174 |   if (!PyArg_ParseTupleAndKeywords(
175 |       args, kwargs, "OI|fOfOpIIi", const_cast<char**>(kwlist), &samples_obj,
176 |       &clusters_size, &tolerance, &init_obj, &yinyang_t, &metric_obj, &adflag,
177 |       &seed, &device, &verbosity)) {
178 |     return NULL;
179 |   }
180 | 
181 |   KMCUDAInitMethod init;
182 |   auto set_init = [&init](PyObject *obj) {
183 |     pyobj bytes(PyUnicode_AsASCIIString(obj));
184 |     auto iminit = kmcuda::init_methods.find(PyBytes_AsString(bytes.get()));
185 |     if (iminit == kmcuda::init_methods.end()) {
186 |       PyErr_SetString(
187 |           PyExc_ValueError,
188 |           "Unknown centroids initialization method. Supported values are "
189 |               "\"kmeans++\", \"random\" and <numpy array>.");
190 |       return false;
191 |     }
192 |     init = iminit->second;
193 |     return true;
194 |   };
195 | 
196 |   if (init_obj == Py_None) {
197 |     init = kmcudaInitMethodPlusPlus;
198 |   } else if (PyUnicode_Check(init_obj)) {
199 |     if (!set_init(init_obj)) {
200 |       return NULL;
201 |     }
202 |   } else if (PyTuple_Check(init_obj)) {
203 |     auto e1 = PyTuple_GetItem(init_obj, 0);
204 |     if (e1 == nullptr || e1 == Py_None) {
205 |       PyErr_SetString(
206 |           PyExc_ValueError, "centroid initialization method may not be null.");
207 |       return NULL;
208 |     }
209 |     if (!set_init(e1)) {
210 |       return NULL;
211 |     }
212 |     if (PyTuple_Size(init_obj) > 1 && init == kmcudaInitMethodAFKMC2) {
213 |       afkmc2_m = PyLong_AsUnsignedLong(PyTuple_GetItem(init_obj, 1));
214 |     }
215 |   } else {
216 |     init = kmcudaInitMethodImport;
217 |   }
218 |   KMCUDADistanceMetric metric;
219 |   if (!get_metric(metric_obj, &metric)) {
220 |     return NULL;
221 |   }
222 |   if (clusters_size < 2 || clusters_size == UINT32_MAX) {
223 |     PyErr_SetString(PyExc_ValueError, "\"clusters\" must be greater than 1 and "
224 |                                       "less than (1 << 32) - 1");
225 |     return NULL;
226 |   }
227 |   float *samples = nullptr, *centroids = nullptr;
228 |   uint32_t *assignments = nullptr;
229 |   uint32_t samples_size = 0, features_size = 0;
230 |   int device_ptrs = -1;
231 |   pyarray samples_array;
232 |   if (PyTuple_Check(samples_obj)) {
233 |     auto size = PyTuple_GET_SIZE(samples_obj);
234 |     if (size != 3 && size != 5) {
235 |       PyErr_SetString(PyExc_ValueError,
236 |                       "len(\"samples\") must be either 3 or 5");
237 |       return NULL;
238 |     }
239 |     auto member1 = PyTuple_GetItem(samples_obj, 0),
240 |          member2 = PyTuple_GetItem(samples_obj, 1),
241 |          member3 = PyTuple_GetItem(samples_obj, 2);
242 |     if (!member1 || !member2 || !member3) {
243 |       PyErr_SetString(PyExc_RuntimeError, "\"samples\" tuple contains nulls");
244 |       return NULL;
245 |     }
246 |     auto ull_ptr = PyLong_AsUnsignedLongLong(member1);
247 |     if (ull_ptr == NPY_MAX_ULONGLONG) {
248 |       PyErr_SetString(PyExc_ValueError,
249 |                       "\"samples\"[0] is not a pointer (integer)");
250 |       return NULL;
251 |     }
252 |     samples = reinterpret_cast<float *>(static_cast<uintptr_t>(ull_ptr));
253 |     if (samples == nullptr) {
254 |       PyErr_SetString(PyExc_ValueError, "\"samples\"[0] is null");
255 |       return NULL;
256 |     }
257 |     device_ptrs = PyLong_AsLong(member2);
258 |     if (!PyTuple_Check(member3) || PyTuple_GET_SIZE(member3) != 2) {
259 |       PyErr_SetString(PyExc_TypeError, "\"samples\"[2] must be a shape tuple");
260 |       return NULL;
261 |     }
262 |     samples_size = PyLong_AsUnsignedLong(PyTuple_GetItem(member3, 0));
263 |     features_size = PyLong_AsUnsignedLong(PyTuple_GetItem(member3, 1));
264 |     if (PyTuple_Size(member3) == 3) {
265 |       fp16x2 = PyObject_IsTrue(PyTuple_GetItem(member3, 2));
266 |     }
267 |     if (size == 5) {
268 |       auto member4 = PyTuple_GetItem(samples_obj, 3),
269 |            member5 = PyTuple_GetItem(samples_obj, 4);
270 |       if (!member4 || !member5) {
271 |         PyErr_SetString(PyExc_RuntimeError, "\"samples\" tuple contains nulls");
272 |         return NULL;
273 |       }
274 |       centroids = reinterpret_cast<float *>(static_cast<uintptr_t>(
275 |           PyLong_AsUnsignedLongLong(member4)));
276 |       assignments = reinterpret_cast<uint32_t *>(static_cast<uintptr_t>(
277 |           PyLong_AsUnsignedLongLong(member5)));
278 |     }
279 |   } else if (!get_samples(samples_obj, &samples_array, &samples,
280 |                           &fp16x2, &samples_size, &features_size)) {
281 |     return NULL;
282 |   }
283 |   if (!validate_features_size(features_size)) {
284 |     return NULL;
285 |   }
286 |   pyarray centroids_array, assignments_array;
287 |   if (device_ptrs < 0) {
288 |     npy_intp centroid_dims[] = {
289 |         clusters_size, fp16x2? features_size * 2 : features_size, 0};
290 |     centroids_array.reset(PyArray_EMPTY(
291 |         2, centroid_dims, fp16x2? NPY_FLOAT16 : NPY_FLOAT32, false));
292 |     centroids = reinterpret_cast<float *>(PyArray_DATA(
293 |         centroids_array.get()));
294 |     npy_intp assignments_dims[] = {samples_size, 0};
295 |     assignments_array.reset(PyArray_EMPTY(1, assignments_dims, NPY_UINT32, false));
296 |     assignments = reinterpret_cast<uint32_t *>(PyArray_DATA(
297 |         assignments_array.get()));
298 |   } else if (centroids == nullptr) {
299 |     if (cudaSetDevice(device_ptrs) != cudaSuccess) {
300 |       set_cuda_device_error();
301 |       return NULL;
302 |     }
303 |     if (cudaMalloc(reinterpret_cast<void **>(&centroids),
304 |                    clusters_size * features_size * sizeof(float)) != cudaSuccess) {
305 |       set_cuda_malloc_error();
306 |       return NULL;
307 |     }
308 |     if (cudaMalloc(reinterpret_cast<void **>(&assignments),
309 |                    static_cast<uint64_t>(samples_size) * sizeof(uint32_t)) != cudaSuccess) {
310 |       set_cuda_malloc_error();
311 |       return NULL;
312 |     }
313 |   }
314 |   if (init == kmcudaInitMethodImport) {
315 |     pyarray import_centroids_array(PyArray_FROM_OTF(
316 |         init_obj, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY));
317 |     if (import_centroids_array == NULL) {
318 |       PyErr_SetString(PyExc_TypeError, "\"init\" centroids must be a 2D numpy array");
319 |       return NULL;
320 |     }
321 |     auto ndims = PyArray_NDIM(import_centroids_array.get());
322 |     if (ndims != 2) {
323 |       PyErr_SetString(PyExc_ValueError, "\"init\" centroids must be a 2D numpy array");
324 |       return NULL;
325 |     }
326 |     auto dims = PyArray_DIMS(import_centroids_array.get());
327 |     if (static_cast<uint32_t>(dims[0]) != clusters_size) {
328 |       PyErr_SetString(PyExc_ValueError,
329 |                       "\"init\" centroids shape[0] does not match "
330 |                       "the number of clusters");
331 |       return NULL;
332 |     }
333 |     if (static_cast<uint32_t>(dims[1]) != features_size) {
334 |       PyErr_SetString(PyExc_ValueError,
335 |                       "\"init\" centroids shape[1] does not match "
336 |                           "the number of features");
337 |       return NULL;
338 |     }
339 |     auto icd = reinterpret_cast<float *>(PyArray_DATA(
340 |         import_centroids_array.get()));
341 |     auto size = clusters_size * features_size * sizeof(float);
342 |     if (device_ptrs < 0) {
343 |       memcpy(centroids, icd, size);
344 |     } else {
345 |       if (cudaSetDevice(device_ptrs) != cudaSuccess) {
346 |         set_cuda_device_error();
347 |         return NULL;
348 |       }
349 |       if (cudaMemcpy(centroids, icd, size, cudaMemcpyHostToDevice) != cudaSuccess) {
350 |         set_cuda_memcpy_error();
351 |         return NULL;
352 |       }
353 |     }
354 |   }
355 |   float average_distance = 0;
356 |   int result;
357 |   Py_BEGIN_ALLOW_THREADS
358 |   result = kmeans_cuda(
359 |       init, &afkmc2_m, tolerance, yinyang_t, metric, samples_size,
360 |       static_cast<uint16_t>(features_size), clusters_size, seed, device,
361 |       device_ptrs, fp16x2, verbosity, samples, centroids, assignments,
362 |       adflag? &average_distance : nullptr);
363 |   Py_END_ALLOW_THREADS
364 | 
365 |   switch (result) {
366 |     case kmcudaInvalidArguments:
367 |       PyErr_SetString(PyExc_ValueError,
368 |                       "Invalid arguments were passed to kmeans_cuda");
369 |       return NULL;
370 |     case kmcudaNoSuchDevice:
371 |       set_cuda_device_error();
372 |       return NULL;
373 |     case kmcudaMemoryAllocationFailure:
374 |       set_cuda_malloc_error();
375 |       return NULL;
376 |     case kmcudaMemoryCopyError:
377 |       set_cuda_memcpy_error();
378 |       return NULL;
379 |     case kmcudaRuntimeError:
380 |       PyErr_SetString(PyExc_AssertionError, "kmeans_cuda failure (bug?)");
381 |       return NULL;
382 |     case kmcudaSuccess:
383 |       if (device_ptrs < 0) {
384 |         if (!adflag) {
385 |           return Py_BuildValue(
386 |               "OO", centroids_array.get(), assignments_array.get());
387 |         } else {
388 |           return Py_BuildValue(
389 |               "OOf", centroids_array.get(), assignments_array.get(),
390 |               average_distance);
391 |         }
392 |       }
393 |       if (!adflag) {
394 |         return Py_BuildValue(
395 |             "KK",
396 |             static_cast<uint64_t>(reinterpret_cast<uintptr_t>(centroids)),
397 |             static_cast<uint64_t>(reinterpret_cast<uintptr_t>(assignments)));
398 |       } else {
399 |         return Py_BuildValue(
400 |             "KKf",
401 |             static_cast<uint64_t>(reinterpret_cast<uintptr_t>(centroids)),
402 |             static_cast<uint64_t>(reinterpret_cast<uintptr_t>(assignments)),
403 |             average_distance);
404 |       }
405 |     default:
406 |       PyErr_SetString(PyExc_AssertionError,
407 |                       "Unknown error code returned from kmeans_cuda");
408 |       return NULL;
409 |   }
410 | }
411 | 
412 | static PyObject *py_knn_cuda(PyObject *self, PyObject *args, PyObject *kwargs) {
413 |   uint32_t device = 0, k = 0;
414 |   int32_t verbosity = 0;
415 |   bool fp16x2 = false;
416 |   PyObject *samples_obj, *centroids_obj, *assignments_obj, *metric_obj = Py_None;
417 |   static const char *kwlist[] = {
418 |       "k", "samples", "centroids", "assignments", "metric", "device",
419 |       "verbosity", NULL};
420 | 
421 |   /* Parse the input tuple */
422 |   if (!PyArg_ParseTupleAndKeywords(
423 |       args, kwargs, "IOOO|OIi", const_cast<char**>(kwlist), &k, &samples_obj,
424 |       &centroids_obj, &assignments_obj, &metric_obj, &device, &verbosity)) {
425 |     return NULL;
426 |   }
427 | 
428 |   KMCUDADistanceMetric metric;
429 |   if (!get_metric(metric_obj, &metric)) {
430 |     return NULL;
431 |   }
432 |   if (k == 0 || k > UINT16_MAX) {
433 |     PyErr_SetString(PyExc_ValueError, "\"k\" must be greater than 0 and "
434 |         "less than (1 << 16)");
435 |     return NULL;
436 |   }
437 |   float *samples = nullptr, *centroids = nullptr;
438 |   uint32_t *assignments = nullptr, *neighbors = nullptr;
439 |   uint32_t samples_size = 0, features_size = 0, clusters_size = 0;
440 |   int device_ptrs = -1;
441 |   pyarray samples_array, centroids_array, assignments_array;
442 |   if (PyTuple_Check(samples_obj)) {
443 |     auto size = PyTuple_GET_SIZE(samples_obj);
444 |     if (size != 3 && size != 4) {
445 |       PyErr_SetString(PyExc_ValueError, "len(\"samples\") must be either 3 or 4");
446 |       return NULL;
447 |     }
448 |     auto member1 = PyTuple_GetItem(samples_obj, 0),
449 |         member2 = PyTuple_GetItem(samples_obj, 1),
450 |         member3 = PyTuple_GetItem(samples_obj, 2);
451 |     if (!member1 || !member2 || !member3) {
452 |       PyErr_SetString(PyExc_RuntimeError, "\"samples\" tuple contains nulls");
453 |       return NULL;
454 |     }
455 |     auto ull_ptr = PyLong_AsUnsignedLongLong(member1);
456 |     if (ull_ptr == NPY_MAX_ULONGLONG) {
457 |       PyErr_SetString(PyExc_ValueError,
458 |                       "\"samples\"[0] is not a pointer (integer)");
459 |       return NULL;
460 |     }
461 |     samples = reinterpret_cast<float *>(static_cast<uintptr_t>(ull_ptr));
462 |     if (samples == nullptr) {
463 |       PyErr_SetString(PyExc_ValueError, "\"samples\"[0] is null");
464 |       return NULL;
465 |     }
466 |     device_ptrs = PyLong_AsLong(member2);
467 |     if (!PyTuple_Check(member3) || PyTuple_GET_SIZE(member3) != 2) {
468 |       PyErr_SetString(PyExc_TypeError, "\"samples\"[2] must be a shape tuple");
469 |       return NULL;
470 |     }
471 |     samples_size = PyLong_AsUnsignedLong(PyTuple_GetItem(member3, 0));
472 |     features_size = PyLong_AsUnsignedLong(PyTuple_GetItem(member3, 1));
473 |     if (PyTuple_Size(member3) == 3) {
474 |       fp16x2 = PyObject_IsTrue(PyTuple_GetItem(member3, 2));
475 |     }
476 |     if (size == 4) {
477 |       auto member4 = PyTuple_GetItem(samples_obj, 3);
478 |       if (!member4) {
479 |         PyErr_SetString(PyExc_RuntimeError, "\"samples\" tuple contains nulls");
480 |         return NULL;
481 |       }
482 |       neighbors = reinterpret_cast<uint32_t *>(static_cast<uintptr_t>(
483 |           PyLong_AsUnsignedLongLong(member4)));
484 |     }
485 |     if (!PyTuple_Check(centroids_obj)) {
486 |       PyErr_SetString(PyExc_ValueError, "\"centroids\" must be a tuple of length 2");
487 |       return NULL;
488 |     }
489 |     size = PyTuple_GET_SIZE(centroids_obj);
490 |     if (size != 2) {
491 |       PyErr_SetString(PyExc_ValueError, "len(\"centroids\") must be 2");
492 |       return NULL;
493 |     }
494 |     member1 = PyTuple_GetItem(centroids_obj, 0);
495 |     member2 = PyTuple_GetItem(centroids_obj, 1);
496 |     if (!member1 || !member2) {
497 |       PyErr_SetString(PyExc_RuntimeError, "\"centroids\" tuple contains nulls");
498 |       return NULL;
499 |     }
500 |     ull_ptr = PyLong_AsUnsignedLongLong(member1);
501 |     if (ull_ptr == NPY_MAX_ULONGLONG) {
502 |       PyErr_SetString(PyExc_ValueError,
503 |                       "\"centroids\"[0] is not a pointer (integer)");
504 |       return NULL;
505 |     }
506 |     centroids = reinterpret_cast<float *>(static_cast<uintptr_t>(ull_ptr));
507 |     if (centroids == nullptr) {
508 |       PyErr_SetString(PyExc_ValueError, "\"centroids\"[0] is null");
509 |       return NULL;
510 |     }
511 |     clusters_size = PyLong_AsUnsignedLong(member2);
512 |     ull_ptr = PyLong_AsUnsignedLongLong(assignments_obj);
513 |     if (ull_ptr == NPY_MAX_ULONGLONG) {
514 |       PyErr_SetString(PyExc_ValueError,
515 |                       "\"assignments\" is not a pointer (integer)");
516 |       return NULL;
517 |     }
518 |     assignments = reinterpret_cast<uint32_t *>(static_cast<uintptr_t>(ull_ptr));
519 |   } else {
520 |     if (!get_samples(samples_obj, &samples_array, &samples,
521 |                      &fp16x2, &samples_size, &features_size)) {
522 |       return NULL;
523 |     }
524 |     if (fp16x2) {
525 |       centroids_array.reset(PyArray_FROM_OTF(
526 |           centroids_obj, NPY_FLOAT16, NPY_ARRAY_IN_ARRAY));
527 |     } else {
528 |       centroids_array.reset(PyArray_FROM_OTF(
529 |           centroids_obj, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY));
530 |     }
531 |     if (!centroids_array) {
532 |       PyErr_SetString(PyExc_TypeError,
533 |           "\"centroids\" must be a 2D float32 or float16 numpy array");
534 |       return NULL;
535 |     }
536 |     auto ndims = PyArray_NDIM(centroids_array.get());
537 |     if (ndims != 2) {
538 |       PyErr_SetString(PyExc_ValueError, "\"centroids\" must be a 2D numpy array");
539 |       return NULL;
540 |     }
541 |     auto dims = PyArray_DIMS(centroids_array.get());
542 |     clusters_size = static_cast<uint32_t>(dims[0]);
543 |     if (static_cast<uint32_t>(dims[1]) != features_size * (fp16x2? 2 : 1)) {
544 |       PyErr_SetString(
545 |           PyExc_ValueError, "\"centroids\" must have same number of features "
546 |                             "as \"samples\" (shape[-1])");
547 |       return NULL;
548 |     }
549 |     centroids = reinterpret_cast<float *>(PyArray_DATA(
550 |         centroids_array.get()));
551 |     assignments_array.reset(PyArray_FROM_OTF(
552 |         assignments_obj, NPY_UINT32, NPY_ARRAY_IN_ARRAY));
553 |     if (!assignments_array) {
554 |       PyErr_SetString(PyExc_TypeError,
555 |                       "\"assignments\" must be a 1D uint32 numpy array");
556 |       return NULL;
557 |     }
558 |     ndims = PyArray_NDIM(assignments_array.get());
559 |     if (ndims != 1) {
560 |       PyErr_SetString(PyExc_ValueError, "\"assignments\" must be a 1D numpy array");
561 |       return NULL;
562 |     }
563 |     dims = PyArray_DIMS(assignments_array.get());
564 |     if (static_cast<uint32_t>(dims[0]) != samples_size) {
565 |       PyErr_SetString(
566 |           PyExc_ValueError, "\"assignments\" must be of the same length as "
567 |                             "\"samples\"");
568 |       return NULL;
569 |     }
570 |     assignments = reinterpret_cast<uint32_t *>(PyArray_DATA(
571 |         assignments_array.get()));
572 |   }
573 |   if (!validate_features_size(features_size)) {
574 |     return NULL;
575 |   }
576 |   pyarray neighbors_array;
577 |   if (device_ptrs < 0) {
578 |     npy_intp neighbors_dims[] = {samples_size, k, 0};
579 |     neighbors_array.reset(PyArray_EMPTY(
580 |         2, neighbors_dims, NPY_UINT32, false));
581 |     neighbors = reinterpret_cast<uint32_t *>(PyArray_DATA(
582 |         neighbors_array.get()));
583 |   } else if (neighbors == nullptr) {
584 |     if (cudaSetDevice(device_ptrs) != cudaSuccess) {
585 |       set_cuda_device_error();
586 |       return NULL;
587 |     }
588 |     if (cudaMalloc(reinterpret_cast<void **>(&neighbors),
589 |                    static_cast<uint64_t>(samples_size) * k * sizeof(float)) != cudaSuccess) {
590 |       set_cuda_malloc_error();
591 |       return NULL;
592 |     }
593 |   }
594 |   int result;
595 |   Py_BEGIN_ALLOW_THREADS
596 |     result = knn_cuda(k, metric, samples_size, features_size, clusters_size,
597 |                       device, device_ptrs, fp16x2, verbosity,
598 |                       samples, centroids, assignments, neighbors);
599 |   Py_END_ALLOW_THREADS
600 | 
601 |   switch (result) {
602 |     case kmcudaInvalidArguments:
603 |       PyErr_SetString(PyExc_ValueError,
604 |                       "Invalid arguments were passed to knn_cuda");
605 |       return NULL;
606 |     case kmcudaNoSuchDevice:
607 |       set_cuda_device_error();
608 |       return NULL;
609 |     case kmcudaMemoryAllocationFailure:
610 |       set_cuda_malloc_error();
611 |       return NULL;
612 |     case kmcudaMemoryCopyError:
613 |       set_cuda_memcpy_error();
614 |       return NULL;
615 |     case kmcudaRuntimeError:
616 |       PyErr_SetString(PyExc_AssertionError, "knn_cuda failure (bug?)");
617 |       return NULL;
618 |     case kmcudaSuccess:
619 |       if (device_ptrs < 0) {
620 |         return Py_BuildValue(
621 |             "O",
622 |             reinterpret_cast<PyObject*>(neighbors_array.get()));
623 |       }
624 |       return Py_BuildValue(
625 |           "K",
626 |           static_cast<unsigned long long>(reinterpret_cast<uintptr_t>(neighbors)));
627 |     default:
628 |       PyErr_SetString(PyExc_AssertionError,
629 |                       "Unknown error code returned from knn_cuda");
630 |       return NULL;
631 |   }
632 | }
633 | 


--------------------------------------------------------------------------------
/src/kmcuda.cc:
--------------------------------------------------------------------------------
  1 | #include <cstdio>
  2 | #include <cstdlib>
  3 | #include <cstring>
  4 | #include <cinttypes>
  5 | #include <cmath>
  6 | #include <cassert>
  7 | #include <algorithm>
  8 | #include <map>
  9 | #include <memory>
 10 | 
 11 | #include <cuda_runtime_api.h>
 12 | #ifdef PROFILE
 13 | #include <cuda_profiler_api.h>
 14 | #endif
 15 | 
 16 | #include "private.h"
 17 | 
 18 | /// Used in kmeans_cuda() to validate function arguments.
 19 | static KMCUDAResult check_kmeans_args(
 20 |     float tolerance,
 21 |     float yinyang_t,
 22 |     uint32_t samples_size,
 23 |     uint16_t features_size,
 24 |     uint32_t clusters_size,
 25 |     uint32_t device,
 26 |     bool fp16x2,
 27 |     int verbosity,
 28 |     const float *samples,
 29 |     float *centroids,
 30 |     uint32_t *assignments) {
 31 |   if (clusters_size < 2 || clusters_size == UINT32_MAX) {
 32 |     return kmcudaInvalidArguments;
 33 |   }
 34 |   if (features_size == 0) {
 35 |     return kmcudaInvalidArguments;
 36 |   }
 37 |   if (samples_size < clusters_size) {
 38 |     return kmcudaInvalidArguments;
 39 |   }
 40 |   int devices = 0;
 41 |   cudaGetDeviceCount(&devices);
 42 |   if (device > (1u << devices)) {
 43 |     return kmcudaNoSuchDevice;
 44 |   }
 45 |   if (samples == nullptr || centroids == nullptr || assignments == nullptr) {
 46 |     return kmcudaInvalidArguments;
 47 |   }
 48 |   if (tolerance < 0 || tolerance > 1) {
 49 |     return kmcudaInvalidArguments;
 50 |   }
 51 |   if (yinyang_t < 0 || yinyang_t > 0.5) {
 52 |     return kmcudaInvalidArguments;
 53 |   }
 54 | #if CUDA_ARCH < 60
 55 |   if (fp16x2) {
 56 |     INFO("CUDA device arch %d does not support fp16\n", CUDA_ARCH);
 57 |     return kmcudaInvalidArguments;
 58 |   }
 59 | #endif
 60 |   return kmcudaSuccess;
 61 | }
 62 | 
 63 | static std::vector<int> setup_devices(uint32_t device, int device_ptrs, int verbosity) {
 64 |   std::vector<int> devs;
 65 |   if (device == 0) {
 66 |     cudaGetDeviceCount(reinterpret_cast<int *>(&device));
 67 |     if (device == 0) {
 68 |       return devs;
 69 |     }
 70 |     device = (1u << device) - 1;
 71 |   }
 72 |   for (int dev = 0; device; dev++) {
 73 |     if (device & 1) {
 74 |       devs.push_back(dev);
 75 |       if (cudaSetDevice(dev) != cudaSuccess) {
 76 |         INFO("failed to cudaSetDevice(%d)\n", dev);
 77 |         devs.pop_back();
 78 |       }
 79 |       cudaDeviceProp props;
 80 |       auto err = cudaGetDeviceProperties(&props, dev);
 81 |       if (err != cudaSuccess) {
 82 |         INFO("failed to cudaGetDeviceProperties(%d): %s\n",
 83 |              dev, cudaGetErrorString(err));
 84 |         devs.pop_back();
 85 |       }
 86 |       if (props.major < (CUDA_ARCH / 10) ||
 87 |           (props.major == (CUDA_ARCH / 10) && props.minor < (CUDA_ARCH % 10))) {
 88 |         INFO("compute capability mismatch for device %d: wanted %d.%d, have "
 89 |              "%d.%d\n>>>> you may want to build kmcuda with -DCUDA_ARCH=%d "
 90 |              "(refer to \"Building\" in README.md)\n",
 91 |              dev, CUDA_ARCH / 10, CUDA_ARCH % 10, props.major, props.minor,
 92 |              props.major * 10 + props.minor);
 93 |         devs.pop_back();
 94 |       }
 95 |     }
 96 |     device >>= 1;
 97 |   }
 98 |   bool p2p_dp = (device_ptrs >= 0 && !(device & (1 << device_ptrs)));
 99 |   if (p2p_dp) {
100 |     // enable p2p for device_ptrs which is not in the devices list
101 |     devs.push_back(device_ptrs);
102 |   }
103 |   if (devs.size() > 1) {
104 |     for (int dev1 : devs) {
105 |       for (int dev2 : devs) {
106 |         if (dev1 <= dev2) {
107 |           continue;
108 |         }
109 |         int access = 0;
110 |         cudaDeviceCanAccessPeer(&access, dev1, dev2);
111 |         if (!access) {
112 |           INFO("warning: p2p %d <-> %d is impossible\n", dev1, dev2);
113 |         }
114 |       }
115 |     }
116 |     for (int dev : devs) {
117 |       cudaSetDevice(dev);
118 |       for (int odev : devs) {
119 |         if (dev == odev) {
120 |           continue;
121 |         }
122 |         auto err = cudaDeviceEnablePeerAccess(odev, 0);
123 |         if (err == cudaErrorPeerAccessAlreadyEnabled) {
124 |           DEBUG("p2p is already enabled on gpu #%d\n", dev);
125 |         } else if (err != cudaSuccess) {
126 |           INFO("warning: failed to enable p2p on gpu #%d: %s\n", dev,
127 |                cudaGetErrorString(err));
128 |         }
129 |       }
130 |     }
131 |   }
132 |   if (p2p_dp) {
133 |     // remove device_ptrs - it is not in the devices list
134 |     devs.pop_back();
135 |   }
136 |   return devs;
137 | }
138 | 
/// Fills "dest" with device pointers holding length * size_each elements of T
/// and schedules the copy of "source" into the newly allocated ones.
///
/// @param length,size_each their product is the number of T elements.
/// @param device_ptrs negative when "source" is a host pointer; otherwise the
///                    index of the device "source" is passed to
///                    cudaMemcpyPeerAsync() as the source device.
/// @param devs devices to populate (iterated through FOR_EACH_DEVI).
/// @param verbosity not referenced directly here; presumably consumed by the
///                  project macros - TODO confirm.
/// @param source data to distribute.
/// @param dest receives the pointers; "source" itself is stored (flagged with
///             "true") for the device equal to device_ptrs.
/// @param origin_devi_ptr optional; receives the index inside "devs" of the
///                        device equal to device_ptrs, or -1 if there is none.
/// @return kmcudaSuccess; failures are reported through the CUCH/CUMALLOC_ONE
///         macros (kmcudaMemoryCopyError is passed to CUCH for peer copies).
/// @note The copies are issued with *Async calls and no synchronization is
///       performed in this function.
template <typename T>
static KMCUDAResult init_udevptrs(
    uint32_t length, uint32_t size_each,
    int32_t device_ptrs, const std::vector<int> &devs, int verbosity,
    const T *source, udevptrs<T> *dest, int32_t *origin_devi_ptr = nullptr) {
  // Widen before multiplying so that the element count cannot wrap in 32 bits.
  size_t device_size = static_cast<size_t>(length) * size_each;
  int32_t origin_devi = -1;
  // Reuse the caller's pointer on the origin device, allocate on the others.
  FOR_EACH_DEVI(
      if (devs[devi] == device_ptrs) {
        dest->emplace_back(const_cast<T*>(source), true);
        origin_devi = devi;
      } else {
        CUMALLOC_ONE(*dest, device_size, devs[devi]);
      }
  );
  if (origin_devi_ptr != nullptr) {
    *origin_devi_ptr = origin_devi;
  }
  if (device_ptrs < 0) {
    // "source" lives on the host.
    CUMEMCPY_H2D_ASYNC(*dest, 0, source, device_size);
  } else {
    // "source" lives on device_ptrs: peer-copy it to every other device.
    FOR_EACH_DEVI(
        if (static_cast<int32_t>(devi) != origin_devi) {
          CUCH(cudaMemcpyPeerAsync(
              (*dest)[devi].get(), devs[devi], source,
              device_ptrs, device_size * sizeof(T)),
               kmcudaMemoryCopyError);
        }
    );
  }
  return kmcudaSuccess;
}
171 | 
172 | static KMCUDAResult print_memory_stats(const std::vector<int> &devs) {
173 |   FOR_EACH_DEV(
174 |     size_t free_bytes, total_bytes;
175 |     if (cudaMemGetInfo(&free_bytes, &total_bytes) != cudaSuccess) {
176 |       return kmcudaRuntimeError;
177 |     }
178 |     printf("GPU #%d memory: used %zu bytes (%.1f%%), free %zu bytes, "
179 |            "total %zu bytes\n",
180 |            dev, total_bytes - free_bytes,
181 |            (total_bytes - free_bytes) * 100.0 / total_bytes,
182 |            free_bytes, total_bytes);
183 |   );
184 |   return kmcudaSuccess;
185 | }
186 | 
187 | extern "C" {
188 | 
189 | KMCUDAResult kmeans_init_centroids(
190 |     KMCUDAInitMethod method, const void *init_params, uint32_t samples_size,
191 |     uint16_t features_size, uint32_t clusters_size, KMCUDADistanceMetric metric,
192 |     uint32_t seed, const std::vector<int> &devs, int device_ptrs, int fp16x2,
193 |     int32_t verbosity, const float *host_centroids, const udevptrs<float> &samples,
194 |     udevptrs<float> *dists, udevptrs<float> *aux, udevptrs<float> *centroids) {
195 |   if (metric == kmcudaDistanceMetricCosine && !fp16x2) {
196 |     // 3 sanity checks, not implemented for fp16x2
197 |     float *probe;
198 |     CUCH(cudaMallocManaged(reinterpret_cast<void**>(&probe),
199 |                            static_cast<uint32_t>(features_size) * sizeof(float)),
200 |          kmcudaMemoryAllocationFailure);
201 |     unique_devptr<float> managed(probe);
202 |     cudaSetDevice(devs[0]);
203 |     for (uint32_t s : {0u, samples_size / 2, samples_size - 1}) {
204 |       RETERR(cuda_extract_sample_t(
205 |           s, samples_size, features_size, verbosity, samples[0].get(), probe));
206 |       double norm = 0;
207 |       #pragma omp simd
208 |       for (uint16_t i = 0; i < features_size; i++) {
209 |         float v = probe[i];
210 |         norm += v * v;
211 |       }
212 |       const float high = 1.00001;
213 |       const float low = 0.99999;
214 |       if (norm > high || norm < low) {
215 |         INFO("error: angular distance: samples[%" PRIu32 "] has L2 norm = %f "
216 |              "which is outside [%f, %f]\n", s, norm, low, high);
217 |         return kmcudaInvalidArguments;
218 |       }
219 |     }
220 |   }
221 | 
222 |   srand(seed);
223 |   switch (method) {
224 |     case kmcudaInitMethodImport:
225 |       if (device_ptrs < 0) {
226 |         CUMEMCPY_H2D_ASYNC(*centroids, 0, host_centroids,
227 |                            clusters_size * features_size);
228 |       } else {
229 |         int32_t origin_devi = -1;
230 |         FOR_EACH_DEVI(
231 |           if (devs[devi] == device_ptrs) {
232 |             origin_devi = devi;
233 |           }
234 |         );
235 |         FOR_EACH_DEVI(
236 |           if (static_cast<int32_t>(devi) != origin_devi) {
237 |             CUCH(cudaMemcpyPeerAsync(
238 |                 (*centroids)[devi].get(), devs[devi], host_centroids,
239 |                 device_ptrs, clusters_size * features_size * sizeof(float)),
240 |                  kmcudaMemoryCopyError);
241 |           }
242 |         );
243 |       }
244 |       break;
245 |     case kmcudaInitMethodRandom: {
246 |       INFO("randomly picking initial centroids...\n");
247 |       std::vector<uint32_t> chosen(samples_size);
248 |       #pragma omp parallel for
249 |       for (uint32_t s = 0; s < samples_size; s++) {
250 |         chosen[s] = s;
251 |       }
252 |       std::random_shuffle(chosen.begin(), chosen.end());
253 |       DEBUG("shuffle complete, copying to device(s)...\n");
254 |       for (uint32_t c = 0; c < clusters_size; c++) {
255 |         RETERR(cuda_copy_sample_t(
256 |             chosen[c], c * features_size, samples_size, features_size, devs,
257 |             verbosity, samples, centroids));
258 |       }
259 |       SYNC_ALL_DEVS;
260 |       break;
261 |     }
262 |     case kmcudaInitMethodPlusPlus: {
263 |       float smoke = NAN;
264 |       uint32_t first_index;
265 |       while (smoke != smoke) {
266 |         first_index = rand() % samples_size;
267 |         cudaSetDevice(devs[0]);
268 |         CUCH(cudaMemcpy(&smoke, samples[0].get() + first_index, sizeof(float),
269 |                         cudaMemcpyDeviceToHost), kmcudaMemoryCopyError);
270 |       }
271 |       RETERR(cuda_copy_sample_t(
272 |             first_index, 0, samples_size, features_size, devs, verbosity,
273 |             samples, centroids));
274 |       INFO("performing kmeans++...\n");
275 |       std::unique_ptr<float[]> host_dists(new float[samples_size]);
276 |       if (verbosity > 2) {
277 |         printf("kmeans++: dump %" PRIu32 " %" PRIu32 " %p\n",
278 |                samples_size, features_size, host_dists.get());
279 |         FOR_EACH_DEVI(
280 |           printf("kmeans++: dev #%d: %p %p %p\n", devs[devi],
281 |                  samples[devi].get(), (*centroids)[devi].get(),
282 |                  (*dists)[devi].get());
283 |         );
284 |       }
285 |       for (uint32_t i = 1; i < clusters_size; i++) {
286 |         if (verbosity > 1 || (verbosity > 0 && (
287 |               clusters_size < 100 || i % (clusters_size / 100) == 0))) {
288 |           printf("\rstep %d", i);
289 |           fflush(stdout);
290 |         }
291 |         atomic_float dist_sum = 0;
292 |         RETERR(kmeans_cuda_plus_plus(
293 |             samples_size, features_size, i, metric, devs, fp16x2, verbosity,
294 |             samples, centroids, dists, host_dists.get(), &dist_sum),
295 |                DEBUG("\nkmeans_cuda_plus_plus failed\n"));
296 |         if (dist_sum != dist_sum) {
297 |           assert(dist_sum == dist_sum);
298 |           INFO("\ninternal bug inside kmeans_init_centroids: dist_sum is NaN\n");
299 |         }
300 |         double choice = ((rand() + .0) / RAND_MAX);
301 |         uint32_t choice_approx = choice * samples_size;
302 |         double choice_sum = choice * dist_sum;
303 |         uint32_t j;
304 |         if (choice_approx < 100) {
305 |           double dist_sum2 = 0;
306 |           for (j = 0; j < samples_size && dist_sum2 < choice_sum; j++) {
307 |             dist_sum2 += host_dists[j];
308 |           }
309 |         } else {
310 |           double dist_sum2 = 0;
311 |           #pragma omp simd reduction(+:dist_sum2)
312 |           for (uint32_t t = 0; t < choice_approx; t++) {
313 |             dist_sum2 += host_dists[t];
314 |           }
315 |           if (dist_sum2 < choice_sum) {
316 |             for (j = choice_approx; j < samples_size && dist_sum2 < choice_sum; j++) {
317 |               dist_sum2 += host_dists[j];
318 |             }
319 |           } else {
320 |             for (j = choice_approx; j > 1 && dist_sum2 >= choice_sum; j--) {
321 |               dist_sum2 -= host_dists[j];
322 |             }
323 |             j++;
324 |           }
325 |         }
326 |         if (j == 0 || j > samples_size) {
327 |           assert(j > 0 && j <= samples_size);
328 |           INFO("\ninternal bug in kmeans_init_centroids: j = %" PRIu32 "\n", j);
329 |         }
330 |         RETERR(cuda_copy_sample_t(
331 |             j - 1, i * features_size, samples_size, features_size, devs,
332 |             verbosity, samples, centroids));
333 |       }
334 |       SYNC_ALL_DEVS;
335 |       break;
336 |     }
337 |     case kmcudaInitMethodAFKMC2: {
338 |       uint32_t m = *reinterpret_cast<const uint32_t*>(init_params);
339 |       if (m == 0) {
340 |         m = 200;
341 |       } else if (m > samples_size / 2) {
342 |         INFO("afkmc2: m > %" PRIu32 " is not supported (got %" PRIu32 ")\n",
343 |              samples_size / 2, m);
344 |         return kmcudaInvalidArguments;
345 |       }
346 |       float smoke = NAN;
347 |       uint32_t first_index;
348 |       while (smoke != smoke) {
349 |         first_index = rand() % samples_size;
350 |         cudaSetDevice(devs[0]);
351 |         CUCH(cudaMemcpy(&smoke, samples[0].get() + first_index, sizeof(float),
352 |                         cudaMemcpyDeviceToHost), kmcudaMemoryCopyError);
353 |       }
354 |       INFO("afkmc2: calculating q (c0 = %" PRIu32 ")... ",
355 |            first_index / features_size);
356 |       RETERR(cuda_copy_sample_t(
357 |             first_index, 0, samples_size, features_size, devs, verbosity,
358 |             samples, centroids));
359 |       auto q = std::unique_ptr<float[]>(new float[samples_size]);
360 |       kmeans_cuda_afkmc2_calc_q(
361 |           samples_size, features_size, first_index / features_size, metric,
362 |           devs, fp16x2, verbosity, samples, dists, q.get());
363 |       INFO("done\n");
364 |       auto cand_ind = std::unique_ptr<uint32_t[]>(new uint32_t[m]);
365 |       auto rand_a = std::unique_ptr<float[]>(new float[m]);
366 |       auto p_cand = std::unique_ptr<float[]>(new float[m]);
367 |       for (uint32_t k = 1; k < clusters_size; k++) {
368 |         if (verbosity > 1 || (verbosity > 0 && (
369 |               clusters_size < 100 || k % (clusters_size / 100) == 0))) {
370 |           printf("\rstep %d", k);
371 |           fflush(stdout);
372 |         }
373 |         RETERR(kmeans_cuda_afkmc2_random_step(
374 |             k, m, seed, verbosity, dists->back().get(),
375 |             reinterpret_cast<uint32_t*>(aux->back().get()),
376 |             cand_ind.get(), aux->back().get() + m, rand_a.get()));
377 |         RETERR(kmeans_cuda_afkmc2_min_dist(
378 |             k, m, metric, fp16x2, verbosity, samples.back().get(),
379 |             reinterpret_cast<uint32_t*>(aux->back().get()),
380 |             centroids->back().get(), aux->back().get() + m, p_cand.get()));
381 |         float curr_prob = 0;
382 |         uint32_t curr_ind = 0;
383 |         for (uint32_t j = 0; j < m; j++) {
384 |           auto cand_prob = p_cand[j] / q[cand_ind[j]];
385 |           if (curr_prob == 0 || cand_prob / curr_prob > rand_a[j]) {
386 |             curr_ind = j;
387 |             curr_prob = cand_prob;
388 |           }
389 |         }
390 |         RETERR(cuda_copy_sample_t(
391 |             cand_ind[curr_ind], k * features_size, samples_size, features_size, devs,
392 |             verbosity, samples, centroids));
393 |       }
394 |       SYNC_ALL_DEVS;
395 |       break;
396 |     }
397 |   }
398 |   INFO("\rdone            \n");
399 |   return kmcudaSuccess;
400 | }
401 | 
402 | KMCUDAResult kmeans_cuda(
403 |     KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
404 |     KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
405 |     uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
406 |     int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
407 |     uint32_t *assignments, float *average_distance) {
408 |   DEBUG("arguments: %d %p %.3f %.2f %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %"
409 |         PRIu32 " %" PRIu32 " %d %" PRIi32 " %p %p %p %p\n", init, init_params,
410 |         tolerance, yinyang_t, metric, samples_size, features_size, clusters_size,
411 |         seed, device, fp16x2, verbosity, samples, centroids, assignments,
412 |         average_distance);
413 |   RETERR(check_kmeans_args(
414 |       tolerance, yinyang_t, samples_size, features_size, clusters_size,
415 |       device, fp16x2, verbosity, samples, centroids, assignments));
416 |   INFO("reassignments threshold: %" PRIu32 "\n", uint32_t(tolerance * samples_size));
417 |   uint32_t yy_groups_size = yinyang_t * clusters_size;
418 |   DEBUG("yinyang groups: %" PRIu32 "\n", yy_groups_size);
419 |   auto devs = setup_devices(device, device_ptrs, verbosity);
420 |   if (devs.empty()) {
421 |     return kmcudaNoSuchDevice;
422 |   }
423 |   udevptrs<float> device_samples;
424 |   int32_t origin_devi;
425 |   RETERR(init_udevptrs(samples_size, features_size, device_ptrs, devs,
426 |                        verbosity, samples, &device_samples, &origin_devi));
427 |   udevptrs<float> device_centroids;
428 |   size_t centroids_size = static_cast<size_t>(clusters_size) * features_size;
429 |   FOR_EACH_DEV(
430 |     if (dev == device_ptrs) {
431 |       device_centroids.emplace_back(centroids, true);
432 |     } else {
433 |       CUMALLOC_ONE(device_centroids, centroids_size, dev);
434 |     }
435 |   );
436 |   udevptrs<uint32_t> device_assignments;
437 |   FOR_EACH_DEV(
438 |     if (dev == device_ptrs) {
439 |       device_assignments.emplace_back(assignments, true);
440 |     } else {
441 |       CUMALLOC_ONE(device_assignments, samples_size, dev);
442 |     }
443 |   );
444 |   udevptrs<uint32_t> device_assignments_prev;
445 |   CUMALLOC(device_assignments_prev, samples_size);
446 |   udevptrs<uint32_t> device_ccounts;
447 |   CUMALLOC(device_ccounts, clusters_size);
448 | 
449 |   udevptrs<uint32_t> device_assignments_yy, device_passed_yy;
450 |   udevptrs<float> device_bounds_yy, device_drifts_yy, device_centroids_yy;
451 |   if (yy_groups_size >= 1) {
452 |     CUMALLOC(device_assignments_yy, clusters_size);
453 |     uint32_t max_length = max_distribute_length(
454 |         samples_size, features_size * sizeof(float), devs);
455 |     size_t yyb_size = static_cast<size_t>(max_length) * (yy_groups_size + 1);
456 |     CUMALLOC(device_bounds_yy, yyb_size);
457 |     CUMALLOC(device_drifts_yy, centroids_size + clusters_size);
458 |     max_length = std::max(max_length, clusters_size + yy_groups_size);
459 |     CUMALLOC(device_passed_yy, max_length);
460 |     size_t yyc_size = yy_groups_size * features_size;
461 |     if (yyc_size <= max_length) {
462 |       DEBUG("reusing passed_yy for centroids_yy\n");
463 |       for (auto &p : device_passed_yy) {
464 |         device_centroids_yy.emplace_back(
465 |             reinterpret_cast<float*>(p.get()), true);
466 |       }
467 |     } else {
468 |       CUMALLOC(device_centroids_yy, yyc_size);
469 |     }
470 |   }
471 | 
472 |   if (verbosity > 1) {
473 |     RETERR(print_memory_stats(devs));
474 |   }
475 |   RETERR(kmeans_cuda_setup(samples_size, features_size, clusters_size,
476 |                            yy_groups_size, devs, verbosity),
477 |          DEBUG("kmeans_cuda_setup failed: %s\n", CUERRSTR()));
478 |   #ifdef PROFILE
479 |   FOR_EACH_DEV(cudaProfilerStart());
480 |   #endif
481 |   RETERR(cuda_transpose(
482 |       samples_size, features_size, true, devs, verbosity, &device_samples));
483 |   RETERR(kmeans_init_centroids(
484 |       init, init_params, samples_size, features_size, clusters_size, metric,
485 |       seed, devs, device_ptrs, fp16x2, verbosity, centroids, device_samples,
486 |       reinterpret_cast<udevptrs<float>*>(&device_assignments),
487 |       reinterpret_cast<udevptrs<float>*>(&device_assignments_prev),
488 |       &device_centroids),
489 |          DEBUG("kmeans_init_centroids failed: %s\n", CUERRSTR()));
490 |   RETERR(kmeans_cuda_yy(
491 |       tolerance, yy_groups_size, samples_size, clusters_size, features_size,
492 |       metric, devs, fp16x2, verbosity, device_samples, &device_centroids, &device_ccounts,
493 |       &device_assignments_prev, &device_assignments, &device_assignments_yy,
494 |       &device_centroids_yy, &device_bounds_yy, &device_drifts_yy, &device_passed_yy),
495 |          DEBUG("kmeans_cuda_yy failed: %s\n", CUERRSTR()));
496 |   if (average_distance) {
497 |     RETERR(kmeans_cuda_calc_average_distance(
498 |         samples_size, features_size, metric, devs, fp16x2, verbosity,
499 |         device_samples, device_centroids, device_assignments, average_distance),
500 |            DEBUG("kmeans_cuda_calc_average_distance failed: %s\n", CUERRSTR()));
501 |   }
502 |   #ifdef PROFILE
503 |   FOR_EACH_DEV(cudaProfilerStop());
504 |   #endif
505 |   if (origin_devi >= 0 || device_ptrs >= 0) {
506 |     RETERR(cuda_transpose(
507 |         samples_size, features_size, false, devs, verbosity, &device_samples));
508 |   }
509 |   if (origin_devi < 0) {
510 |     if (device_ptrs < 0) {
511 |       CUCH(cudaMemcpy(centroids, device_centroids[devs.size() - 1].get(),
512 |                       centroids_size * sizeof(float), cudaMemcpyDeviceToHost),
513 |            kmcudaMemoryCopyError);
514 |       CUCH(cudaMemcpy(assignments, device_assignments[devs.size() - 1].get(),
515 |                       samples_size * sizeof(uint32_t), cudaMemcpyDeviceToHost),
516 |            kmcudaMemoryCopyError);
517 |     } else {
518 |       CUCH(cudaMemcpyPeerAsync(centroids, device_ptrs,
519 |                                device_centroids[devs.size() - 1].get(),
520 |                                devs.back(), centroids_size * sizeof(float)),
521 |            kmcudaMemoryCopyError);
522 |       CUCH(cudaMemcpyPeerAsync(assignments, device_ptrs,
523 |                                device_assignments[devs.size() - 1].get(),
524 |                                devs.back(), samples_size * sizeof(uint32_t)),
525 |            kmcudaMemoryCopyError);
526 |       SYNC_ALL_DEVS;
527 |     }
528 |   }
529 |   DEBUG("return kmcudaSuccess\n");
530 |   return kmcudaSuccess;
531 | }
532 | 
533 | ////////////--------------------------------------------------------------------
534 | /// K-nn ///--------------------------------------------------------------------
535 | ////////////--------------------------------------------------------------------
536 | 
537 | static KMCUDAResult check_knn_args(
538 |     uint16_t k, uint32_t samples_size, uint16_t features_size,
539 |     uint32_t clusters_size, uint32_t device, int32_t fp16x2, int32_t verbosity,
540 |     const float *samples, const float *centroids, const uint32_t *assignments,
541 |     uint32_t *neighbors) {
542 |   if (k == 0) {
543 |     return kmcudaInvalidArguments;
544 |   }
545 |   if (clusters_size < 2 || clusters_size == UINT32_MAX) {
546 |     return kmcudaInvalidArguments;
547 |   }
548 |   if (features_size == 0) {
549 |     return kmcudaInvalidArguments;
550 |   }
551 |   if (samples_size < clusters_size) {
552 |     return kmcudaInvalidArguments;
553 |   }
554 |   int devices = 0;
555 |   cudaGetDeviceCount(&devices);
556 |   if (device > (1u << devices)) {
557 |     return kmcudaNoSuchDevice;
558 |   }
559 |   if (samples == nullptr || centroids == nullptr || assignments == nullptr ||
560 |       neighbors == nullptr) {
561 |     return kmcudaInvalidArguments;
562 |   }
563 | #if CUDA_ARCH < 60
564 |   if (fp16x2) {
565 |     INFO("CUDA device arch %d does not support fp16\n", CUDA_ARCH);
566 |     return kmcudaInvalidArguments;
567 |   }
568 | #endif
569 |   return kmcudaSuccess;
570 | }
571 | 
572 | KMCUDAResult knn_cuda(
573 |     uint16_t k, KMCUDADistanceMetric metric, uint32_t samples_size,
574 |     uint16_t features_size, uint32_t clusters_size, uint32_t device,
575 |     int32_t device_ptrs, int32_t fp16x2, int32_t verbosity,
576 |     const float *samples, const float *centroids, const uint32_t *assignments,
577 |     uint32_t *neighbors) {
578 |   DEBUG("arguments: %" PRIu16 " %d %" PRIu32 " %" PRIu16 " %" PRIu32 " %" PRIu32
579 |         " %" PRIi32 " %" PRIi32 " %" PRIi32 " %p %p %p %p\n",
580 |         k, metric, samples_size, features_size, clusters_size, device,
581 |         device_ptrs, fp16x2, verbosity, samples, centroids, assignments,
582 |         neighbors);
583 |   check_knn_args(k, samples_size, features_size, clusters_size, device, fp16x2,
584 |                  verbosity, samples, centroids, assignments, neighbors);
585 |   auto devs = setup_devices(device, device_ptrs, verbosity);
586 |   if (devs.empty()) {
587 |     return kmcudaNoSuchDevice;
588 |   }
589 |   udevptrs<float> device_samples;
590 |   udevptrs<float> device_centroids;
591 |   udevptrs<uint32_t> device_assignments;
592 |   int32_t origin_devi;
593 |   RETERR(init_udevptrs(samples_size, features_size, device_ptrs, devs,
594 |                        verbosity, samples, &device_samples, &origin_devi));
595 |   RETERR(init_udevptrs(clusters_size, features_size, device_ptrs, devs,
596 |                        verbosity, centroids, &device_centroids));
597 |   RETERR(init_udevptrs(samples_size, 1, device_ptrs, devs,
598 |                        verbosity, assignments, &device_assignments));
599 |   udevptrs<uint32_t> device_inv_asses, device_inv_asses_offsets;
600 |   CUMALLOC(device_inv_asses, samples_size);
601 |   CUMALLOC(device_inv_asses_offsets, clusters_size + 1);
602 |   udevptrs<uint32_t> device_neighbors;
603 |   auto nplan = distribute(samples_size, features_size * sizeof(float), devs);
604 |   size_t neighbors_size = 0;
605 |   for (auto &p : nplan) {
606 |     auto length = std::get<1>(p);
607 |     if (length > neighbors_size) {
608 |       neighbors_size = length;
609 |     }
610 |   }
611 |   neighbors_size *= k;
612 |   FOR_EACH_DEVI(
613 |     if (devs[devi] == device_ptrs) {
614 |       if (knn_cuda_neighbors_mem_multiplier(k, devs[devi], 0) == 2) {
615 |         INFO("warning: x2 memory is required for neighbors, using the "
616 |              "external pointer and not able to check the size\n");
617 |       }
618 |       device_neighbors.emplace_back(
619 |           neighbors + std::get<0>(nplan[devi]) * k, true);
620 |     } else {
621 |       CUMALLOC_ONE(
622 |           device_neighbors,
623 |           neighbors_size * knn_cuda_neighbors_mem_multiplier(k, devs[devi], 0),
624 |           devs[devi]);
625 |     }
626 |   );
627 |   udevptrs<float> device_cluster_distances;
628 |   CUMALLOC(device_cluster_distances, clusters_size * clusters_size);
629 |   udevptrs<float> device_sample_dists;
630 |   if (clusters_size * clusters_size < samples_size) {
631 |     CUMALLOC(device_sample_dists, samples_size);
632 |   } else {
633 |     DEBUG("using the centroid distances matrix as the sample distances temporary\n");
634 |   }
635 |   udevptrs<float> device_cluster_radiuses;
636 |   CUMALLOC(device_cluster_radiuses, clusters_size);
637 |   if (verbosity > 1) {
638 |     RETERR(print_memory_stats(devs));
639 |   }
640 |   RETERR(knn_cuda_setup(samples_size, features_size, clusters_size,
641 |                         devs, verbosity),
642 |          DEBUG("knn_cuda_setup failed: %s\n", CUERRSTR()));
643 |   #ifdef PROFILE
644 |   FOR_EACH_DEV(cudaProfilerStart());
645 |   #endif
646 |   RETERR(cuda_transpose(
647 |       samples_size, features_size, true, devs, verbosity, &device_samples));
648 |   {
649 |     INFO("initializing the inverse assignments...\n");
650 |     auto asses_with_idxs = std::unique_ptr<std::tuple<uint32_t, uint32_t>[]>(
651 |         new std::tuple<uint32_t, uint32_t>[samples_size]);
652 |     if (device_ptrs < 0) {
653 |       #pragma omp parallel for
654 |       for (uint32_t s = 0; s < samples_size; s++) {
655 |         asses_with_idxs[s] = std::make_tuple(assignments[s], s);
656 |       }
657 |     } else {
658 |       auto asses_on_host =
659 |           std::unique_ptr<uint32_t[]>(new uint32_t[samples_size]);
660 |       cudaSetDevice(device_ptrs);
661 |       CUCH(cudaMemcpy(
662 |           asses_on_host.get(), assignments, samples_size * sizeof(uint32_t),
663 |           cudaMemcpyDeviceToHost), kmcudaMemoryCopyError);
664 |       #pragma omp parallel for
665 |       for (uint32_t s = 0; s < samples_size; s++) {
666 |         asses_with_idxs[s] = std::make_tuple(asses_on_host[s], s);
667 |       }
668 |     }
669 |     std::sort(asses_with_idxs.get(), asses_with_idxs.get() + samples_size);
670 |     auto asses_sorted =
671 |         std::unique_ptr<uint32_t[]>(new uint32_t[samples_size]);
672 |     auto asses_offsets =
673 |         std::unique_ptr<uint32_t[]>(new uint32_t[clusters_size + 1]);
674 |     uint32_t cls = 0;
675 |     asses_offsets[0] = 0;
676 |     for (uint32_t s = 0; s < samples_size; s++) {
677 |       uint32_t newcls;
678 |       std::tie(newcls, asses_sorted[s]) = asses_with_idxs[s];
679 |       if (newcls != cls) {
680 |         for (auto icls = newcls; icls > cls; icls--) {
681 |           asses_offsets[icls] = s;
682 |         }
683 |         cls = newcls;
684 |       }
685 |     }
686 |     for (auto icls = clusters_size; icls > cls; icls--) {
687 |       asses_offsets[icls] = samples_size;
688 |     }
689 |     CUMEMCPY_H2D_ASYNC(device_inv_asses, 0, asses_sorted.get(), samples_size);
690 |     CUMEMCPY_H2D_ASYNC(device_inv_asses_offsets, 0, asses_offsets.get(), clusters_size + 1);
691 |   }
692 |   CUMEMSET_ASYNC(device_cluster_distances, 0, clusters_size * clusters_size);
693 |   if (clusters_size * clusters_size < samples_size) {
694 |     CUMEMSET_ASYNC(device_sample_dists, 0, samples_size);
695 |   }
696 |   RETERR(knn_cuda_calc(
697 |       k, samples_size, clusters_size, features_size, metric, devs, fp16x2,
698 |       verbosity, device_samples, device_centroids, device_assignments,
699 |       device_inv_asses, device_inv_asses_offsets, &device_cluster_distances,
700 |       &device_sample_dists, &device_cluster_radiuses, &device_neighbors));
701 |   #ifdef PROFILE
702 |   FOR_EACH_DEV(cudaProfilerStop());
703 |   #endif
704 | 
705 |   if (device_ptrs < 0) {
706 |     FOR_EACH_DEVI(
707 |       CUCH(cudaMemcpyAsync(neighbors + std::get<0>(nplan[devi]) * k,
708 |                            device_neighbors[devi].get(),
709 |                            std::get<1>(nplan[devi]) * k * sizeof(float),
710 |                            cudaMemcpyDeviceToHost),
711 |            kmcudaMemoryCopyError);
712 |     );
713 |   } else {
714 |     RETERR(cuda_transpose(
715 |         samples_size, features_size, false, devs, verbosity, &device_samples));
716 |     FOR_EACH_DEVI(
717 |       if (static_cast<int32_t>(devi) == origin_devi) {
718 |         continue;
719 |       }
720 |       CUCH(cudaMemcpyPeerAsync(
721 |           neighbors + std::get<0>(nplan[devi]) * k, device_ptrs,
722 |           device_neighbors[devi].get(), devs[devi],
723 |           std::get<1>(nplan[devi]) * k * sizeof(float)),
724 |            kmcudaMemoryCopyError);
725 |     );
726 |   }
727 |   SYNC_ALL_DEVS;
728 |   DEBUG("return kmcudaSuccess\n");
729 |   return kmcudaSuccess;
730 | }
731 | 
732 | }  // extern "C"
733 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Build Status](https://travis-ci.org/src-d/kmcuda.svg?branch=master)](https://travis-ci.org/src-d/kmcuda) [![PyPI](https://img.shields.io/pypi/v/libKMCUDA.svg)](https://pypi.python.org/pypi/libKMCUDA) [![10.5281/zenodo.286944](https://zenodo.org/badge/DOI/10.5281/zenodo.286944.svg)](https://doi.org/10.5281/zenodo.286944)
  2 | 
  3 | "Yinyang" K-means and K-nn using NVIDIA CUDA
  4 | ============================================
  5 | 
  6 | K-means implementation is based on ["Yinyang K-Means: A Drop-In Replacement
  7 | of the Classic K-Means with Consistent Speedup"](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/ding15.pdf).
  8 | While it introduces some overhead and many conditional clauses
  9 | which are bad for CUDA, it still shows 1.6-2x speedup against the Lloyd
 10 | algorithm. K-nearest neighbors employ the same triangle inequality idea and
 11 | require precalculated centroids and cluster assignments, similar to the flattened
 12 | ball tree.
 13 | 
 14 | | [Benchmarks](#benchmarks) | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPUs |
 15 | |---------------------------|----------------|-----------|------------------|--------|--------|---------------|
 16 | | speed                     | 1x             | 4.5x      | 8.2x             | 15.5x  | 17.8x  | 29.8x         |
 17 | | memory                    | 1x             | 2x        | 2x               | 0.6x   | 0.6x   | 0.6x          |
 18 | 
 19 | Technically, this project is a shared library which exports two functions
 20 | defined in `kmcuda.h`: `kmeans_cuda` and `knn_cuda`.
 21 | It has built-in Python3 and R native extension support, so you can
 22 | `from libKMCUDA import kmeans_cuda` or `dyn.load("libKMCUDA.so")`.
 23 | 
 24 | [![source{d}](img/sourced.png)](http://sourced.tech)
 25 | <p align="right"><a href="img/kmeans_image.ipynb">How was this created?</a></p>
 26 | 
 27 | Table of contents
 28 | -----------------
 29 | * [K-means](#k-means)
 30 | * [K-nn](#k-nn)
 31 | * [Notes](#notes)
 32 | * [Building](#building)
 33 |    * [macOS](#macos)
 34 | * [Testing](#testing)
 35 | * [Benchmarks](#benchmarks)
 36 |    * [100,000x256@1024](#100000x2561024)
 37 |       * [Configuration](#configuration)
 38 |       * [Contestants](#contestants)
 39 |       * [Data](#data)
 40 |       * [Notes](#notes-1)
 41 |    * [8,000,000x256@1024](#8000000x2561024)
 42 |       * [Data](#data-1)
 43 |       * [Notes](#notes-2)
 44 | * [Python examples](#python-examples)
 45 |    * [K-means, L2 (Euclidean) distance](#k-means-l2-euclidean-distance)
 46 |    * [K-means, angular (cosine) distance + average](#k-means-angular-cosine-distance--average)
 47 |    * [K-nn](#k-nn-1)
 48 | * [Python API](#python-api)
 49 | * [R examples](#r-examples)
 50 |    * [K-means](#k-means-1)
 51 |    * [K-nn](#k-nn-2)
 52 | * [R API](#r-api)
 53 | * [C examples](#c-examples)
 54 | * [C API](#c-api)
 55 | * [License](#license)
 56 | 
 57 | K-means
 58 | -------
 59 | The major difference between this project and others is that kmcuda is
 60 | optimized for low memory consumption and a large number of clusters. E.g.,
 61 | kmcuda can sort 4M samples in 480 dimensions into 40000 clusters (if you
 62 | have several days and 12 GB of GPU memory); 300K samples are grouped
 63 | into 5000 clusters in 4½ minutes on NVIDIA Titan X (15 iterations); 3M samples
 64 | and 1000 clusters take 20 minutes (33 iterations). Yinyang can be
 65 | turned off to save GPU memory but the slower Lloyd will be used then.
 66 | Four centroid initialization schemes are supported: random, k-means++,
 67 | [AFKMC2](http://olivierbachem.ch/files/afkmcmc-oral-pdf.pdf) and import.
 68 | Two distance metrics are supported: L2 (the usual one) and angular
 69 | (arccos of the scalar product). L1 is in development.
 70 | 16-bit float support delivers 2x memory compression. If you've got several GPUs,
 71 | they can be utilized together and it gives the corresponding linear speedup
 72 | either for Lloyd or Yinyang.
 73 | 
 74 | The code has been thoroughly tested to yield bit-to-bit identical
 75 | results from Yinyang and Lloyd. "Fast and Provably Good Seedings for k-Means" was adapted from
 76 | [the reference code](https://github.com/obachem/kmc2).
 77 | 
 78 | Read the articles: [1](http://blog.sourced.tech/post/towards_kmeans_on_gpu/),
 79 | [2](https://blog.sourced.tech/post/kmcuda4/).
 80 | 
 81 | K-nn
 82 | ----
 83 | Centroid distance matrix C<sub>ij</sub> is calculated together with clusters'
 84 | radiuses R<sub>i</sub> (the maximum distance from the centroid to the corresponding
 85 | cluster's members). Given sample S in cluster A, we avoid calculating the distances from S
 86 | to another cluster B's members if C<sub>AB</sub> - SA - R<sub>B</sub> is greater
 87 | than the current maximum K-nn distance. This resembles the [ball tree
 88 | algorithm](http://scikit-learn.org/stable/modules/neighbors.html#ball-tree).
 89 | 
 90 | The implemented algorithm is tolerant to NANs. There are two variants depending
 91 | on whether k is small enough to fit the sample's neighbors into CUDA shared memory.
 92 | Internally, the neighbors list is a [binary heap](https://en.wikipedia.org/wiki/Binary_heap) -
 93 | that reduces the complexity multiplier from O(k) to O(log k).
 94 | 
 95 | The implementation yields identical results to `sklearn.neighbors.NearestNeighbors`
 96 | except for cases in which adjacent distances are equal and the order is undefined.
 97 | That is, the returned indices are sorted in the increasing order of the
 98 | corresponding distances.
 99 | 
100 | Notes
101 | -----
102 | Lloyd is tolerant to samples with NaN features while Yinyang is not.
103 | It may happen that some of the resulting clusters contain zero elements.
104 | In such cases, their features are set to NaN.
105 | 
106 | Angular (cosine) distance metric effectively results in Spherical K-Means behavior.
107 | The samples **must** be normalized to L2 norm equal to 1 before clustering,
108 | it is not done automatically. The actual formula is:
109 | 
110 | ![D(A, B)=\arccos\left(\frac{A\cdot B}{|A||B|}\right)](img/latex_angular.png)
111 | 
112 | If you get OOM with the default parameters, set `yinyang_t` to 0 which
113 | forces Lloyd. `verbosity` 2 will print the memory allocation statistics
114 | (all GPU allocation happens at startup).
115 | 
116 | Data type is either 32- or 16-bit float. Number of samples is limited by 2^32,
117 | clusters by 2^32 and features by 2^16 (2^17 for fp16). Besides, the product of
118 | clusters number and features number may not exceed 2^32.
119 | 
120 | In the case of 16-bit floats, the reduced precision often leads to a slightly
121 | increased number of iterations, Yinyang is especially sensitive to that.
122 | In some cases, there may be overflows and the clustering may fail completely.
123 | 
124 | Building
125 | --------
126 | ```
127 | git clone https://github.com/src-d/kmcuda
128 | cd kmcuda/src
129 | cmake -DCMAKE_BUILD_TYPE=Release . && make
130 | ```
131 | It requires cudart 8.0 / Pascal and OpenMP 4.0 capable compiler. The build has
132 | been tested primarily on Linux but it works on macOS too with some bells and whistles
133 | (see "macOS" subsection).
134 | If you do not want to build the Python native module, add `-D DISABLE_PYTHON=y`.
135 | If you do not want to build the R native module, add `-D DISABLE_R=y`.
136 | If CUDA is not automatically found, add `-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0`
137 | (change the path to the actual one). By default, CUDA kernels are compiled for
138 | the architecture 60 (Pascal). It is possible to override it via `-D CUDA_ARCH=52`,
139 | but fp16 support will be disabled then.
140 | 
141 | Python users:
142 | ```
143 | CUDA_ARCH=61 pip install libKMCUDA
144 | # replace 61 with your device version
145 | ```
146 | 
147 | Or install it from source:
148 | ```
149 | CUDA_ARCH=61 pip install git+https://github.com/src-d/kmcuda.git#subdirectory=src
150 | # replace 61 with your device version
151 | ```
152 | 
153 | Binary Python packages are quite hard to provide because they depend on CUDA and device architecture versions. PRs welcome!
154 | 
155 | #### macOS
156 | macOS build is tricky, but possible. The instructions below correspond to the state from 1 year ago and may be different now.
157 | Please help with updates!
158 | 
159 | Install [Homebrew](http://brew.sh/) and the [Command Line Developer Tools](https://developer.apple.com/download/more/)
160 | which are compatible with your CUDA installation. E.g., CUDA 8.0 does not support
161 | the latest 8.x and works with 7.3.1 and below. Install `clang` with OpenMP support
162 | and Python with numpy:
163 | ```
164 | brew install llvm --with-clang
165 | brew install python3
166 | pip3 install numpy
167 | ```
168 | Execute this magic command which builds kmcuda afterwards:
169 | ```
170 | CC=/usr/local/opt/llvm/bin/clang CXX=/usr/local/opt/llvm/bin/clang++ LDFLAGS=-L/usr/local/opt/llvm/lib/ cmake -DCMAKE_BUILD_TYPE=Release .
171 | ```
172 | And make the last important step - rename \*.dylib to \*.so so that Python is able to import the native extension:
173 | ```
174 | mv libKMCUDA.{dylib,so}
175 | ```
176 | 
177 | Testing
178 | -------
179 | `test.py` contains the unit tests based on [unittest](https://docs.python.org/3/library/unittest.html).
180 | They require either [cuda4py](https://github.com/ajkxyz/cuda4py) or [pycuda](https://github.com/inducer/pycuda) and
181 | [scikit-learn](http://scikit-learn.org/stable/).
182 | `test.R` contains R integration tests and shall be run with `Rscript`.
183 | 
184 | Benchmarks
185 | ----------
186 | 
187 | ### 100000x256@1024
188 | |            | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPUs |
189 | |------------|----------------|-----------|------------------|--------|--------|---------------|
190 | | time, s    | 164            | 36        | 20               | 10.6   | 9.2    | 5.5           |
191 | | memory, GB | 1              | 2         | 2                | 0.6    | 0.6    | 0.6           |
192 | 
193 | #### Configuration
194 | * 16-core (32 threads) Intel Xeon E5-2620 v4 @ 2.10GHz
195 | * 256 GB RAM Samsung M393A2K40BB1
196 | * Nvidia Titan X 2016
197 | 
198 | #### Contestants
199 | * [sklearn.cluster.KMeans](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)@0.18.1; `KMeans(n_clusters=1024, init="random", max_iter=15, random_state=0, n_jobs=1, n_init=1)`.
200 | * [KMeansRex](https://github.com/michaelchughes/KMeansRex)@288c40a with `-march=native` and Eigen 3.3; `KMeansRex.RunKMeans(data, 1024, Niter=15, initname=b"random")`.
201 | * KMeansRex with additional `-fopenmp`.
202 | * [Serban KMeans](https://github.com/serban/kmeans)@83e76bf built for arch 6.1; `./cuda_main  -b -i serban.bin -n 1024 -t 0.0028 -o`
203 | * kmcuda v6.1 built for arch 6.1; `libKMCUDA.kmeans_cuda(dataset, 1024, tolerance=0.002, seed=777, init="random", verbosity=2, yinyang_t=0, device=0)`
204 | * kmcuda running on 2 GPUs.
205 | 
206 | #### Data
207 | 100000 random samples uniformly distributed between 0 and 1 in 256 dimensions.
208 | 
209 | #### Notes
210 | 100000 is the maximum size Serban KMeans can handle.
211 | 
212 | ### 8000000x256@1024
213 | |            | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda 2 GPU | kmcuda Yinyang 2 GPUs |
214 | |------------|----------------|-----------|------------------|--------|--------------|-----------------------|
215 | | time       | please no      | -         | 6h 34m           | fail   | 44m          | 36m                   |
216 | | memory, GB | -              | -         | 205              | fail   | 8.7          | 10.4                  |
217 | 
218 | kmeans++ initialization, 93 iterations (1% reassignments equivalent).
219 | 
220 | #### Data
221 | 8,000,000 secret production samples.
222 | 
223 | #### Notes
224 | KMeansRex consumed 205 GB of RAM at peak; it uses dynamic memory so its usage
225 | constantly bounced between 100 GB and 200 GB.
226 | 
227 | Contributions
228 | -------------
229 | 
230 | ...are welcome! See [CONTRIBUTING](CONTRIBUTING.md) and [code of conduct](CODE_OF_CONDUCT.md).
231 | 
232 | License
233 | -------
234 | 
235 | [Apache 2.0](LICENSE.md)
236 | 
237 | Python examples
238 | ---------------
239 | 
240 | #### K-means, L2 (Euclidean) distance
241 | 
242 | ```python
243 | import numpy
244 | from matplotlib import pyplot
245 | from libKMCUDA import kmeans_cuda
246 | 
247 | numpy.random.seed(0)
248 | arr = numpy.empty((10000, 2), dtype=numpy.float32)
249 | arr[:2500] = numpy.random.rand(2500, 2) + [0, 2]
250 | arr[2500:5000] = numpy.random.rand(2500, 2) - [0, 2]
251 | arr[5000:7500] = numpy.random.rand(2500, 2) + [2, 0]
252 | arr[7500:] = numpy.random.rand(2500, 2) - [2, 0]
253 | centroids, assignments = kmeans_cuda(arr, 4, verbosity=1, seed=3)
254 | print(centroids)
255 | pyplot.scatter(arr[:, 0], arr[:, 1], c=assignments)
256 | pyplot.scatter(centroids[:, 0], centroids[:, 1], c="white", s=150)
257 | ```
258 | You should see something like this:
259 | ![Clustered dots](img/cls_euclidean.png)
260 | 
261 | #### K-means, angular (cosine) distance + average
262 | 
263 | ```python
264 | import numpy
265 | from matplotlib import pyplot
266 | from libKMCUDA import kmeans_cuda
267 | 
268 | numpy.random.seed(0)
269 | arr = numpy.empty((10000, 2), dtype=numpy.float32)
270 | angs = numpy.random.rand(10000) * 2 * numpy.pi
271 | for i in range(10000):
272 |     arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i])
273 | centroids, assignments, avg_distance = kmeans_cuda(
274 |     arr, 4, metric="cos", verbosity=1, seed=3, average_distance=True)
275 | print("Average distance between centroids and members:", avg_distance)
276 | print(centroids)
277 | pyplot.scatter(arr[:, 0], arr[:, 1], c=assignments)
278 | pyplot.scatter(centroids[:, 0], centroids[:, 1], c="white", s=150)
279 | ```
280 | You should see something like this:
281 | ![Clustered dots](img/cls_angular.png)
282 | 
283 | #### K-nn
284 | 
285 | ```python
286 | import numpy
287 | from libKMCUDA import kmeans_cuda, knn_cuda
288 | 
289 | numpy.random.seed(0)
290 | arr = numpy.empty((10000, 2), dtype=numpy.float32)
291 | angs = numpy.random.rand(10000) * 2 * numpy.pi
292 | for i in range(10000):
293 |     arr[i] = numpy.sin(angs[i]), numpy.cos(angs[i])
294 | ca = kmeans_cuda(arr, 4, metric="cos", verbosity=1, seed=3)
295 | neighbors = knn_cuda(10, arr, *ca, metric="cos", verbosity=1, device=1)
296 | print(neighbors[0])
297 | ```
298 | You should see
299 | ```
300 | reassignments threshold: 100
301 | performing kmeans++...
302 | done
303 | too few clusters for this yinyang_t => Lloyd
304 | iteration 1: 10000 reassignments
305 | iteration 2: 926 reassignments
306 | iteration 3: 416 reassignments
307 | iteration 4: 187 reassignments
308 | iteration 5: 87 reassignments
309 | initializing the inverse assignments...
310 | calculating the cluster radiuses...
311 | calculating the centroid distance matrix...
312 | searching for the nearest neighbors...
313 | calculated 0.276552 of all the distances
314 | [1279 1206 9846 9886 9412 9823 7019 7075 6453 8933]
315 | ```
316 | 
317 | Python API
318 | ----------
319 | ```python
320 | def kmeans_cuda(samples, clusters, tolerance=0.01, init="k-means++",
321 |                 yinyang_t=0.1, metric="L2", average_distance=False,
322 |                 seed=time(), device=0, verbosity=0)
323 | ```
324 | **samples** numpy array of shape \[number of samples, number of features\]
325 |             or tuple(raw device pointer (int), device index (int), shape (tuple(number of samples, number of features\[, fp16x2 marker\]))).
326 |             In the latter case, negative device index means host pointer. Optionally,
327 |             the tuple can be 2 items longer with preallocated device pointers for
328 |             centroids and assignments. dtype must be either float16 or
329 |             convertible to float32.
330 | 
331 | **clusters** integer, the number of clusters.
332 | 
333 | **tolerance** float, if the relative number of reassignments drops below this value,
334 |               algorithm stops.
335 | 
336 | **init** string or numpy array, sets the method for centroids initialization,
337 |          may be "k-means++", "afk-mc2", "random" or numpy array of shape
338 |          \[**clusters**, number of features\]. dtype must be float32.
339 | 
340 | **yinyang_t** float, the relative number of cluster groups, usually 0.1.
341 |               0 disables Yinyang refinement.
342 | 
343 | **metric** str, the name of the distance metric to use. The default is Euclidean (L2),
344 |            it can be changed to "cos" to change the algorithm to Spherical K-means
345 |            with the angular distance. Please note that samples *must* be normalized
346 |            in the latter case.
347 | 
348 | **average_distance** boolean, the value indicating whether to calculate
349 |                      the average distance between cluster elements and
350 |                      the corresponding centroids. Useful for finding
351 |                      the best K. Returned as the third tuple element.
352 | 
353 | **seed** integer, random generator seed for reproducible results.
354 | 
355 | **device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
356 |            2 means second device, 3 means using first and second device. Special
357 |            value 0 enables all available devices. The default is 0.
358 | 
359 | **verbosity** integer, 0 means complete silence, 1 means mere progress logging,
360 |               2 means lots of output.
361 | 
362 | **return** tuple(centroids, assignments, \[average_distance\]).
363 |            If **samples** was a numpy array or a host pointer tuple, the types
364 |            are numpy arrays, otherwise, raw pointers (integers) allocated on the
365 |            same device. If **samples** are float16, the returned centroids are
366 |            float16 too.
367 | 
368 | ```python
369 | def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
370 | ```
371 | **k** integer, the number of neighbors to search for each sample. Must be < 2<sup>16</sup>.
372 | 
373 | **samples** numpy array of shape \[number of samples, number of features\]
374 |             or tuple(raw device pointer (int), device index (int), shape (tuple(number of samples, number of features\[, fp16x2 marker\]))).
375 |             In the latter case, negative device index means host pointer. Optionally,
376 |             the tuple can be 1 item longer with the preallocated device pointer for
377 |             neighbors. dtype must be either float16 or convertible to float32.
378 | 
379 | **centroids** numpy array with precalculated clusters' centroids (e.g., using
380 |               K-means/kmcuda/kmeans_cuda()). dtype must match **samples**.
381 |               If **samples** is a tuple then **centroids** must be a length-2
382 |               tuple, the first element is the pointer and the second is the
383 |               number of clusters. The shape is (number of clusters, number of features).
384 | 
385 | **assignments** numpy array with sample-cluster associations. dtype is expected
386 |                 to be compatible with uint32. If **samples** is a tuple then
387 |                 **assignments** is a pointer. The shape is (number of samples,).
388 | 
389 | **metric** str, the name of the distance metric to use. The default is Euclidean (L2),
390 |            it can be changed to "cos" to change the algorithm to Spherical K-means
391 |            with the angular distance. Please note that samples *must* be normalized
392 |            in the latter case.
393 | 
394 | **device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
395 |            2 means second device, 3 means using first and second device. Special
396 |            value 0 enables all available devices. The default is 0.
397 | 
398 | **verbosity** integer, 0 means complete silence, 1 means mere progress logging,
399 |               2 means lots of output.
400 | 
401 | **return** neighbor indices. If **samples** was a numpy array or
402 |             a host pointer tuple, the return type is numpy array, otherwise, a
403 |             raw pointer (integer) allocated on the same device. The shape is
404 |             (number of samples, k).
405 | 
406 | R examples
407 | ----------
408 | #### K-means
409 | ```R
410 | dyn.load("libKMCUDA.so")
411 | samples = replicate(4, runif(16000))
412 | result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
413 |                    seed=777, verbosity=1, average_distance=TRUE)
414 | print(result$average_distance)
415 | print(result$centroids[1:10,])
416 | print(result$assignments[1:10])
417 | ```
418 | 
419 | #### K-nn
420 | ```R
421 | dyn.load("libKMCUDA.so")
422 | samples = replicate(4, runif(16000))
423 | cls = .External("kmeans_cuda", samples, 50, tolerance=0.01,
424 |                 seed=777, verbosity=1)
425 | result = .External("knn_cuda", 20, samples, cls$centroids, cls$assignments,
426 |                    verbosity=1)
427 | print(result[1:10,])
428 | ```
429 | 
430 | R API
431 | -----
432 | ```R
433 | function kmeans_cuda(
434 |     samples, clusters, tolerance=0.01, init="k-means++", yinyang_t=0.1,
435 |     metric="L2", average_distance=FALSE, seed=Sys.time(), device=0, verbosity=0)
436 | ```
437 | **samples** real matrix of shape \[number of samples, number of features\]
438 |             or list of real matrices which are rbind()-ed internally. No more
439 |             than INT32_MAX samples and UINT16_MAX features are supported.
440 | 
441 | **clusters** integer, the number of clusters.
442 | 
443 | **tolerance** real, if the relative number of reassignments drops below this value,
444 |               algorithm stops.
445 | 
446 | **init** character vector or real matrix, sets the method for centroids initialization,
447 |          may be "k-means++", "afk-mc2", "random" or real matrix, of shape
448 |          \[**clusters**, number of features\].
449 | 
450 | **yinyang_t** real, the relative number of cluster groups, usually 0.1.
451 |               0 disables Yinyang refinement.
452 | 
453 | **metric** character vector, the name of the distance metric to use. The default
454 |            is Euclidean (L2), it can be changed to "cos" to change the algorithm
455 |            to Spherical K-means with the angular distance. Please note that
456 |            samples *must* be normalized in the latter case.
457 | 
458 | **average_distance** logical, the value indicating whether to calculate
459 |                      the average distance between cluster elements and
460 |                      the corresponding centroids. Useful for finding
461 |                      the best K. Returned as the third list element.
462 | 
463 | **seed** integer, random generator seed for reproducible results.
464 | 
465 | **device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
466 |            2 means second device, 3 means using first and second device. Special
467 |            value 0 enables all available devices. The default is 0.
468 | 
469 | **verbosity** integer, 0 means complete silence, 1 means mere progress logging,
470 |               2 means lots of output.
471 | 
472 | **return** list(centroids, assignments\[, average_distance\]). Indices in
473 |            assignments start from 1.
474 | 
475 | ```R
476 | function knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
477 | ```
478 | **k** integer, the number of neighbors to search for each sample. Must be < 2<sup>16</sup>.
479 | 
480 | **samples** real matrix of shape \[number of samples, number of features\]
481 |             or list of real matrices which are rbind()-ed internally.
482 |             In the latter case, it is possible to pass in more than INT32_MAX
483 |             samples.
484 | 
485 | **centroids** real matrix with precalculated clusters' centroids (e.g., using
486 |               kmeans() or kmeans_cuda()).
487 | 
488 | **assignments** integer vector with sample-cluster associations. Indices start
489 |                 from 1.
490 | 
491 | **metric** character vector, the name of the distance metric to use. The default is Euclidean (L2),
492 |                 can be changed to "cos" to behave as Spherical K-means with the
493 |                 angular distance. Please note that samples *must* be normalized in that
494 |                 case.
495 | 
496 | **device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, 2 means second device,
497 |            3 means using first and second device. Special value 0 enables all available devices.
498 |            The default is 0.
499 | 
500 | **verbosity** integer, 0 means complete silence, 1 means mere progress logging,
501 |               2 means lots of output.
502 | 
503 | **return** integer matrix with neighbor indices. The shape is (number of samples, k).
504 |            Indices start from 1.
505 | 
506 | C examples
507 | ----------
508 | `example.c`:
509 | ```C
510 | #include <assert.h>
511 | #include <stdint.h>
512 | #include <stdio.h>
513 | #include <stdlib.h>
514 | #include <kmcuda.h>
515 | 
516 | // ./example /path/to/data <number of clusters>
517 | int main(int argc, const char **argv) {
518 |   assert(argc == 3);
519 |   // we open the binary file with the data
520 |   // [samples_size][features_size][samples_size x features_size]
521 |   FILE *fin = fopen(argv[1], "rb");
522 |   assert(fin);
523 |   uint32_t samples_size, features_size;
524 |   assert(fread(&samples_size, sizeof(samples_size), 1, fin) == 1);
525 |   assert(fread(&features_size, sizeof(features_size), 1, fin) == 1);
526 |   uint64_t total_size = ((uint64_t)samples_size) * features_size;
527 |   float *samples = malloc(total_size * sizeof(float));
528 |   assert(samples);
529 |   assert(fread(samples, sizeof(float), total_size, fin) == total_size);
530 |   fclose(fin);
531 |   int clusters_size = atoi(argv[2]);
532 |   // we will store cluster centers here
533 |   float *centroids = malloc(clusters_size * features_size * sizeof(float));
534 |   assert(centroids);
535 |   // we will store assignments of every sample here
536 |   uint32_t *assignments = malloc(((uint64_t)samples_size) * sizeof(uint32_t));
537 |   assert(assignments);
538 |   float average_distance;
539 |   KMCUDAResult result = kmeans_cuda(
540 |       kmcudaInitMethodPlusPlus, NULL,  // kmeans++ centroids initialization
541 |       0.01,                            // less than 1% of the samples are reassigned in the end
542 |       0.1,                             // activate Yinyang refinement with 0.1 threshold
543 |       kmcudaDistanceMetricL2,          // Euclidean distance
544 |       samples_size, features_size, clusters_size,
545 |       0xDEADBEEF,                      // random generator seed
546 |       0,                               // use all available CUDA devices
547 |       -1,                              // samples are supplied from host
548 |       0,                               // not in float16x2 mode
549 |       1,                               // moderate verbosity
550 |       samples, centroids, assignments, &average_distance);
551 |   free(samples);
552 |   free(centroids);
553 |   free(assignments);
554 |   assert(result == kmcudaSuccess);
555 |   printf("Average distance between a centroid and the corresponding "
556 |          "cluster members: %f\n", average_distance);
557 |   return 0;
558 | }
559 | ```
560 | Build:
561 | ```
562 | gcc -std=c99 -O2 example.c -I/path/to/kmcuda.h/dir -L/path/to/libKMCUDA.so/dir -l KMCUDA -Wl,-rpath,. -o example
563 | ```
564 | Run:
565 | ```
566 | ./example serban.bin 1024
567 | ```
568 | The file format is the same as in [serban/kmeans](https://github.com/serban/kmeans/blob/master/README#L113).
569 | 
570 | C API
571 | -----
572 | ```C
573 | KMCUDAResult kmeans_cuda(
574 |     KMCUDAInitMethod init, const void *init_params, float tolerance, float yinyang_t,
575 |     KMCUDADistanceMetric metric, uint32_t samples_size, uint16_t features_size,
576 |     uint32_t clusters_size, uint32_t seed, uint32_t device, int32_t device_ptrs,
577 |     int32_t fp16x2, int32_t verbosity, const float *samples, float *centroids,
578 |     uint32_t *assignments, float *average_distance)
579 | ```
580 | **init** specifies the centroids initialization method: k-means++, random or import
581 |          (in the latter case, **centroids** is read).
582 | 
583 | **tolerance** if the relative number of reassignments drops below this ratio, stop.
584 | 
585 | **yinyang_t** the relative number of cluster groups, usually 0.1.
586 | 
587 | **metric** The distance metric to use. The default is Euclidean (L2), can be
588 |            changed to cosine to behave as Spherical K-means with the angular
589 |            distance. Please note that samples *must* be normalized in that case.
590 | 
591 | **samples_size** number of samples.
592 | 
593 | **features_size** number of features. If fp16x2 is set, this is one half of the number of features.
594 | 
595 | **clusters_size** number of clusters.
596 | 
597 | **seed** random generator seed passed to srand().
598 | 
599 | **device** CUDA device OR-ed indices - usually 1. For example, 1 means using first device,
600 |            2 means second device, 3 means first and second device (2x speedup). Special
601 |            value 0 enables all available devices.
602 | 
603 | **device_ptrs** configures the location of input and output. If it is negative,
604 |                 samples and returned arrays are on host, otherwise, they belong to the
605 |                 corresponding device. E.g., if device_ptrs is 0, **samples** is expected
606 |                 to be a pointer to device #0's memory and the resulting **centroids** and
607 |                 **assignments** are expected to be preallocated on device #0 as well.
608 |                 Usually this value is -1.
609 | 
610 | **fp16x2** activates fp16 mode: two half-floats are packed into a single 32-bit float,
611 |            features_size becomes effectively 2 times bigger, and the returned
612 |            centroids are fp16x2 too.
613 | 
614 | **verbosity** 0 - no output; 1 - progress output; >=2 - debug output.
615 | 
616 | **samples** input array of size samples_size x features_size in row major format.
617 | 
618 | **centroids** output array of centroids of size clusters_size x features_size
619 |               in row major format.
620 | 
621 | **assignments** output array of cluster indices for each sample of size
622 |                 samples_size x 1.
623 | 
624 | **average_distance** output mean distance between cluster elements and
625 |                      the corresponding centroids. If nullptr, not calculated.
626 | 
627 | Returns KMCUDAResult (see `kmcuda.h`).
628 | 
629 | ```C
630 | KMCUDAResult knn_cuda(
631 |     uint16_t k, KMCUDADistanceMetric metric, uint32_t samples_size,
632 |     uint16_t features_size, uint32_t clusters_size, uint32_t device,
633 |     int32_t device_ptrs, int32_t fp16x2, int32_t verbosity,
634 |     const float *samples, const float *centroids, const uint32_t *assignments,
635 |     uint32_t *neighbors);
636 | ```
637 | **k** integer, the number of neighbors to search for each sample.
638 | 
639 | **metric** The distance metric to use. The default is Euclidean (L2); it can be
640 |            changed to cosine to use the angular distance (as in Spherical K-means).
641 |            Please note that samples *must* be normalized in that case.
642 | 
643 | **samples_size** number of samples.
644 | 
645 | **features_size** number of features. if fp16x2 is set, one half of the number of features.
646 | 
647 | **clusters_size** number of clusters.
648 | 
649 | **device** CUDA device bitmask (OR-ed per-device bits) - usually 1. For example, 1 means
650 |            using the first device, 2 means the second device, 3 means the first and second
651 |            devices (2x speedup). Special value 0 enables all available devices.
652 | 
653 | **device_ptrs** configures the location of input and output. If it is negative,
654 |                 samples, centroids, assignments and the returned array are on host,
655 |                 otherwise, they belong to the corresponding device.
656 |                 E.g., if device_ptrs is 0, **samples**, **centroids** and
657 |                 **assignments** are expected to be pointers to device #0's memory
658 |                 and the resulting **neighbors** is expected to be preallocated on
659 |                 device #0 as well. Usually this value is -1.
660 | 
661 | **fp16x2** activates fp16 mode: two half-floats are packed into a single 32-bit float and
662 |            features_size becomes effectively 2 times bigger; affects **samples**
663 |            and **centroids**.
664 | 
665 | **verbosity** 0 - no output; 1 - progress output; >=2 - debug output.
666 | 
667 | **samples** input array of size samples_size x features_size in row major format.
668 | 
669 | **centroids** input array of centroids of size clusters_size x features_size
670 |               in row major format.
671 | 
672 | **assignments** input array of cluster indices for each sample of size
673 |                 samples_size x 1.
674 | 
675 | **neighbors** output array with the nearest neighbors of size
676 |               samples_size x k in row major format.
677 | 
678 | Returns KMCUDAResult (see `kmcuda.h`).
679 | 
680 | #### README {#ignore_this_doxygen_anchor}
681 | 


--------------------------------------------------------------------------------