├── .gitignore ├── amazon.json ├── wikiLSHTC.json ├── CMakeLists.txt ├── include ├── ReluLayer.h ├── GPUTimer.h ├── Layer.h ├── CscActNodes.h ├── SoftmaxLayer.h ├── LSH.h ├── utils.h ├── Network.h ├── lshKnl.h ├── kernel.h └── GPUMultiLinkedHashTable.h ├── LICENSE ├── readme.md └── src ├── ReluLayer.cu ├── Layer.cu ├── CscActNodes.cu ├── SoftmaxLayer.cu ├── Network.cu ├── main.cu ├── LSH.cu ├── GPUMultiLinkedHashTable.cu ├── lshKnl.cu └── kernel.cu /.gitignore: -------------------------------------------------------------------------------- 1 | bin 2 | build 3 | dataset 4 | logs 5 | runme 6 | -------------------------------------------------------------------------------- /amazon.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_size": 135909, 3 | "node_num_per_layer": [128, 670091], 4 | 5 | "max_batch_size": 256, 6 | "max_input_num": 256, 7 | "max_act_nums": [128, 5480], 8 | "max_label_num": 32, 9 | 10 | "K": 6, "L": 50, 11 | "bin_size": 8, 12 | "bucket_capacity": 128, 13 | "threshold": 1, 14 | "min_softmax_act_num": 1500, 15 | "tbl_num_per_tile": 4, 16 | "tbl_num_per_thread": 4, 17 | 18 | "lr": 0.0001, 19 | "BETA1": 0.9, "BETA2": 0.999, 20 | "rebuild_period": 6400, 21 | "reshuffle_period": 128000, 22 | "thread_num": 128, 23 | "epoch_num": 20, 24 | 25 | "train_fname": "dataset/Amazon/amazon_train.txt", 26 | "test_fname": "dataset/Amazon/amazon_test.txt" 27 | } 28 | -------------------------------------------------------------------------------- /wikiLSHTC.json: -------------------------------------------------------------------------------- 1 | { 2 | "input_size": 1617899, 3 | "node_num_per_layer": [128, 325056], 4 | 5 | "max_batch_size": 128, 6 | "max_input_num": 5400, 7 | "max_act_nums": [128, 45056], 8 | "max_label_num": 256, 9 | 10 | "K": 5, "L": 350, 11 | "bin_size": 8, 12 | "bucket_capacity": 128, 13 | "threshold": 1, 14 | "min_softmax_act_num": 1500, 15 | "tbl_num_per_tile": 4, 16 | "tbl_num_per_thread": 4, 17 | 18 | "lr": 0.0001, 19 | "BETA1": 0.9, "BETA2": 0.999, 20 | "rebuild_period": 6400, 21 | "reshuffle_period": 128000, 22 | "thread_num": 128, 23 | "epoch_num": 20, 24 | 25 | "train_fname": "dataset/WikiLSHTC/wikiLSHTC_train.txt", 26 | "test_fname": "dataset/WikiLSHTC/wikiLSHTC_test.txt" 27 | } 28 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.14 FATAL_ERROR) 2 | project(G_SLIDE LANGUAGES CXX CUDA) 3 | 4 | include(FetchContent) 5 | 6 | FetchContent_Declare( 7 | jsoncpp_static 8 | GIT_REPOSITORY https://github.com/open-source-parsers/jsoncpp.git 9 | GIT_TAG 1.9.5) 10 | 11 | FetchContent_MakeAvailable(jsoncpp_static) 12 | 13 | aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/src SRCS) 14 | 15 | add_executable(runme ${SRCS}) 16 | 17 | target_include_directories(runme PRIVATE include) 18 | 19 | set_target_properties(runme PROPERTIES CUDA_SEPARABLE_COMPILATION ON) 20 | set_target_properties(runme PROPERTIES RUNTIME_OUTPUT_DIRECTORY 21 | ${CMAKE_CURRENT_SOURCE_DIR}) 22 | 23 | target_compile_features(runme PUBLIC cxx_std_14) 24 | 25 | target_link_libraries(runme PRIVATE jsoncpp_static) 26 | target_link_libraries(runme PRIVATE cublas) 27 | -------------------------------------------------------------------------------- /include/ReluLayer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 
"CscActNodes.h" 4 | #include "Layer.h" 5 | 6 | class ReluLayer : public Layer { // weight: col major 7 | public: 8 | ReluLayer(const int prev_node_num, const int node_num, 9 | const int max_batch_size, const int node_capacity) 10 | : Layer(prev_node_num, node_num, max_batch_size, node_capacity) {} 11 | 12 | ReluLayer(const Layer &) = delete; 13 | ReluLayer(Layer &&) = delete; 14 | ReluLayer &operator=(const ReluLayer &) = delete; 15 | 16 | void forward(const Layer &prev_layer, const int batch_size, 17 | const int thread_num, const int max_out_num); 18 | 19 | void forward(const CscActNodes &csc_inputs, const int batch_size, 20 | const int thread_num, const int max_out_num); 21 | 22 | void bp(Layer &prev_layer, const int batch_size, const int thread_num, 23 | const int max_act_num); 24 | 25 | void bp_first_layer(const CscActNodes &csc_inputs, const int batch_size, 26 | const int thread_num, const int max_act_num); 27 | }; 28 | -------------------------------------------------------------------------------- /include/GPUTimer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "utils.h" 8 | 9 | class GPUTimer { 10 | cudaEvent_t start_time; 11 | cudaEvent_t end_time; 12 | float elapsed_time; 13 | 14 | public: 15 | GPUTimer() { 16 | CUDA_CHECK(cudaEventCreate(&start_time)); 17 | CUDA_CHECK(cudaEventCreate(&end_time)); 18 | } 19 | 20 | ~GPUTimer() { 21 | CUDA_CHECK(cudaEventDestroy(start_time)); 22 | CUDA_CHECK(cudaEventDestroy(end_time)); 23 | } 24 | 25 | void start() { CUDA_CHECK(cudaEventRecord(start_time)); } 26 | 27 | // return the elapsed time from start_time to end_time 28 | float record(std::string msg = "") { 29 | CUDA_CHECK(cudaEventRecord(end_time)); 30 | CUDA_CHECK(cudaEventSynchronize(end_time)); 31 | CUDA_CHECK(cudaEventElapsedTime(&elapsed_time, start_time, end_time)); 32 | 33 | if (msg != "") { 34 | printf("%s%f ms\n", msg.c_str(), elapsed_time); 35 | } 36 | 37 | // restart timer 38 | CUDA_CHECK(cudaEventRecord(start_time)); 39 | 40 | return elapsed_time; 41 | } 42 | }; 43 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Zaifeng Pan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /include/Layer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "CscActNodes.h" 3 | 4 | class Layer { 5 | public: 6 | const int prev_node_num; 7 | const int node_num; 8 | 9 | float *d_weights; 10 | float *d_biases; 11 | 12 | CscActNodes csc_acts; 13 | float *d_cmprs_bp_deltas; 14 | 15 | struct Adam { 16 | float *d_ts; 17 | float *d_moms; 18 | float *d_vels; 19 | 20 | Adam(int size); 21 | ~Adam(); 22 | }; 23 | 24 | Adam weight_adam; 25 | Adam bias_adam; 26 | 27 | public: 28 | Layer(const int prev_node_num, const int node_num, const int max_batch_size, 29 | const int node_capacity); 30 | 31 | Layer(const Layer &) = delete; 32 | Layer(Layer &&) = delete; 33 | Layer &operator=(const Layer &) = delete; 34 | 35 | virtual ~Layer(); 36 | 37 | void test_get_acts(const std::vector &h_cmprs_nodes, 38 | const std::vector &h_cmprs_offsets) { 39 | csc_acts.extract_from(h_cmprs_nodes, h_cmprs_offsets); 40 | } 41 | 42 | virtual void update_weights(const int thread_num, const float lr); 43 | virtual void update_biases(const int thread_num, const float lr); 44 | }; 45 | -------------------------------------------------------------------------------- /include/CscActNodes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | struct CscActNodes { 5 | int max_batch_size; 6 | int node_capacity; 7 | int *d_nodes; 8 | float *d_vals; 9 | int *d_offsets; 10 | 11 | const bool val_enabled; 12 | 13 | CscActNodes(int max_batch_size, int node_capacity, bool val_enabled = true, 14 | bool is_managed = false); 15 | 16 | // virtual ~CscActNodes(); 17 | 18 | void free(); 19 | 20 | void extract_from(const std::vector &h_cmprs_nodes, 21 | const std::vector &h_cmprs_vals, 22 | const std::vector &h_cmprs_offsets); 23 | 24 | void extract_from(const std::vector &h_cmprs_nodes, 25 | const std::vector &h_cmprs_offsets); 26 | 27 | void extract_to(std::vector &h_cmprs_nodes, 28 | std::vector &h_cmprs_vals, 29 | std::vector &h_cmprs_offsets, 30 | const int batch_size) const; 31 | 32 | void extract_to(std::vector &h_cmprs_nodes, 33 | std::vector &h_cmprs_offsets, 34 | const int batch_size) const; 35 | }; 36 | -------------------------------------------------------------------------------- /include/SoftmaxLayer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | 4 | #include 5 | 6 | #include "CscActNodes.h" 7 | #include "LSH.h" 8 | #include "Layer.h" 9 | 10 | class SoftmaxLayer : public Layer { 11 | public: 12 | std::shared_ptr lsh_tbls_ptr; 13 | 14 | float *d_dense_activations; 15 | cublasHandle_t handle; 16 | 17 | public: 18 | SoftmaxLayer(const int prev_node_num, const int node_num, 19 | const int max_batch_size, const int node_capacity, const int K, 20 | const int L, const int bin_size, const int bucket_num_per_tbl, 21 | const int bucket_capacity, const int threshold, 22 | const int min_act_num, const int tbl_num_per_tile, 23 | const int tbl_num_per_thread, 24 | const int linked_bucket_num_per_tbl, const int linked_pool_size); 25 | 26 | SoftmaxLayer(const Layer &) = delete; 27 | SoftmaxLayer(Layer &&) = delete; 28 | SoftmaxLayer &operator=(const SoftmaxLayer &) = delete; 29 | 30 | ~SoftmaxLayer(); 31 | 32 | void forward(const Layer &prev_layer, const CscActNodes &cmprs_labels, 33 | const int batch_size, const int thread_num, const int max_in_num, 34 | 
const int max_out_num, const int max_label_num); 35 | 36 | void forward_dense(const Layer &prev_layer, const int batch_size); 37 | 38 | void bp(Layer &prev_layer, const int batch_size, const int thread_num, 39 | const int max_prev_num, const int max_act_num); 40 | 41 | void rebuild(const bool reshuffle) { 42 | lsh_tbls_ptr->build(d_weights, reshuffle); 43 | } 44 | }; 45 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # G-SLIDE 2 | 3 | G-SLIDE is a GPU-based sub-linear deep learning engine that accelerates fully-connected neural networks through LSH sparsification. The details can be found in this [paper](https://ieeexplore.ieee.org/document/9635657). 4 | 5 | ## Dataset 6 | 7 | The datasets can be downloaded from [Amazon-670K](https://drive.google.com/open?id=0B3lPMIHmG6vGdUJwRzltS1dvUVk) and [WikiLSHTC-325K](https://drive.google.com/file/d/0B3lPMIHmG6vGSHE1SWx4TVRva3c/view?resourcekey=0-ZGNqdLuqttRdnAj-U0bktA). 8 | 9 | ## Baselines 10 | 11 | The baseline is [SLIDE](https://github.com/keroro824/HashingDeepLearning). The source code of the TensorFlow-CPU and TensorFlow-GPU baselines can also be found at the same link. 12 | 13 | ## Running G-SLIDE 14 | 15 | ### Tested Environments 16 | 17 | The experiments in the paper were run in the following environment: 18 | 19 | * OS: Ubuntu 20.04 20 | * Compiler: nvcc 11.1 21 | * GPU: NVIDIA GeForce RTX 2080 Ti 22 | * CPU: Intel(R) Core(TM) i9-9900K CPU @ 3.60GHz 23 | * CMake: 3.14 or above 24 | 25 | ### Dependencies 26 | 27 | * cuBLAS 28 | * Thrust 29 | * [JsonCpp](https://github.com/open-source-parsers/jsoncpp): used to parse the JSON configuration file. 30 | 31 | ### Compile and Run 32 | 33 | Run the following commands to compile the project: 34 | 35 | ```bash 36 | git clone https://github.com/PanZaifeng/G-SLIDE.git 37 | cd G-SLIDE 38 | cmake -B build 39 | cmake --build build 40 | ``` 41 | 42 | Before running G-SLIDE, download the Amazon-670K **dataset** and configure `amazon.json` accordingly. Note that a lot of information is printed during training, so we recommend **redirecting stdout** when running.
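In most cases only the dataset paths need to point at the downloaded files; the defaults in this repository's `amazon.json` are:

```json
"train_fname": "dataset/Amazon/amazon_train.txt",
"test_fname": "dataset/Amazon/amazon_test.txt"
```

The training can then be launched with: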
43 | 44 | ```bash 45 | ./runme ./amazon.json > amazon.log 46 | ``` 47 | -------------------------------------------------------------------------------- /include/LSH.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "CscActNodes.h" 3 | #include "GPUMultiLinkedHashTable.h" 4 | 5 | class LSH { 6 | unsigned int *d_rand_keys; 7 | int *d_bins; 8 | 9 | int *d_rand_node_keys; 10 | int *d_rand_nodes; 11 | 12 | const int node_num; 13 | const int prev_node_num; 14 | 15 | const int K, L; 16 | const int bin_size; 17 | const int tot_elem_num; 18 | const int ceil_elem_num; 19 | const int tbl_num_per_tile; 20 | const int tbl_num_per_thread; 21 | 22 | const int bucket_num_per_tbl; 23 | const int bucket_capacity; 24 | int *d_buckets; 25 | int *d_bucket_sizes; 26 | 27 | const int min_act_num; 28 | 29 | int *d_hashed_bucket_ids_colmajor; 30 | // int *d_hashed_bucket_sizes; 31 | 32 | CscActNodes cmprs_gathered; 33 | 34 | GPUMultiLinkedHashTable multi_linked_htables; 35 | 36 | public: 37 | LSH(const int node_num, const int prev_node_num, const int max_batch_size, 38 | const int K, const int L, const int bin_size, 39 | const int bucket_num_per_tbl, const int bucket_capacity, 40 | const int threshold, const int min_act_num, const int tbl_num_per_tile, 41 | const int tbl_num_per_thread, const int linked_bucket_num_per_tbl, 42 | const int linked_pool_size); 43 | 44 | ~LSH(); 45 | 46 | void shuffle_bins(); 47 | void shuffle_rand(); 48 | 49 | void build(const float *d_weights_rowmajor, const bool reshuffle); 50 | 51 | void query_act_nodes(const CscActNodes &csc_inputs, 52 | const CscActNodes &cmprs_labels, const int batch_size, 53 | CscActNodes &csc_acts); 54 | 55 | void query_act_nodes(const CscActNodes &csc_inputs, const int batch_size, 56 | CscActNodes &csc_acts); 57 | }; -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | // need to check idx 10 | #define FOR_IDX_SYNC(idx, begin, end) \ 11 | for (int idx = (begin) + threadIdx.x; (idx - threadIdx.x) < (end); \ 12 | idx += blockDim.x) 13 | 14 | #define FOR_IDX_ASYNC(idx, begin, end) \ 15 | for (int idx = (begin) + threadIdx.x; idx < (end); idx += blockDim.x) 16 | 17 | #define FOR_OFFSET(offset, begin, end) \ 18 | for (int offset = (begin); offset < (end); offset += blockDim.x) 19 | 20 | #define CUDA_CHECK(ans) cuda_assert((ans), __FILE__, __LINE__) 21 | static inline void cuda_assert(cudaError_t code, const char *file, int line, 22 | bool abort_flag = true) { 23 | if (code != cudaSuccess) { 24 | std::cerr << "CUDA assert: " << cudaGetErrorString(code) << " at file " 25 | << file << " line " << line << std::endl; 26 | 27 | if (abort_flag) exit(code); 28 | } 29 | } 30 | 31 | #define CUBLAS_CHECK(ans) cublas_assert((ans), __FILE__, __LINE__) 32 | static inline void cublas_assert(cublasStatus_t code, const char *file, 33 | int line, bool abort_flag = true) { 34 | if (code != CUBLAS_STATUS_SUCCESS) { 35 | std::cerr << "CUBLAS assert: " << code << " at file " << file << " line " 36 | << line << std::endl; 37 | 38 | if (abort_flag) exit(code); 39 | } 40 | } 41 | 42 | static inline bool is_smem_enough(const void *knl, int thread_num, 43 | size_t smem_size) { 44 | int block_num; 45 | CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor( 46 | &block_num, knl, thread_num, smem_size)); 47 | 
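// The launch configuration fits only if at least one block can be resident per SM with this thread count and dynamic shared-memory request.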
return block_num > 0; 48 | } 49 | -------------------------------------------------------------------------------- /src/ReluLayer.cu: -------------------------------------------------------------------------------- 1 | #include "ReluLayer.h" 2 | #include "kernel.h" 3 | #include "utils.h" 4 | 5 | void ReluLayer::forward(const Layer &prev_layer, const int batch_size, 6 | const int thread_num, const int max_out_num) { 7 | assert(prev_layer.node_num == prev_node_num); 8 | 9 | const int block_num = (node_num * batch_size + thread_num - 1) / thread_num; 10 | get_dense_acts_knl<<>>(csc_acts, node_num, batch_size); 11 | 12 | const int smem_size = 13 | (sizeof(int) + sizeof(float)) * (thread_num + max_out_num); 14 | relu_fwd_slide_in_knl<<>>( 15 | prev_layer.csc_acts, d_weights, d_biases, node_num, max_out_num, 16 | csc_acts); 17 | } 18 | 19 | void ReluLayer::forward(const CscActNodes &csc_inputs, const int batch_size, 20 | const int thread_num, const int max_out_num) { 21 | const int block_num = (node_num * batch_size + thread_num - 1) / thread_num; 22 | get_dense_acts_knl<<>>(csc_acts, node_num, batch_size); 23 | 24 | const int smem_size = 25 | (sizeof(int) + sizeof(float)) * (thread_num + max_out_num); 26 | relu_fwd_slide_in_knl<<>>( 27 | csc_inputs, d_weights, d_biases, node_num, max_out_num, csc_acts); 28 | } 29 | 30 | void ReluLayer::bp(Layer &prev_layer, const int batch_size, 31 | const int thread_num, const int max_act_num) { 32 | assert(prev_layer.node_num == prev_node_num); 33 | 34 | const int smem_size = (sizeof(int) + sizeof(float)) * max_act_num; 35 | bp_knl<<>>( 36 | csc_acts, prev_layer.csc_acts, d_weights, d_cmprs_bp_deltas, node_num, 37 | max_act_num, prev_layer.d_cmprs_bp_deltas, weight_adam.d_ts, 38 | bias_adam.d_ts); 39 | } 40 | 41 | void ReluLayer::bp_first_layer(const CscActNodes &csc_inputs, 42 | const int batch_size, const int thread_num, 43 | const int max_act_num) { 44 | const int smem_size = (sizeof(int) + sizeof(float)) * max_act_num; 45 | bp_first_layer_knl<<>>( 46 | csc_acts, csc_inputs, d_cmprs_bp_deltas, node_num, max_act_num, 47 | weight_adam.d_ts, bias_adam.d_ts); 48 | } 49 | -------------------------------------------------------------------------------- /include/Network.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "CscActNodes.h" 11 | #include "Layer.h" 12 | #include "ReluLayer.h" 13 | #include "SoftmaxLayer.h" 14 | #include "utils.h" 15 | 16 | class Network { 17 | std::vector> relu_layers; 18 | std::shared_ptr softmax_layer; 19 | 20 | CscActNodes csc_inputs; 21 | CscActNodes cmprs_labels; 22 | 23 | const int layer_num; 24 | 25 | public: 26 | Network(const std::vector &node_num_per_layer, 27 | const std::vector &node_capacity_per_layer, const int input_size, 28 | const int max_batch_size, const int input_capacity, 29 | const int label_capacity, const int K, const int L, 30 | const int bin_size, const int bucket_num_per_tbl, 31 | const int bucket_capacity, const int threshold, 32 | const int min_softmax_act_num, const int tbl_num_per_tile, 33 | const int tbl_num_per_thread, const int linked_bucket_num_per_tbl, 34 | const int linked_pool_size); 35 | 36 | Network(const Network &) = delete; 37 | Network(Network &&) = delete; 38 | Network &operator=(const Network &) = delete; 39 | 40 | virtual ~Network(); 41 | 42 | int eval(const std::vector &h_cmprs_input_nodes, 43 | const std::vector &h_cmprs_input_vals, 44 | 
const std::vector &h_cmprs_input_offsets, 45 | const std::vector &h_cmprs_label_nodes, 46 | const std::vector &h_cmprs_label_offsets, const int batch_size, 47 | const int thread_num); 48 | 49 | void test_get_act_nodes(const int layer_idx, 50 | const std::vector &h_cmprs_act_nodes, 51 | const std::vector &h_cmprs_act_offsets) { 52 | if (layer_idx < relu_layers.size()) { 53 | relu_layers[layer_idx]->test_get_acts(h_cmprs_act_nodes, 54 | h_cmprs_act_offsets); 55 | } else { 56 | softmax_layer->test_get_acts(h_cmprs_act_nodes, h_cmprs_act_offsets); 57 | } 58 | } 59 | 60 | void train(const std::vector &h_cmprs_input_nodes, 61 | const std::vector &h_cmprs_input_vals, 62 | const std::vector &h_cmprs_input_offsets, 63 | const std::vector &h_cmprs_label_nodes, 64 | const std::vector &h_cmprs_label_offsets, 65 | const std::vector &max_act_nums, const int batch_size, 66 | const float lr, const int max_label_num, const int thread_num, 67 | const bool rebuild, const bool reshuffle); 68 | 69 | void rebuild(const bool reshuffle) { softmax_layer->rebuild(reshuffle); } 70 | }; -------------------------------------------------------------------------------- /include/lshKnl.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "CscActNodes.h" 3 | 4 | __global__ void init_bins_knl(int *d_bins, const int prev_node_num, 5 | const int tot_elem_num); 6 | 7 | __global__ void gen_rand_keys_knl(unsigned int *d_rand_keys, const int seed, 8 | const int prev_node_num, 9 | const int tot_elem_num); 10 | 11 | __global__ void gen_rand_keys_knl(int *d_rand_keys, const int seed, 12 | const int node_num); 13 | 14 | // Assumption: prev_node_num is small while node_num is large 15 | // tt: one thread for each tile 16 | __global__ void init_hash_tt_knl( 17 | const int *d_bins, const float *d_weights_rowmajor, const int prev_node_num, 18 | const int node_num, const int tot_elem_num, const int L, const int K, 19 | const int bin_size, const int tbl_num_per_tile, 20 | const int bucket_num_per_tbl, const int bucket_capacity, int *d_buckets, 21 | int *d_bucket_sizes); 22 | 23 | // Assumption: prev_node_num is small while node_num is large 24 | __global__ void init_hash_knl( 25 | const int *d_bins, const float *d_weights_rowmajor, const int prev_node_num, 26 | const int node_num, const int tot_elem_num, const int L, const int K, 27 | const int bin_size, const int tbl_num_per_tile, 28 | const int tbl_num_per_thread, const int bucket_num_per_tbl, 29 | const int bucket_capacity, int *d_buckets, int *d_bucket_sizes); 30 | 31 | // No shared memory for weights 32 | __global__ void init_hash_no_sw_knl( 33 | const int *d_bins, const float *d_weights_rowmajor, const int prev_node_num, 34 | const int node_num, const int tot_elem_num, const int L, const int K, 35 | const int bin_size, const int tbl_num_per_tile, 36 | const int bucket_num_per_tbl, const int bucket_capacity, int *d_buckets, 37 | int *d_bucket_sizes); 38 | 39 | // Assumption: previous layer is dense 40 | __global__ void get_hash_knl(const int *d_bins, 41 | const float *d_dense_inputs_colmajor, 42 | const int *d_bucket_sizes, const int in_node_num, 43 | const int tot_elem_num, const int L, const int K, 44 | const int bin_size, const int tbl_num_per_tile, 45 | const int batch_size, const int bucket_num_per_tbl, 46 | const int bucket_capacity, 47 | int *d_hashed_bucket_ids_colmajor, 48 | int *d_hashed_bucket_sizes_colmajor); 49 | 50 | __global__ void gather_buckets_knl(const int *d_hashed_bucket_ids_colmajor, 51 | const int 
*d_buckets, const int L, 52 | const int batch_size, 53 | const int bucket_num_per_tbl, 54 | const int bucket_capacity, 55 | CscActNodes cmprs_gathered); 56 | -------------------------------------------------------------------------------- /src/Layer.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "Layer.h" 6 | #include "kernel.h" 7 | #include "utils.h" 8 | 9 | Layer::Adam::Adam(int size) { 10 | CUDA_CHECK(cudaMalloc(&d_ts, sizeof(float) * size)); 11 | CUDA_CHECK(cudaMalloc(&d_moms, sizeof(float) * size)); 12 | CUDA_CHECK(cudaMalloc(&d_vels, sizeof(float) * size)); 13 | 14 | CUDA_CHECK(cudaMemset(d_ts, 0, sizeof(float) * size)); 15 | CUDA_CHECK(cudaMemset(d_moms, 0, sizeof(float) * size)); 16 | CUDA_CHECK(cudaMemset(d_vels, 0, sizeof(float) * size)); 17 | } 18 | 19 | Layer::Adam::~Adam() { 20 | CUDA_CHECK(cudaFree(d_ts)); 21 | CUDA_CHECK(cudaFree(d_moms)); 22 | CUDA_CHECK(cudaFree(d_vels)); 23 | } 24 | 25 | Layer::Layer(const int prev_node_num, const int node_num, 26 | const int max_batch_size, const int node_capacity) 27 | : prev_node_num(prev_node_num), 28 | node_num(node_num), 29 | weight_adam(prev_node_num * node_num), 30 | bias_adam(node_num), 31 | csc_acts(max_batch_size, node_capacity) { 32 | const int weight_size = prev_node_num * node_num; 33 | CUDA_CHECK(cudaMalloc(&d_weights, sizeof(float) * weight_size)); 34 | CUDA_CHECK(cudaMalloc(&d_biases, sizeof(float) * node_num)); 35 | 36 | std::vector tmp_weights(weight_size); 37 | std::vector tmp_biases(node_num); 38 | 39 | std::random_device rd; 40 | std::default_random_engine dre(rd()); 41 | std::normal_distribution distribution(0.0, 0.01); 42 | 43 | std::generate(tmp_weights.begin(), tmp_weights.end(), 44 | [&]() { return distribution(dre); }); 45 | std::generate(tmp_biases.begin(), tmp_biases.end(), 46 | [&]() { return distribution(dre); }); 47 | 48 | CUDA_CHECK(cudaMemcpy(d_weights, &tmp_weights[0], sizeof(float) * weight_size, 49 | cudaMemcpyHostToDevice)); 50 | CUDA_CHECK(cudaMemcpy(d_biases, &tmp_biases[0], sizeof(float) * node_num, 51 | cudaMemcpyHostToDevice)); 52 | 53 | CUDA_CHECK(cudaMalloc(&d_cmprs_bp_deltas, sizeof(float) * node_capacity)); 54 | CUDA_CHECK(cudaMemset(d_cmprs_bp_deltas, 0, sizeof(float) * node_capacity)); 55 | } 56 | 57 | Layer::~Layer() { 58 | CUDA_CHECK(cudaFree(d_weights)); 59 | CUDA_CHECK(cudaFree(d_biases)); 60 | CUDA_CHECK(cudaFree(d_cmprs_bp_deltas)); 61 | csc_acts.free(); 62 | } 63 | 64 | void Layer::update_weights(const int thread_num, const float lr) { 65 | const int weight_size = prev_node_num * node_num; 66 | const int weight_block_num = (weight_size + thread_num - 1) / thread_num; 67 | update_weights_knl<<>>( 68 | d_weights, weight_adam.d_ts, weight_adam.d_moms, weight_adam.d_vels, lr, 69 | weight_size); 70 | } 71 | 72 | void Layer::update_biases(const int thread_num, const float lr) { 73 | const int bias_block_num = (node_num + thread_num - 1) / thread_num; 74 | update_weights_knl<<>>( 75 | d_biases, bias_adam.d_ts, bias_adam.d_moms, bias_adam.d_vels, lr, 76 | node_num); 77 | } 78 | -------------------------------------------------------------------------------- /include/kernel.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "CscActNodes.h" 3 | 4 | __global__ void get_dense_acts_knl(CscActNodes csc_acts, const int node_num, 5 | const int batch_size); 6 | 7 | __global__ void relu_fwd_slide_in_knl(const CscActNodes csc_inputs, 8 | const 
float *d_weights_colmajor, 9 | const float *d_biases, 10 | const int weight_row_num, 11 | const int max_out_num, 12 | CscActNodes csc_outputs); 13 | 14 | __global__ void softmax_fwd_bp_rowmajor_slide_in_knl( 15 | const CscActNodes csc_inputs, const float *d_weights_rowmajor, 16 | const float *d_biases, const CscActNodes cmprs_labels, 17 | const int weight_col_num, const int max_out_num, const int max_label_num, 18 | CscActNodes csc_outputs, float *d_cmpr_bp_deltas); 19 | 20 | __global__ void softmax_fwd_bp_rowmajor_slide_out_knl( 21 | const CscActNodes csc_inputs, const float *d_weights_rowmajor, 22 | const float *d_biases, const CscActNodes cmprs_labels, 23 | const int weight_col_num, const int max_in_num, const int max_label_num, 24 | CscActNodes csc_outputs, float *d_cmprs_bp_deltas); 25 | 26 | __global__ void softmax_fwd_bp_rowmajor_all_sm_knl( 27 | const CscActNodes csc_inputs, const float *d_weights_rowmajor, 28 | const float *d_biases, const CscActNodes cmprs_labels, 29 | const int weight_col_num, const int max_in_num, const int max_out_num, 30 | const int max_label_num, CscActNodes csc_outputs, float *d_cmprs_bp_deltas); 31 | 32 | __global__ void bp_knl(const CscActNodes csc_acts, const CscActNodes csc_prev, 33 | const float *d_weights_colmajor, 34 | const float *d_cmpr_bp_deltas, const int weight_row_num, 35 | const int max_act_num, float *d_cmpr_prev_bp_deltas, 36 | float *d_adam_ts, float *d_bias_adam_ts); 37 | 38 | __global__ void bp_rowmajor_knl(const CscActNodes csc_acts, 39 | const CscActNodes csc_prev, 40 | const float *d_weights_rowmajor, 41 | const float *d_cmpr_bp_deltas, 42 | const int weight_col_num, const int max_act_num, 43 | float *d_cmpr_prev_bp_deltas, float *d_adam_ts, 44 | float *d_bias_adam_ts); 45 | 46 | __global__ void bp_rowmajor_no_sm_knl(const CscActNodes csc_acts, 47 | const CscActNodes csc_prev, 48 | const float *d_weights_rowmajor, 49 | const float *d_cmprs_bp_deltas, 50 | const int weight_col_num, 51 | float *d_cmprs_prev_bp_deltas, 52 | float *d_adam_ts, float *d_bias_adam_ts); 53 | 54 | __global__ void bp_rowmajor_slide_knl( 55 | const CscActNodes csc_acts, const CscActNodes csc_prev, 56 | const float *d_weights_rowmajor, const float *d_cmprs_bp_deltas, 57 | const int weight_col_num, const int max_prev_num, 58 | float *d_cmprs_prev_bp_deltas, float *d_adam_ts, float *d_bias_adam_ts); 59 | 60 | __global__ void bp_first_layer_knl(const CscActNodes csc_acts, 61 | const CscActNodes csc_prev, 62 | const float *d_cmpr_bp_deltas, 63 | const int weight_row_num, 64 | const int max_act_num, float *d_adam_ts, 65 | float *d_bias_adam_ts); 66 | 67 | __global__ void update_weights_knl(float *d_weights, float *d_adam_ts, 68 | float *d_adam_moms, float *d_adam_vels, 69 | const float lr, const int weight_size); 70 | -------------------------------------------------------------------------------- /include/GPUMultiLinkedHashTable.h: -------------------------------------------------------------------------------- 1 | #include "CscActNodes.h" 2 | 3 | struct GPUMultiLinkedHashTable { 4 | const int max_tbl_num; 5 | const size_t bucket_num_per_tbl; 6 | const size_t pool_size; 7 | 8 | int *d_multi_tbl_keys; 9 | int *d_multi_tbl_vals; 10 | int *d_multi_tbl_nexts; 11 | int *d_multi_tbl_locks; 12 | int *d_multi_tbl_sizes; 13 | int *d_multi_tbl_pool_used_sizes; 14 | 15 | int threshold; 16 | 17 | struct filter { 18 | int threshold; 19 | 20 | filter(int threshold) : threshold(threshold) {} 21 | 22 | __device__ bool operator()(const int cnt) { return cnt >= threshold; } 23 | }; 24 
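// Layout note: each of the max_tbl_num tables owns (bucket_num_per_tbl + pool_size) slots of the key/val/next arrays; the first bucket_num_per_tbl slots are bucket heads, and the remainder forms an overflow pool chained through d_multi_tbl_nexts.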
| 25 | GPUMultiLinkedHashTable(const int max_tbl_num, 26 | const size_t bucket_num_per_tbl, 27 | const size_t pool_size, const int threshold); 28 | 29 | // virtual ~GPUMultiLinkedHashTable(); 30 | 31 | virtual void free(); 32 | 33 | void init_tbls(); 34 | 35 | __forceinline__ __device__ int d_hashier(const int key) { 36 | return key * 2654435761 & (~(1 << 31)); 37 | } 38 | 39 | __device__ void d_block_reduce_cnt(const int *d_raw_keys, 40 | const int raw_key_begin, 41 | const int raw_key_end, const int tbl_id); 42 | 43 | __forceinline__ __device__ void d_insert_label_seq( 44 | const int label, int *d_tbl_keys, int *d_tbl_vals, int *d_tbl_nexts, 45 | int &tbl_size, int &tbl_pool_used_size) { 46 | const int bucket_idx = d_hashier(label) % bucket_num_per_tbl; 47 | 48 | int pre_entry_idx = -1, entry_idx = bucket_idx; 49 | int searched_key = d_tbl_keys[bucket_idx]; 50 | if (searched_key != -1 && searched_key != label) { 51 | do { 52 | pre_entry_idx = entry_idx; 53 | entry_idx = d_tbl_nexts[pre_entry_idx]; 54 | } while (entry_idx != -1 && 55 | (searched_key = d_tbl_keys[entry_idx]) != label); 56 | } 57 | 58 | if (searched_key == label) { 59 | const int old_val = d_tbl_vals[entry_idx]; 60 | if (old_val < threshold) ++tbl_size; 61 | d_tbl_vals[entry_idx] = old_val + threshold; 62 | } else { 63 | if (entry_idx == -1) 64 | entry_idx = tbl_pool_used_size++ + bucket_num_per_tbl; 65 | d_tbl_keys[entry_idx] = label; 66 | d_tbl_vals[entry_idx] = threshold; 67 | if (pre_entry_idx != -1) d_tbl_nexts[pre_entry_idx] = entry_idx; 68 | ++tbl_size; 69 | } 70 | } 71 | 72 | __device__ void d_activate_labels_seq(const int *d_labels, 73 | const int label_begin, 74 | const int label_end, const int tbl_id); 75 | 76 | __device__ void d_activate_labels_seq(const int *d_labels, 77 | const int *d_rand_nodes, 78 | const int label_begin, 79 | const int label_end, const int tbl_id, 80 | const int min_act_num, 81 | const int node_num, const int seed); 82 | 83 | void block_reduce_cnt(const CscActNodes &cmprs_gathered, const int L, 84 | const int batch_size, const int thread_num); 85 | 86 | void activate_labels_seq(const CscActNodes &cmprs_labels, 87 | const int batch_size, const int thread_num); 88 | 89 | void activate_labels_seq(const CscActNodes &cmprs_labels, 90 | const int *d_rand_nodes, const int batch_size, 91 | const int min_act_num, const int node_num, 92 | const int thread_num); 93 | 94 | void get_act_nodes(CscActNodes &csc_acts, const int batch_size); 95 | }; 96 | 97 | __global__ void block_reduce_cnt_knl( 98 | const CscActNodes cmprs_gathered, const int L, 99 | GPUMultiLinkedHashTable multi_linked_htables); 100 | 101 | __global__ void activate_labels_seq_knl( 102 | const CscActNodes cmprs_labels, const int batch_size, 103 | GPUMultiLinkedHashTable multi_linked_htables); 104 | 105 | __global__ void activate_labels_seq_knl( 106 | const CscActNodes cmprs_labels, const int *d_rand_nodes, 107 | const int batch_size, const int min_act_num, const int node_num, 108 | const int seed, GPUMultiLinkedHashTable multi_linked_htables); 109 | -------------------------------------------------------------------------------- /src/CscActNodes.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "CscActNodes.h" 4 | #include "utils.h" 5 | 6 | CscActNodes::CscActNodes(int max_batch_size, int node_capacity, 7 | bool val_enabled, bool is_managed) 8 | : max_batch_size(max_batch_size), 9 | node_capacity(node_capacity), 10 | val_enabled(val_enabled) { 11 | if (!is_managed) { 12 | 
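// Plain device allocations when unified (managed) memory is not requested; extract_from/extract_to then move data explicitly with cudaMemcpy.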
CUDA_CHECK(cudaMalloc(&d_nodes, sizeof(int) * node_capacity)); 13 | CUDA_CHECK(cudaMalloc(&d_offsets, sizeof(int) * (max_batch_size + 1))); 14 | if (val_enabled) 15 | CUDA_CHECK(cudaMalloc(&d_vals, sizeof(float) * node_capacity)); 16 | } else { 17 | CUDA_CHECK(cudaMallocManaged(&d_nodes, sizeof(int) * node_capacity)); 18 | CUDA_CHECK( 19 | cudaMallocManaged(&d_offsets, sizeof(int) * (max_batch_size + 1))); 20 | if (val_enabled) 21 | CUDA_CHECK(cudaMallocManaged(&d_vals, sizeof(float) * node_capacity)); 22 | } 23 | 24 | CUDA_CHECK(cudaMemset(d_nodes, 0, sizeof(int) * node_capacity)); 25 | CUDA_CHECK(cudaMemset(d_offsets, 0, sizeof(int) * (max_batch_size + 1))); 26 | if (val_enabled) 27 | CUDA_CHECK(cudaMemset(d_vals, 0, sizeof(float) * node_capacity)); 28 | } 29 | 30 | void CscActNodes::free() { 31 | CUDA_CHECK(cudaFree(d_nodes)); 32 | CUDA_CHECK(cudaFree(d_offsets)); 33 | if (val_enabled) CUDA_CHECK(cudaFree(d_vals)); 34 | } 35 | 36 | void CscActNodes::extract_from(const std::vector &h_cmprs_nodes, 37 | const std::vector &h_cmprs_vals, 38 | const std::vector &h_cmprs_offsets) { 39 | assert(h_cmprs_nodes.size() <= node_capacity); 40 | 41 | assert(val_enabled); 42 | 43 | CUDA_CHECK(cudaMemcpy(d_nodes, &h_cmprs_nodes[0], 44 | sizeof(int) * h_cmprs_nodes.size(), 45 | cudaMemcpyHostToDevice)); 46 | CUDA_CHECK(cudaMemcpy(d_vals, &h_cmprs_vals[0], 47 | sizeof(float) * h_cmprs_vals.size(), 48 | cudaMemcpyHostToDevice)); 49 | CUDA_CHECK(cudaMemcpy(d_offsets, &h_cmprs_offsets[0], 50 | sizeof(int) * h_cmprs_offsets.size(), 51 | cudaMemcpyHostToDevice)); 52 | } 53 | 54 | void CscActNodes::extract_from(const std::vector &h_cmprs_nodes, 55 | const std::vector &h_cmprs_offsets) { 56 | assert(h_cmprs_nodes.size() <= node_capacity); 57 | 58 | CUDA_CHECK(cudaMemcpy(d_nodes, &h_cmprs_nodes[0], 59 | sizeof(int) * h_cmprs_nodes.size(), 60 | cudaMemcpyHostToDevice)); 61 | CUDA_CHECK(cudaMemcpy(d_offsets, &h_cmprs_offsets[0], 62 | sizeof(int) * h_cmprs_offsets.size(), 63 | cudaMemcpyHostToDevice)); 64 | } 65 | 66 | void CscActNodes::extract_to(std::vector &h_cmprs_nodes, 67 | std::vector &h_cmprs_vals, 68 | std::vector &h_cmprs_offsets, 69 | const int batch_size) const { 70 | assert(val_enabled); 71 | 72 | h_cmprs_offsets = std::vector(batch_size + 1); 73 | CUDA_CHECK(cudaMemcpy(&h_cmprs_offsets[0], d_offsets, 74 | sizeof(int) * h_cmprs_offsets.size(), 75 | cudaMemcpyDeviceToHost)); 76 | 77 | int csc_size = h_cmprs_offsets.back(); 78 | h_cmprs_nodes = std::vector(csc_size); 79 | h_cmprs_vals = std::vector(csc_size); 80 | CUDA_CHECK(cudaMemcpy(&h_cmprs_nodes[0], d_nodes, sizeof(int) * csc_size, 81 | cudaMemcpyDeviceToHost)); 82 | CUDA_CHECK(cudaMemcpy(&h_cmprs_vals[0], d_vals, sizeof(float) * csc_size, 83 | cudaMemcpyDeviceToHost)); 84 | } 85 | 86 | void CscActNodes::extract_to(std::vector &h_cmprs_nodes, 87 | std::vector &h_cmprs_offsets, 88 | const int batch_size) const { 89 | h_cmprs_offsets = std::vector(batch_size + 1); 90 | CUDA_CHECK(cudaMemcpy(&h_cmprs_offsets[0], d_offsets, 91 | sizeof(int) * h_cmprs_offsets.size(), 92 | cudaMemcpyDeviceToHost)); 93 | 94 | int csc_size = h_cmprs_offsets.back(); 95 | h_cmprs_nodes = std::vector(csc_size); 96 | CUDA_CHECK(cudaMemcpy(&h_cmprs_nodes[0], d_nodes, sizeof(int) * csc_size, 97 | cudaMemcpyDeviceToHost)); 98 | } 99 | -------------------------------------------------------------------------------- /src/SoftmaxLayer.cu: -------------------------------------------------------------------------------- 1 | #include "GPUTimer.h" 2 | #include "SoftmaxLayer.h" 3 | 
#include "kernel.h" 4 | #include "utils.h" 5 | 6 | SoftmaxLayer::SoftmaxLayer(const int prev_node_num, const int node_num, 7 | const int max_batch_size, const int node_capacity, 8 | const int K, const int L, const int bin_size, 9 | const int bucket_num_per_tbl, 10 | const int bucket_capacity, const int threshold, 11 | const int min_act_num, const int tbl_num_per_tile, 12 | const int tbl_num_per_thread, 13 | const int linked_bucket_num_per_tbl, 14 | const int linked_pool_size) 15 | : Layer(prev_node_num, node_num, max_batch_size, node_capacity) { 16 | lsh_tbls_ptr = std::make_shared( 17 | node_num, prev_node_num, max_batch_size, K, L, bin_size, 18 | bucket_num_per_tbl, bucket_capacity, threshold, min_act_num, 19 | tbl_num_per_tile, tbl_num_per_thread, linked_bucket_num_per_tbl, 20 | linked_pool_size); 21 | 22 | GPUTimer timer; 23 | timer.start(); 24 | lsh_tbls_ptr->build(d_weights, true); 25 | timer.record("[Build LSH Table] "); 26 | 27 | CUBLAS_CHECK(cublasCreate(&handle)); 28 | CUDA_CHECK(cudaMallocManaged(&d_dense_activations, 29 | sizeof(float) * max_batch_size * node_num)); 30 | } 31 | 32 | SoftmaxLayer::~SoftmaxLayer() { 33 | CUBLAS_CHECK(cublasDestroy(handle)); 34 | CUDA_CHECK(cudaFree(d_dense_activations)); 35 | } 36 | 37 | void SoftmaxLayer::forward(const Layer &prev_layer, 38 | const CscActNodes &cmprs_labels, 39 | const int batch_size, const int thread_num, 40 | const int max_in_num, const int max_out_num, 41 | const int max_label_num) { 42 | assert(prev_layer.node_num == prev_node_num); 43 | 44 | lsh_tbls_ptr->query_act_nodes(prev_layer.csc_acts, cmprs_labels, batch_size, 45 | csc_acts); 46 | 47 | int smem_size = (sizeof(int) + sizeof(float)) * (max_in_num + max_out_num) + 48 | sizeof(int) * max_label_num; 49 | if (is_smem_enough((void *)softmax_fwd_bp_rowmajor_all_sm_knl, thread_num, 50 | smem_size)) { 51 | softmax_fwd_bp_rowmajor_all_sm_knl<<>>( 52 | prev_layer.csc_acts, d_weights, d_biases, cmprs_labels, prev_node_num, 53 | max_in_num, max_out_num, max_label_num, csc_acts, d_cmprs_bp_deltas); 54 | } else { 55 | smem_size = (sizeof(int) + sizeof(float)) * max_in_num + 56 | sizeof(int) * max_label_num; 57 | softmax_fwd_bp_rowmajor_slide_out_knl<<>>( 59 | prev_layer.csc_acts, d_weights, d_biases, cmprs_labels, prev_node_num, 60 | max_in_num, max_label_num, csc_acts, d_cmprs_bp_deltas); 61 | } 62 | } 63 | 64 | void SoftmaxLayer::forward_dense(const Layer &prev_layer, 65 | const int batch_size) { 66 | const float alpha = 1.; 67 | const float beta = 0.; 68 | 69 | CUBLAS_CHECK(cublasSgemm( 70 | handle, CUBLAS_OP_T, CUBLAS_OP_N, node_num, batch_size, prev_node_num, 71 | &alpha, d_weights, prev_node_num, prev_layer.csc_acts.d_vals, 72 | prev_node_num, &beta, d_dense_activations, node_num)); 73 | } 74 | 75 | void SoftmaxLayer::bp(Layer &prev_layer, const int batch_size, 76 | const int thread_num, const int max_prev_num, 77 | const int max_act_num) { 78 | int smem_size = (sizeof(int) + sizeof(float)) * max_act_num; 79 | if (is_smem_enough((void *)bp_rowmajor_knl, thread_num, smem_size)) { 80 | bp_rowmajor_knl<<>>( 81 | csc_acts, prev_layer.csc_acts, d_weights, d_cmprs_bp_deltas, 82 | prev_node_num, max_act_num, prev_layer.d_cmprs_bp_deltas, 83 | weight_adam.d_ts, bias_adam.d_ts); 84 | } else { 85 | // int smem_size = (sizeof(int) + sizeof(float) * 2) * max_prev_num; 86 | // bp_rowmajor_slide_knl<<>>( 87 | // csc_acts, prev_layer.csc_acts, d_weights, d_cmprs_bp_deltas, 88 | // prev_node_num, max_prev_num, prev_layer.d_cmprs_bp_deltas, 89 | // weight_adam.d_ts, bias_adam.d_ts); 90 | 
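// Shared memory cannot hold the activation buffers here, so fall back to the variant that works directly in global memory.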
bp_rowmajor_no_sm_knl<<>>( 91 | csc_acts, prev_layer.csc_acts, d_weights, d_cmprs_bp_deltas, 92 | prev_node_num, prev_layer.d_cmprs_bp_deltas, weight_adam.d_ts, 93 | bias_adam.d_ts); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/Network.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "GPUTimer.h" 6 | #include "Network.h" 7 | 8 | Network::Network(const std::vector &node_num_per_layer, 9 | const std::vector &node_capacity_per_layer, 10 | const int input_size, const int max_batch_size, 11 | const int input_capacity, const int label_capacity, 12 | const int K, const int L, const int bin_size, 13 | const int bucket_num_per_tbl, const int bucket_capacity, 14 | const int threshold, const int min_softmax_act_num, 15 | const int tbl_num_per_tile, const int tbl_num_per_thread, 16 | const int linked_bucket_num_per_tbl, 17 | const int linked_pool_size) 18 | : csc_inputs(max_batch_size, input_capacity), 19 | cmprs_labels(max_batch_size, label_capacity, false), 20 | layer_num(node_num_per_layer.size()) { 21 | assert(layer_num >= 2); 22 | assert(layer_num == node_capacity_per_layer.size()); 23 | 24 | relu_layers = std::vector>(layer_num - 1); 25 | for (int l = 0; l < layer_num; ++l) { 26 | if (l == 0) { 27 | relu_layers[l] = std::make_shared( 28 | input_size, node_num_per_layer[l], max_batch_size, 29 | node_capacity_per_layer[l]); 30 | } else if (l + 1 == layer_num) { 31 | softmax_layer = std::make_shared( 32 | node_num_per_layer[l - 1], node_num_per_layer[l], max_batch_size, 33 | node_capacity_per_layer[l], K, L, bin_size, bucket_num_per_tbl, 34 | bucket_capacity, threshold, min_softmax_act_num, tbl_num_per_tile, 35 | tbl_num_per_thread, linked_bucket_num_per_tbl, linked_pool_size); 36 | } else { 37 | relu_layers[l] = std::make_shared( 38 | node_num_per_layer[l - 1], node_num_per_layer[l], max_batch_size, 39 | node_capacity_per_layer[l]); 40 | } 41 | } 42 | } 43 | 44 | Network::~Network() { 45 | csc_inputs.free(); 46 | cmprs_labels.free(); 47 | } 48 | 49 | int Network::eval(const std::vector &h_cmprs_input_nodes, 50 | const std::vector &h_cmprs_input_vals, 51 | const std::vector &h_cmprs_input_offsets, 52 | const std::vector &h_cmprs_label_nodes, 53 | const std::vector &h_cmprs_label_offsets, 54 | const int batch_size, const int thread_num) { 55 | // forward 56 | csc_inputs.extract_from(h_cmprs_input_nodes, h_cmprs_input_vals, 57 | h_cmprs_input_offsets); 58 | 59 | for (int l = 0; l < relu_layers.size(); ++l) { 60 | const int node_num = relu_layers[l]->node_num; 61 | if (l == 0) { 62 | relu_layers[l]->forward(csc_inputs, batch_size, thread_num, node_num); 63 | } else { 64 | relu_layers[l]->forward(*relu_layers[l - 1], batch_size, thread_num, 65 | node_num); 66 | } 67 | } 68 | softmax_layer->forward_dense(*relu_layers.back(), batch_size); 69 | 70 | CUDA_CHECK(cudaDeviceSynchronize()); 71 | 72 | int correct_cnt = 0; 73 | for (int b = 0; b < batch_size; b++) { 74 | const float *begin = 75 | softmax_layer->d_dense_activations + b * softmax_layer->node_num; 76 | const float *end = begin + softmax_layer->node_num; 77 | const int max_node = 78 | thrust::max_element(thrust::device, begin, end) - begin; 79 | 80 | const int label_begin = h_cmprs_label_offsets[b]; 81 | const int label_end = h_cmprs_label_offsets[b + 1]; 82 | if (std::find(h_cmprs_label_nodes.begin() + label_begin, 83 | h_cmprs_label_nodes.begin() + label_end, 84 | max_node) != 
h_cmprs_label_nodes.begin() + label_end) { 85 | correct_cnt++; 86 | } 87 | } 88 | 89 | return correct_cnt; 90 | } 91 | 92 | void Network::train(const std::vector &h_cmprs_input_nodes, 93 | const std::vector &h_cmprs_input_vals, 94 | const std::vector &h_cmprs_input_offsets, 95 | const std::vector &h_cmprs_label_nodes, 96 | const std::vector &h_cmprs_label_offsets, 97 | const std::vector &max_act_nums, const int batch_size, 98 | const float lr, const int max_label_num, 99 | const int thread_num, const bool rebuild, 100 | const bool reshuffle) { 101 | GPUTimer timer; 102 | 103 | csc_inputs.extract_from(h_cmprs_input_nodes, h_cmprs_input_vals, 104 | h_cmprs_input_offsets); 105 | cmprs_labels.extract_from(h_cmprs_label_nodes, h_cmprs_label_offsets); 106 | 107 | // forward 108 | timer.start(); 109 | for (int l = 0; l < relu_layers.size(); ++l) { 110 | if (l == 0) { 111 | relu_layers[l]->forward(csc_inputs, batch_size, thread_num, 112 | max_act_nums[l]); 113 | } else { 114 | relu_layers[l]->forward(*relu_layers[l - 1], batch_size, thread_num, 115 | max_act_nums[l]); 116 | } 117 | // timer.record("[FW " + std::to_string(l) + "] "); 118 | } 119 | softmax_layer->forward(*relu_layers.back(), cmprs_labels, batch_size, 120 | thread_num, *(max_act_nums.end() - 2), 121 | max_act_nums.back(), max_label_num); 122 | // timer.record("[FW " + std::to_string(layer_num - 1) + "] "); 123 | 124 | // backpropagate 125 | softmax_layer->bp(*relu_layers.back(), batch_size, thread_num, 126 | *(max_act_nums.end() - 2), max_act_nums.back()); 127 | // timer.record("[BP " + std::to_string(layer_num - 1) + "] "); 128 | for (int l = relu_layers.size() - 1; l >= 0; --l) { 129 | if (l == 0) { 130 | relu_layers[l]->bp_first_layer(csc_inputs, batch_size, thread_num, 131 | max_act_nums[l]); 132 | } else { 133 | relu_layers[l]->bp(*relu_layers[l - 1], batch_size, thread_num, 134 | max_act_nums[l]); 135 | } 136 | // timer.record("[BP " + std::to_string(l) + "] "); 137 | } 138 | 139 | // update 140 | for (int l = 0; l < relu_layers.size(); ++l) { 141 | relu_layers[l]->update_weights(thread_num, lr); 142 | relu_layers[l]->update_biases(thread_num, lr); 143 | // timer.record("[UPDATE " + std::to_string(l) + "] "); 144 | } 145 | softmax_layer->update_weights(thread_num, lr); 146 | softmax_layer->update_biases(thread_num, lr); 147 | // timer.record("[UPDATE " + std::to_string(layer_num - 1) + "] "); 148 | 149 | if (rebuild || reshuffle) { 150 | softmax_layer->rebuild(reshuffle); 151 | // timer.record("[REBUILD] "); 152 | } 153 | 154 | CUDA_CHECK(cudaDeviceSynchronize()); 155 | } 156 | -------------------------------------------------------------------------------- /src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "GPUTimer.h" 9 | #include "Network.h" 10 | #include "json/json.h" 11 | #include "utils.h" 12 | 13 | std::vector jarr_to_vec(const Json::Value &jarr) { 14 | const size_t size = jarr.size(); 15 | std::vector res(size); 16 | for (int i = 0; i < size; ++i) { 17 | res[i] = jarr[i].asInt(); 18 | } 19 | 20 | return res; 21 | } 22 | 23 | // Return: real batch size 24 | int get_batch_data(std::ifstream &ist, std::vector &h_cmprs_input_nodes, 25 | std::vector &h_cmprs_input_vals, 26 | std::vector &h_cmprs_input_offsets, 27 | std::vector &h_cmprs_labels, 28 | std::vector &h_cmprs_label_offsets, 29 | const int batch_size) { 30 | h_cmprs_input_nodes.clear(); 31 | h_cmprs_input_vals.clear(); 32 | 
h_cmprs_input_offsets.clear(); 33 | h_cmprs_labels.clear(); 34 | h_cmprs_label_offsets.clear(); 35 | 36 | h_cmprs_input_offsets.push_back(0); 37 | h_cmprs_label_offsets.push_back(0); 38 | 39 | for (int b = 0; b < batch_size; ++b) { 40 | int label; 41 | if (ist >> label) { 42 | h_cmprs_labels.push_back(label); 43 | } else { 44 | return b; 45 | } 46 | 47 | while (ist.get() == ',') { 48 | ist >> label; 49 | h_cmprs_labels.push_back(label); 50 | } 51 | h_cmprs_label_offsets.push_back(h_cmprs_labels.size()); 52 | 53 | do { 54 | int node; 55 | ist >> node; 56 | assert(ist.get() == ':'); 57 | 58 | float val; 59 | ist >> val; 60 | h_cmprs_input_nodes.push_back(node); 61 | h_cmprs_input_vals.push_back(val); 62 | } while (ist.get() == ' '); 63 | h_cmprs_input_offsets.push_back(h_cmprs_input_nodes.size()); 64 | } 65 | 66 | return batch_size; 67 | } 68 | 69 | int main(int argc, char *argv[]) { 70 | if (argc != 2) { 71 | printf("Usage: %s config.json\n", argv[0]); 72 | exit(1); 73 | } 74 | 75 | Json::Reader reader; 76 | Json::Value root; 77 | std::ifstream config_ist(argv[1]); 78 | if (!reader.parse(config_ist, root)) { 79 | printf("Parse %s failed!\n", argv[1]); 80 | exit(1); 81 | } 82 | 83 | const std::vector node_num_per_layer = 84 | jarr_to_vec(root["node_num_per_layer"]); 85 | // const std::vector node_capacity_per_layer = 86 | // jarr_to_vec(root["node_capacity_per_layer"]); 87 | const int input_size = root["input_size"].asInt(); 88 | const int max_batch_size = root["max_batch_size"].asInt(); 89 | // const int input_capacity = root["input_capacity"].asInt(); 90 | // const int label_capacity = root["label_capacity"].asInt(); 91 | const int K = root["K"].asInt(), L = root["L"].asInt(); 92 | const int bin_size = root["bin_size"].asInt(); 93 | // const int bucket_num_per_tbl = root["bucket_num_per_tbl"].asInt(); 94 | int bucket_num_per_tbl = 1; 95 | for (int i = 0; i < K; ++i) { 96 | bucket_num_per_tbl += (bin_size - 1) << ((K - 1 - i) * (int)log(bin_size)); 97 | } 98 | const int bucket_capacity = root["bucket_capacity"].asInt(); 99 | const int threshold = root["threshold"].asInt(); 100 | const int min_softmax_act_num = root["min_softmax_act_num"].asInt(); 101 | const int tbl_num_per_tile = root["tbl_num_per_tile"].asInt(); 102 | const int tbl_num_per_thread = root["tbl_num_per_thread"].asInt(); 103 | // const int linked_bucket_num_per_tbl = 104 | // root["linked_bucket_num_per_tbl"].asInt(); 105 | // const int linked_pool_size = root["linked_pool_size"].asInt(); 106 | const int linked_bucket_num_per_tbl = bucket_capacity * L * 2; 107 | const int linked_pool_size = bucket_capacity * L; 108 | 109 | const int max_input_num = root["max_input_num"].asInt(); 110 | const std::vector max_act_nums = jarr_to_vec(root["max_act_nums"]); 111 | const int max_label_num = root["max_label_num"].asInt(); 112 | 113 | const int input_capacity = max_input_num * max_batch_size; 114 | const int label_capacity = max_label_num * max_batch_size; 115 | std::vector node_capacity_per_layer(node_num_per_layer.size()); 116 | for (int i = 0; i < node_num_per_layer.size(); ++i) { 117 | node_capacity_per_layer[i] = max_act_nums[i] * max_batch_size; 118 | } 119 | 120 | Network network(node_num_per_layer, node_capacity_per_layer, input_size, 121 | max_batch_size, input_capacity, label_capacity, K, L, 122 | bin_size, bucket_num_per_tbl, bucket_capacity, threshold, 123 | min_softmax_act_num, tbl_num_per_tile, tbl_num_per_thread, 124 | linked_bucket_num_per_tbl, linked_pool_size); 125 | 126 | const float lr = root["lr"].asFloat(); 127 | 
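// BETA1/BETA2 are the Adam moment decay rates; the per-iteration learning rate below applies the standard Adam bias correction sqrt(1 - BETA2^t) / (1 - BETA1^t).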
const float BETA1 = root["BETA1"].asFloat(); 128 | const float BETA2 = root["BETA2"].asFloat(); 129 | const int rebuild_period = root["rebuild_period"].asInt() / max_batch_size; 130 | const int reshuffle_period = 131 | root["reshuffle_period"].asInt() / max_batch_size; 132 | const int thread_num = root["thread_num"].asInt(); 133 | const int epoch_num = root["epoch_num"].asInt(); 134 | 135 | const std::string train_fname = root["train_fname"].asString(); 136 | const std::string test_fname = root["test_fname"].asString(); 137 | 138 | GPUTimer timer; 139 | float tot_time = 0; 140 | 141 | int glb_itr = 0; 142 | for (int e = 0; e < epoch_num; e++) { 143 | printf("------------------- Epoch %d ---------------------\n", e); 144 | std::ifstream train_ist(train_fname); 145 | std::ifstream test_ist(test_fname); 146 | if (!train_ist || !test_ist) { 147 | std::cerr << "Cannot open dataset file!" << std::endl; 148 | exit(-1); 149 | } 150 | 151 | std::string header; 152 | std::getline(train_ist, header); // skip header 153 | std::getline(test_ist, header); // skip header 154 | 155 | int batch_size; 156 | do { 157 | std::vector h_cmprs_input_nodes; 158 | std::vector h_cmprs_input_vals; 159 | std::vector h_cmprs_input_offsets; 160 | std::vector h_cmprs_labels; 161 | std::vector h_cmprs_label_offsets; 162 | batch_size = 163 | get_batch_data(train_ist, h_cmprs_input_nodes, h_cmprs_input_vals, 164 | h_cmprs_input_offsets, h_cmprs_labels, 165 | h_cmprs_label_offsets, max_batch_size); 166 | 167 | if (batch_size == 0) break; 168 | 169 | const float tmplr = lr * sqrt((1 - pow(BETA2, glb_itr + 1))) / 170 | (1 - pow(BETA1, glb_itr + 1)); 171 | const bool rebuild = (glb_itr + 1) % rebuild_period == 0; 172 | const bool reshuffle = (glb_itr + 1) % reshuffle_period == 0; 173 | 174 | timer.start(); 175 | 176 | network.train(h_cmprs_input_nodes, h_cmprs_input_vals, 177 | h_cmprs_input_offsets, h_cmprs_labels, 178 | h_cmprs_label_offsets, max_act_nums, batch_size, tmplr, 179 | max_label_num, thread_num, rebuild, reshuffle); 180 | 181 | tot_time += timer.record("[Iteration " + std::to_string(glb_itr) + "] "); 182 | 183 | glb_itr++; 184 | } while (batch_size == max_batch_size); 185 | 186 | printf("Current elapsed time %f ms\n", tot_time); 187 | 188 | // eval 189 | int correct_cnt = 0, test_cnt = 0; 190 | do { 191 | std::vector h_cmprs_input_nodes; 192 | std::vector h_cmprs_input_vals; 193 | std::vector h_cmprs_input_offsets; 194 | std::vector h_cmprs_labels; 195 | std::vector h_cmprs_label_offsets; 196 | batch_size = 197 | get_batch_data(test_ist, h_cmprs_input_nodes, h_cmprs_input_vals, 198 | h_cmprs_input_offsets, h_cmprs_labels, 199 | h_cmprs_label_offsets, max_batch_size); 200 | 201 | if (batch_size == 0) break; 202 | 203 | correct_cnt += network.eval( 204 | h_cmprs_input_nodes, h_cmprs_input_vals, h_cmprs_input_offsets, 205 | h_cmprs_labels, h_cmprs_label_offsets, batch_size, thread_num); 206 | 207 | test_cnt += batch_size; 208 | } while (batch_size == max_batch_size); 209 | 210 | printf("Test %d records, %d correct; accuracy: %f\n", test_cnt, correct_cnt, 211 | ((float)correct_cnt) / test_cnt); 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /src/LSH.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "LSH.h" 5 | #include "lshKnl.h" 6 | #include "utils.h" 7 | 8 | LSH::LSH(const int node_num, const int prev_node_num, const int max_batch_size, 9 | const int K, const int L, const int 
bin_size, 10 | const int bucket_num_per_tbl, const int bucket_capacity, 11 | const int threshold, const int min_act_num, const int tbl_num_per_tile, 12 | const int tbl_num_per_thread, const int linked_bucket_num_per_tbl, 13 | const int linked_pool_size) 14 | : node_num(node_num), 15 | prev_node_num(prev_node_num), 16 | K(K), 17 | L(L), 18 | bin_size(bin_size), 19 | bucket_num_per_tbl(bucket_num_per_tbl), 20 | bucket_capacity(bucket_capacity), 21 | min_act_num(min_act_num), 22 | tot_elem_num(K * L * bin_size), 23 | ceil_elem_num((K * L * bin_size + prev_node_num - 1) / prev_node_num * 24 | prev_node_num), 25 | tbl_num_per_tile(tbl_num_per_tile), 26 | tbl_num_per_thread(tbl_num_per_thread), 27 | cmprs_gathered(L * max_batch_size, bucket_capacity * L * max_batch_size, 28 | false, true), 29 | multi_linked_htables(max_batch_size, linked_bucket_num_per_tbl, 30 | linked_pool_size, threshold) { 31 | CUDA_CHECK( 32 | cudaMallocManaged(&d_rand_keys, sizeof(unsigned int) * ceil_elem_num)); 33 | CUDA_CHECK(cudaMalloc(&d_bins, sizeof(int) * ceil_elem_num)); 34 | 35 | const int thread_num = 128; 36 | const int block_num = (ceil_elem_num + thread_num - 1) / thread_num; 37 | init_bins_knl<<>>(d_bins, prev_node_num, 38 | ceil_elem_num); 39 | 40 | CUDA_CHECK(cudaMallocManaged(&d_rand_node_keys, sizeof(int) * node_num)); 41 | CUDA_CHECK(cudaMalloc(&d_rand_nodes, sizeof(int) * node_num)); 42 | thrust::sequence(thrust::device, d_rand_nodes, d_rand_nodes + node_num); 43 | 44 | const size_t tot_bucket_num = L * bucket_num_per_tbl; 45 | const size_t tot_bucket_capacity = tot_bucket_num * bucket_capacity; 46 | CUDA_CHECK(cudaMallocManaged(&d_buckets, sizeof(int) * tot_bucket_capacity)); 47 | CUDA_CHECK(cudaMallocManaged(&d_bucket_sizes, sizeof(int) * tot_bucket_num)); 48 | 49 | CUDA_CHECK(cudaMallocManaged(&d_hashed_bucket_ids_colmajor, 50 | sizeof(int) * L * max_batch_size)); 51 | } 52 | 53 | LSH::~LSH() { 54 | CUDA_CHECK(cudaFree(d_rand_keys)); 55 | CUDA_CHECK(cudaFree(d_bins)); 56 | 57 | CUDA_CHECK(cudaFree(d_buckets)); 58 | CUDA_CHECK(cudaFree(d_bucket_sizes)); 59 | 60 | CUDA_CHECK(cudaFree(d_hashed_bucket_ids_colmajor)); 61 | 62 | cmprs_gathered.free(); 63 | multi_linked_htables.free(); 64 | } 65 | 66 | void LSH::shuffle_bins() { 67 | const int thread_num = 128; 68 | const int block_num = (ceil_elem_num + thread_num - 1) / thread_num; 69 | gen_rand_keys_knl<<>>(d_rand_keys, rand(), 70 | prev_node_num, ceil_elem_num); 71 | thrust::sort_by_key(thrust::device, d_rand_keys, d_rand_keys + ceil_elem_num, 72 | d_bins); 73 | } 74 | 75 | void LSH::shuffle_rand() { 76 | const int thread_num = 128; 77 | const int block_num = (node_num + thread_num - 1) / thread_num; 78 | gen_rand_keys_knl<<>>(d_rand_node_keys, rand(), 79 | node_num); 80 | thrust::sort_by_key(thrust::device, d_rand_node_keys, 81 | d_rand_node_keys + node_num, d_rand_nodes); 82 | } 83 | 84 | void LSH::build(const float *d_weights_rowmajor, const bool reshuffle) { 85 | if (reshuffle) { 86 | shuffle_bins(); 87 | shuffle_rand(); 88 | } 89 | 90 | CUDA_CHECK( 91 | cudaMemset(d_bucket_sizes, 0, sizeof(int) * L * bucket_num_per_tbl)); 92 | 93 | int thread_num; 94 | bool success_flag = false; 95 | for (thread_num = 128; thread_num >= 32 && !success_flag; thread_num >>= 1) { 96 | const int block_num = (node_num + thread_num - 1) / thread_num; 97 | const int smem_size = 98 | (K * bin_size * tbl_num_per_tile + thread_num * prev_node_num) * 99 | sizeof(int); 100 | if (is_smem_enough((void *)init_hash_knl, thread_num, smem_size)) { 101 | if (tbl_num_per_thread == 
tbl_num_per_tile) { 102 | init_hash_tt_knl<<>>( 103 | d_bins, d_weights_rowmajor, prev_node_num, node_num, tot_elem_num, 104 | L, K, bin_size, tbl_num_per_tile, bucket_num_per_tbl, 105 | bucket_capacity, d_buckets, d_bucket_sizes); 106 | } else { 107 | init_hash_knl<<>>( 108 | d_bins, d_weights_rowmajor, prev_node_num, node_num, tot_elem_num, 109 | L, K, bin_size, tbl_num_per_tile, tbl_num_per_thread, 110 | bucket_num_per_tbl, bucket_capacity, d_buckets, d_bucket_sizes); 111 | } 112 | 113 | success_flag = true; 114 | } 115 | } 116 | 117 | if (!success_flag) { // TODO 118 | thread_num = 128; 119 | const int block_num = (L + tbl_num_per_tile) / tbl_num_per_tile; 120 | const int smem_size = sizeof(int) * K * bin_size * tbl_num_per_tile; 121 | init_hash_no_sw_knl<<>>( 122 | d_bins, d_weights_rowmajor, prev_node_num, node_num, tot_elem_num, L, K, 123 | bin_size, tbl_num_per_tile, bucket_num_per_tbl, bucket_capacity, 124 | d_buckets, d_bucket_sizes); 125 | } 126 | } 127 | 128 | void LSH::query_act_nodes(const CscActNodes &csc_inputs, 129 | const CscActNodes &cmprs_labels, const int batch_size, 130 | CscActNodes &csc_acts) { 131 | // Assume inputs is dense 132 | // TODO: dense -> sparse transform 133 | const float *d_dense_inputs_colmajor = csc_inputs.d_vals; 134 | 135 | const int thread_num = 128; 136 | const int hash_block_num = (L + tbl_num_per_tile - 1) / tbl_num_per_tile; 137 | const int smem_size = sizeof(int) * K * bin_size * tbl_num_per_tile; 138 | get_hash_knl<<>>( 139 | d_bins, d_dense_inputs_colmajor, d_bucket_sizes, prev_node_num, 140 | tot_elem_num, L, K, bin_size, tbl_num_per_tile, batch_size, 141 | bucket_num_per_tbl, bucket_capacity, d_hashed_bucket_ids_colmajor, 142 | cmprs_gathered.d_offsets + 1); 143 | 144 | thrust::inclusive_scan(thrust::device, cmprs_gathered.d_offsets + 1, 145 | cmprs_gathered.d_offsets + 1 + L * batch_size, 146 | cmprs_gathered.d_offsets + 1); 147 | 148 | const int gather_block_num = (L * batch_size + thread_num - 1) / thread_num; 149 | gather_buckets_knl<<>>( 150 | d_hashed_bucket_ids_colmajor, d_buckets, L, batch_size, 151 | bucket_num_per_tbl, bucket_capacity, cmprs_gathered); 152 | 153 | multi_linked_htables.init_tbls(); 154 | multi_linked_htables.block_reduce_cnt(cmprs_gathered, L, batch_size, 155 | thread_num); 156 | multi_linked_htables.activate_labels_seq(cmprs_labels, d_rand_nodes, 157 | batch_size, min_act_num, node_num, 158 | thread_num); 159 | multi_linked_htables.get_act_nodes(csc_acts, batch_size); 160 | } 161 | 162 | void LSH::query_act_nodes(const CscActNodes &csc_inputs, const int batch_size, 163 | CscActNodes &csc_acts) { 164 | // Assume inputs is dense 165 | // TODO: dense -> sparse transform 166 | const float *d_dense_inputs_colmajor = csc_inputs.d_vals; 167 | 168 | const int thread_num = 128; 169 | const int hash_block_num = (L + tbl_num_per_tile - 1) / tbl_num_per_tile; 170 | const int smem_size = sizeof(int) * K * bin_size * tbl_num_per_tile; 171 | get_hash_knl<<>>( 172 | d_bins, d_dense_inputs_colmajor, d_bucket_sizes, prev_node_num, 173 | tot_elem_num, L, K, bin_size, tbl_num_per_tile, batch_size, 174 | bucket_num_per_tbl, bucket_capacity, d_hashed_bucket_ids_colmajor, 175 | cmprs_gathered.d_offsets + 1); 176 | 177 | thrust::inclusive_scan(thrust::device, cmprs_gathered.d_offsets + 1, 178 | cmprs_gathered.d_offsets + 1 + L * batch_size, 179 | cmprs_gathered.d_offsets + 1); 180 | 181 | const int gather_block_num = (L * batch_size + thread_num - 1) / thread_num; 182 | gather_buckets_knl<<>>( 183 | d_hashed_bucket_ids_colmajor, 
d_buckets, L, batch_size, 184 | bucket_num_per_tbl, bucket_capacity, cmprs_gathered); 185 | 186 | multi_linked_htables.init_tbls(); 187 | multi_linked_htables.block_reduce_cnt(cmprs_gathered, L, batch_size, 188 | thread_num); 189 | multi_linked_htables.get_act_nodes(csc_acts, batch_size); 190 | } 191 | -------------------------------------------------------------------------------- /src/GPUMultiLinkedHashTable.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | #include "GPUMultiLinkedHashTable.h" 10 | #include "utils.h" 11 | 12 | GPUMultiLinkedHashTable::GPUMultiLinkedHashTable( 13 | const int max_tbl_num, const size_t bucket_num_per_tbl, 14 | const size_t pool_size, const int threshold) 15 | : max_tbl_num(max_tbl_num), 16 | bucket_num_per_tbl(bucket_num_per_tbl), 17 | pool_size(pool_size), 18 | threshold(threshold) { 19 | CUDA_CHECK(cudaMallocManaged( 20 | &d_multi_tbl_keys, 21 | sizeof(int) * (bucket_num_per_tbl + pool_size) * max_tbl_num)); 22 | CUDA_CHECK(cudaMallocManaged( 23 | &d_multi_tbl_vals, 24 | sizeof(int) * (bucket_num_per_tbl + pool_size) * max_tbl_num)); 25 | CUDA_CHECK(cudaMallocManaged( 26 | &d_multi_tbl_nexts, 27 | sizeof(int) * (bucket_num_per_tbl + pool_size) * max_tbl_num)); 28 | CUDA_CHECK(cudaMallocManaged(&d_multi_tbl_locks, 29 | sizeof(int) * bucket_num_per_tbl * max_tbl_num)); 30 | CUDA_CHECK(cudaMallocManaged(&d_multi_tbl_sizes, sizeof(int) * max_tbl_num)); 31 | CUDA_CHECK(cudaMallocManaged(&d_multi_tbl_pool_used_sizes, 32 | sizeof(int) * max_tbl_num)); 33 | 34 | init_tbls(); 35 | } 36 | 37 | void GPUMultiLinkedHashTable::free() { 38 | CUDA_CHECK(cudaFree(d_multi_tbl_keys)); 39 | CUDA_CHECK(cudaFree(d_multi_tbl_vals)); 40 | CUDA_CHECK(cudaFree(d_multi_tbl_nexts)); 41 | CUDA_CHECK(cudaFree(d_multi_tbl_locks)); 42 | CUDA_CHECK(cudaFree(d_multi_tbl_sizes)); 43 | CUDA_CHECK(cudaFree(d_multi_tbl_pool_used_sizes)); 44 | } 45 | 46 | void GPUMultiLinkedHashTable::init_tbls() { 47 | CUDA_CHECK( 48 | cudaMemset(d_multi_tbl_keys, -1, 49 | sizeof(int) * (bucket_num_per_tbl + pool_size) * max_tbl_num)); 50 | CUDA_CHECK( 51 | cudaMemset(d_multi_tbl_vals, 0, 52 | sizeof(int) * (bucket_num_per_tbl + pool_size) * max_tbl_num)); 53 | CUDA_CHECK( 54 | cudaMemset(d_multi_tbl_nexts, -1, 55 | sizeof(int) * (bucket_num_per_tbl + pool_size) * max_tbl_num)); 56 | CUDA_CHECK(cudaMemset(d_multi_tbl_locks, 0, 57 | sizeof(int) * bucket_num_per_tbl * max_tbl_num)); 58 | CUDA_CHECK(cudaMemset(d_multi_tbl_sizes, 0, sizeof(int) * max_tbl_num)); 59 | CUDA_CHECK( 60 | cudaMemset(d_multi_tbl_pool_used_sizes, 0, sizeof(int) * max_tbl_num)); 61 | } 62 | 63 | __device__ void GPUMultiLinkedHashTable::d_block_reduce_cnt( 64 | const int *d_raw_keys, const int raw_key_begin, const int raw_key_end, 65 | const int tbl_id) { 66 | assert(tbl_id < max_tbl_num); 67 | 68 | int *d_tbl_keys = 69 | d_multi_tbl_keys + tbl_id * (bucket_num_per_tbl + pool_size); 70 | int *d_tbl_vals = 71 | d_multi_tbl_vals + tbl_id * (bucket_num_per_tbl + pool_size); 72 | int *d_tbl_nexts = 73 | d_multi_tbl_nexts + tbl_id * (bucket_num_per_tbl + pool_size); 74 | int *d_tbl_locks = d_multi_tbl_locks + tbl_id * bucket_num_per_tbl; 75 | 76 | __shared__ int s_tbl_size; 77 | __shared__ int s_tbl_pool_used_size; 78 | if (threadIdx.x == 0) { 79 | s_tbl_size = d_multi_tbl_sizes[tbl_id]; 80 | s_tbl_pool_used_size = d_multi_tbl_pool_used_sizes[tbl_id]; 81 | } 82 | __syncthreads(); 83 | 84 | FOR_IDX_SYNC(raw_key_idx, 
raw_key_begin, raw_key_end) { 85 | bool exec_flag = raw_key_idx < raw_key_end; 86 | 87 | int key, bucket_idx; 88 | if (exec_flag) { 89 | key = d_raw_keys[raw_key_idx]; 90 | bucket_idx = d_hashier(key) % bucket_num_per_tbl; 91 | } 92 | 93 | int pre_entry_idx = -1, entry_idx; 94 | int stop_flag; 95 | do { 96 | stop_flag = 1; 97 | 98 | if (exec_flag) { 99 | int searched_key = key + 1; 100 | if (pre_entry_idx == -1) { 101 | entry_idx = bucket_idx; 102 | searched_key = d_tbl_keys[entry_idx]; 103 | } 104 | 105 | if (searched_key != -1 && searched_key != key) { 106 | do { 107 | pre_entry_idx = entry_idx; 108 | entry_idx = d_tbl_nexts[pre_entry_idx]; 109 | } while (entry_idx != -1 && 110 | (searched_key = d_tbl_keys[entry_idx]) != key); 111 | } 112 | 113 | if (searched_key == key) { 114 | if (atomicAdd(d_tbl_vals + entry_idx, 1) == threshold - 1) 115 | atomicAdd(&s_tbl_size, 1); 116 | exec_flag = false; 117 | } else { 118 | if (atomicCAS(d_tbl_locks + bucket_idx, 0, 1) == 119 | 0) { // acquire lock successfully 120 | // check again 121 | __threadfence_block(); // ensure read after lock 122 | if (pre_entry_idx == -1 ? d_tbl_keys[entry_idx] == -1 123 | : d_tbl_nexts[pre_entry_idx] == -1) { 124 | if (entry_idx == -1) 125 | entry_idx = 126 | atomicAdd(&s_tbl_pool_used_size, 1) + bucket_num_per_tbl; 127 | d_tbl_keys[entry_idx] = key; 128 | __threadfence_block(); // ensure key writen before next 129 | 130 | if (pre_entry_idx != -1) { 131 | d_tbl_nexts[pre_entry_idx] = entry_idx; 132 | __threadfence_block(); 133 | } 134 | 135 | if (atomicAdd(d_tbl_vals + entry_idx, 1) == threshold - 1) 136 | atomicAdd(&s_tbl_size, 1); 137 | 138 | exec_flag = false; 139 | } 140 | 141 | atomicExch(d_tbl_locks + bucket_idx, 0); // release lock 142 | } 143 | 144 | if (exec_flag) { // fail to acquire lock or list has been updated 145 | if (pre_entry_idx != -1) entry_idx = pre_entry_idx; // roll back 146 | stop_flag = 0; 147 | } 148 | } 149 | } 150 | 151 | stop_flag = __syncthreads_and(stop_flag); 152 | } while (stop_flag == 0); 153 | } 154 | 155 | assert(s_tbl_pool_used_size <= pool_size); 156 | 157 | if (threadIdx.x == 0) { 158 | d_multi_tbl_sizes[tbl_id] = s_tbl_size; 159 | d_multi_tbl_pool_used_sizes[tbl_id] = s_tbl_pool_used_size; 160 | } 161 | } 162 | 163 | // number of labels of one sample tend to be small, so activate them 164 | // sequentially 165 | __device__ void GPUMultiLinkedHashTable::d_activate_labels_seq( 166 | const int *d_labels, const int *d_rand_nodes, const int label_begin, 167 | const int label_end, const int tbl_id, const int min_act_num, 168 | const int node_num, const int seed) { 169 | int *d_tbl_keys = 170 | d_multi_tbl_keys + tbl_id * (bucket_num_per_tbl + pool_size); 171 | int *d_tbl_vals = 172 | d_multi_tbl_vals + tbl_id * (bucket_num_per_tbl + pool_size); 173 | int *d_tbl_nexts = 174 | d_multi_tbl_nexts + tbl_id * (bucket_num_per_tbl + pool_size); 175 | // int *d_tbl_locks = d_multi_tbl_locks + tbl_id * bucket_num_per_tbl; 176 | 177 | int tbl_size = d_multi_tbl_sizes[tbl_id]; 178 | int tbl_pool_used_size = d_multi_tbl_pool_used_sizes[tbl_id]; 179 | 180 | for (int i = label_begin; i < label_end; ++i) { 181 | const int label = d_labels[i]; 182 | d_insert_label_seq(label, d_tbl_keys, d_tbl_vals, d_tbl_nexts, tbl_size, 183 | tbl_pool_used_size); 184 | } 185 | 186 | if (tbl_size < min_act_num) { 187 | // printf("Start random nodes inserting\n"); 188 | 189 | thrust::minstd_rand rand_eng(seed); 190 | rand_eng.discard(tbl_id); 191 | int i = rand_eng() % node_num; 192 | while (tbl_size < min_act_num) { 193 
| const int node = d_rand_nodes[i]; 194 | d_insert_label_seq(node, d_tbl_keys, d_tbl_vals, d_tbl_nexts, tbl_size, 195 | tbl_pool_used_size); 196 | i = (i + 1) % node_num; 197 | } 198 | } 199 | 200 | assert(tbl_pool_used_size <= pool_size); 201 | 202 | d_multi_tbl_sizes[tbl_id] = tbl_size; 203 | d_multi_tbl_pool_used_sizes[tbl_id] = tbl_pool_used_size; 204 | } 205 | 206 | __device__ void GPUMultiLinkedHashTable::d_activate_labels_seq( 207 | const int *d_labels, const int label_begin, const int label_end, 208 | const int tbl_id) { 209 | int *d_tbl_keys = 210 | d_multi_tbl_keys + tbl_id * (bucket_num_per_tbl + pool_size); 211 | int *d_tbl_vals = 212 | d_multi_tbl_vals + tbl_id * (bucket_num_per_tbl + pool_size); 213 | int *d_tbl_nexts = 214 | d_multi_tbl_nexts + tbl_id * (bucket_num_per_tbl + pool_size); 215 | // int *d_tbl_locks = d_multi_tbl_locks + tbl_id * bucket_num_per_tbl; 216 | 217 | int tbl_size = d_multi_tbl_sizes[tbl_id]; 218 | int tbl_pool_used_size = d_multi_tbl_pool_used_sizes[tbl_id]; 219 | 220 | for (int i = label_begin; i < label_end; ++i) { 221 | const int label = d_labels[i]; 222 | d_insert_label_seq(label, d_tbl_keys, d_tbl_vals, d_tbl_nexts, tbl_size, 223 | tbl_pool_used_size); 224 | } 225 | 226 | assert(tbl_pool_used_size <= pool_size); 227 | 228 | d_multi_tbl_sizes[tbl_id] = tbl_size; 229 | d_multi_tbl_pool_used_sizes[tbl_id] = tbl_pool_used_size; 230 | } 231 | 232 | void GPUMultiLinkedHashTable::block_reduce_cnt( 233 | const CscActNodes &cmprs_gathered, const int L, const int batch_size, 234 | const int thread_num) { 235 | block_reduce_cnt_knl<<>>(cmprs_gathered, L, *this); 236 | CUDA_CHECK(cudaDeviceSynchronize()); 237 | } 238 | 239 | void GPUMultiLinkedHashTable::activate_labels_seq( 240 | const CscActNodes &cmprs_labels, const int batch_size, 241 | const int thread_num) { 242 | const int block_num = (batch_size + thread_num - 1) / thread_num; 243 | activate_labels_seq_knl<<>>(cmprs_labels, batch_size, 244 | *this); 245 | } 246 | 247 | void GPUMultiLinkedHashTable::activate_labels_seq( 248 | const CscActNodes &cmprs_labels, const int *d_rand_nodes, 249 | const int batch_size, const int min_act_num, const int node_num, 250 | const int thread_num) { 251 | const int block_num = (batch_size + thread_num - 1) / thread_num; 252 | activate_labels_seq_knl<<>>(cmprs_labels, d_rand_nodes, 253 | batch_size, min_act_num, 254 | node_num, rand(), *this); 255 | } 256 | 257 | void GPUMultiLinkedHashTable::get_act_nodes(CscActNodes &csc_acts, 258 | const int batch_size) { 259 | assert(batch_size <= max_tbl_num); 260 | 261 | CUDA_CHECK(cudaMemset(csc_acts.d_offsets, 0, sizeof(int))); 262 | thrust::inclusive_scan(thrust::device, d_multi_tbl_sizes, 263 | d_multi_tbl_sizes + batch_size, 264 | csc_acts.d_offsets + 1); 265 | 266 | int tot_node_num; 267 | CUDA_CHECK(cudaMemcpy(&tot_node_num, csc_acts.d_offsets + batch_size, 268 | sizeof(int), cudaMemcpyDeviceToHost)); 269 | assert(tot_node_num <= csc_acts.node_capacity); 270 | 271 | thrust::copy_if( 272 | thrust::device, d_multi_tbl_keys, 273 | d_multi_tbl_keys + (bucket_num_per_tbl + pool_size) * batch_size, 274 | d_multi_tbl_vals, csc_acts.d_nodes, filter(threshold)); 275 | } 276 | 277 | __global__ void block_reduce_cnt_knl( 278 | const CscActNodes cmprs_gathered, const int L, 279 | GPUMultiLinkedHashTable multi_linked_htables) { 280 | assert(blockIdx.x < multi_linked_htables.max_tbl_num); 281 | 282 | const int gathered_begin = cmprs_gathered.d_offsets[blockIdx.x * L]; 283 | const int gathered_end = cmprs_gathered.d_offsets[(blockIdx.x + 
1) * L]; 284 | multi_linked_htables.d_block_reduce_cnt( 285 | cmprs_gathered.d_nodes, gathered_begin, gathered_end, blockIdx.x); 286 | } 287 | 288 | __global__ void activate_labels_seq_knl( 289 | const CscActNodes cmprs_labels, const int batch_size, 290 | GPUMultiLinkedHashTable multi_linked_htables) { 291 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 292 | if (tid >= batch_size) return; 293 | 294 | const int label_begin = cmprs_labels.d_offsets[tid]; 295 | const int label_end = cmprs_labels.d_offsets[tid + 1]; 296 | multi_linked_htables.d_activate_labels_seq(cmprs_labels.d_nodes, label_begin, 297 | label_end, tid); 298 | } 299 | 300 | __global__ void activate_labels_seq_knl( 301 | const CscActNodes cmprs_labels, const int *d_rand_nodes, 302 | const int batch_size, const int min_act_num, const int node_num, 303 | const int seed, GPUMultiLinkedHashTable multi_linked_htables) { 304 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 305 | if (tid >= batch_size) return; 306 | 307 | const int label_begin = cmprs_labels.d_offsets[tid]; 308 | const int label_end = cmprs_labels.d_offsets[tid + 1]; 309 | multi_linked_htables.d_activate_labels_seq(cmprs_labels.d_nodes, d_rand_nodes, 310 | label_begin, label_end, tid, 311 | min_act_num, node_num, seed); 312 | } 313 | -------------------------------------------------------------------------------- /src/lshKnl.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | 7 | #include "lshKnl.h" 8 | #include "utils.h" 9 | 10 | __global__ void init_bins_knl(int *d_bins, const int prev_node_num, 11 | const int tot_elem_num) { 12 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 13 | if (tid >= tot_elem_num) return; 14 | 15 | d_bins[tid] = tid % prev_node_num; 16 | } 17 | 18 | __global__ void gen_rand_keys_knl(unsigned int *d_rand_keys, const int seed, 19 | const int prev_node_num, 20 | const int tot_elem_num) { 21 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 22 | if (tid >= tot_elem_num) return; 23 | 24 | const int permute_id = tid / prev_node_num; 25 | unsigned int key = permute_id << 16; 26 | 27 | thrust::minstd_rand rand_eng(seed); 28 | rand_eng.discard(tid); 29 | key |= rand_eng() & (1 << 16 - 1); 30 | 31 | d_rand_keys[tid] = key; 32 | } 33 | 34 | __global__ void gen_rand_keys_knl(int *d_rand_keys, const int seed, 35 | const int node_num) { 36 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 37 | if (tid >= node_num) return; 38 | 39 | thrust::minstd_rand rand_eng(seed); 40 | rand_eng.discard(tid); 41 | 42 | d_rand_keys[tid] = rand_eng(); 43 | } 44 | 45 | // Assumption: prev_node_num is small while node_num is large 46 | // tt: one thread for each tile 47 | __global__ void init_hash_tt_knl( 48 | const int *d_bins, const float *d_weights_rowmajor, const int prev_node_num, 49 | const int node_num, const int tot_elem_num, const int L, const int K, 50 | const int bin_size, const int tbl_num_per_tile, 51 | const int bucket_num_per_tbl, const int bucket_capacity, int *d_buckets, 52 | int *d_bucket_sizes) { 53 | const int elem_num_per_tile = K * bin_size * tbl_num_per_tile; 54 | extern __shared__ int smem[]; 55 | int *s_tile_bins = smem; // elem_num_per_tile 56 | float *s_weights_rowmajor = 57 | (float *)s_tile_bins + elem_num_per_tile; // blockDim.x * prev_node_num 58 | 59 | const int log_bin_size = (int)logf(bin_size); 60 | 61 | const int tot_weight_size = node_num * prev_node_num; 62 | const int s_weight_size = blockDim.x * 
prev_node_num; 63 | const int weight_begin = blockIdx.x * s_weight_size; 64 | FOR_IDX_ASYNC(s_weight_idx, 0, s_weight_size) { 65 | const int weight_idx = s_weight_idx + weight_begin; 66 | if (weight_idx < tot_weight_size) { 67 | s_weights_rowmajor[s_weight_idx] = d_weights_rowmajor[weight_idx]; 68 | } 69 | } 70 | // __syncthreads(); 71 | 72 | const int node_idx = threadIdx.x + blockIdx.x * blockDim.x; 73 | const bool exec_flag = node_idx < node_num; 74 | const float *s_node_weights = 75 | s_weights_rowmajor + threadIdx.x * prev_node_num; 76 | for (int tbl_begin = 0; tbl_begin < L; tbl_begin += tbl_num_per_tile) { 77 | const int tile_bin_begin = tbl_begin * K * bin_size; 78 | FOR_IDX_ASYNC(bin_elem_idx, tile_bin_begin, 79 | min(tot_elem_num, tile_bin_begin + elem_num_per_tile)) { 80 | s_tile_bins[bin_elem_idx - tile_bin_begin] = d_bins[bin_elem_idx]; 81 | } 82 | __syncthreads(); 83 | 84 | if (exec_flag) { 85 | for (int i = 0; i < tbl_num_per_tile && i + tbl_begin < L; ++i) { 86 | int bucket_idx = 0; 87 | for (int j = 0; j < K; ++j) { 88 | float maxv = -FLT_MAX; 89 | int hash = 0; 90 | for (int k = 0; k < bin_size; ++k) { 91 | int idx = s_tile_bins[(i * K + j) * bin_size + k]; 92 | float weight = s_node_weights[idx]; 93 | if (weight > maxv) { 94 | maxv = weight; 95 | hash = k; 96 | } 97 | } 98 | bucket_idx += hash << ((K - 1 - j) * log_bin_size); 99 | } 100 | 101 | const int glb_bucket_idx = 102 | bucket_idx + (i + tbl_begin) * bucket_num_per_tbl; 103 | const int pos = 104 | atomicAdd(d_bucket_sizes + glb_bucket_idx, 1) % bucket_capacity; 105 | d_buckets[pos + glb_bucket_idx * bucket_capacity] = node_idx; 106 | } 107 | } 108 | 109 | __syncthreads(); 110 | } 111 | } 112 | 113 | // Assumption: prev_node_num is small while node_num is large 114 | __global__ void init_hash_knl( 115 | const int *d_bins, const float *d_weights_rowmajor, const int prev_node_num, 116 | const int node_num, const int tot_elem_num, const int L, const int K, 117 | const int bin_size, const int tbl_num_per_tile, 118 | const int tbl_num_per_thread, const int bucket_num_per_tbl, 119 | const int bucket_capacity, int *d_buckets, int *d_bucket_sizes) { 120 | const int elem_num_per_tile = K * bin_size * tbl_num_per_tile; 121 | extern __shared__ int smem[]; 122 | int *s_tile_bins = smem; // elem_num_per_tile 123 | float *s_weights_rowmajor = 124 | (float *)s_tile_bins + elem_num_per_tile; // blockDim.x * prev_node_num 125 | 126 | const int log_bin_size = (int)logf(bin_size); 127 | 128 | const int thread_num_per_tile = tbl_num_per_tile / tbl_num_per_thread; 129 | assert(thread_num_per_tile * tbl_num_per_thread == tbl_num_per_tile); 130 | 131 | const int tot_weight_size = node_num * prev_node_num; 132 | const int s_weight_size = blockDim.x * prev_node_num; 133 | const int weight_begin = blockIdx.x * s_weight_size; 134 | FOR_IDX_ASYNC(s_weight_idx, 0, s_weight_size) { 135 | const int weight_idx = s_weight_idx + weight_begin; 136 | if (weight_idx < tot_weight_size) { 137 | s_weights_rowmajor[s_weight_idx] = d_weights_rowmajor[weight_idx]; 138 | } 139 | } 140 | // __syncthreads(); 141 | 142 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 143 | const int node_idx = tid / thread_num_per_tile; 144 | const int tile_lane_idx = tid - node_idx * thread_num_per_tile; 145 | const bool exec_flag = node_idx < node_num; 146 | const float *s_node_weights = 147 | s_weights_rowmajor + threadIdx.x * prev_node_num; 148 | for (int tile_tbl_begin = 0; tile_tbl_begin < L; 149 | tile_tbl_begin += tbl_num_per_tile) { 150 | const int 
tile_bin_begin = tile_tbl_begin * K * bin_size; 151 | FOR_IDX_ASYNC(bin_elem_idx, tile_bin_begin, 152 | min(tot_elem_num, tile_bin_begin + elem_num_per_tile)) { 153 | s_tile_bins[bin_elem_idx - tile_bin_begin] = d_bins[bin_elem_idx]; 154 | } 155 | __syncthreads(); 156 | 157 | if (exec_flag) { 158 | const int lane_begin = tile_lane_idx * tbl_num_per_thread; 159 | for (int i = lane_begin; 160 | (i - lane_begin) < tbl_num_per_thread && i + tile_tbl_begin < L; 161 | ++i) { 162 | int bucket_idx = 0; 163 | for (int j = 0; j < K; ++j) { 164 | float maxv = -FLT_MAX; 165 | int hash = 0; 166 | for (int k = 0; k < bin_size; ++k) { 167 | int idx = s_tile_bins[(i * K + j) * bin_size + k]; 168 | float weight = s_node_weights[idx]; 169 | if (weight > maxv) { 170 | maxv = weight; 171 | hash = k; 172 | } 173 | } 174 | bucket_idx += hash << ((K - 1 - j) * log_bin_size); 175 | } 176 | 177 | const int glb_bucket_idx = 178 | bucket_idx + (i + tile_tbl_begin) * bucket_num_per_tbl; 179 | const int pos = 180 | atomicAdd(d_bucket_sizes + glb_bucket_idx, 1) % bucket_capacity; 181 | d_buckets[pos + glb_bucket_idx * bucket_capacity] = node_idx; 182 | } 183 | } 184 | 185 | __syncthreads(); 186 | } 187 | } 188 | 189 | // No shared memory for weights 190 | __global__ void init_hash_no_sw_knl( 191 | const int *d_bins, const float *d_weights_rowmajor, const int prev_node_num, 192 | const int node_num, const int tot_elem_num, const int L, const int K, 193 | const int bin_size, const int tbl_num_per_tile, 194 | const int bucket_num_per_tbl, const int bucket_capacity, int *d_buckets, 195 | int *d_bucket_sizes) { 196 | extern __shared__ int s_tile_bins[]; // K * bin_size * tbl_num_per_tile 197 | 198 | const int log_bin_size = (int)logf(bin_size); 199 | 200 | const int elem_num_per_tile = K * bin_size * tbl_num_per_tile; 201 | const int tile_bin_begin = blockIdx.x * elem_num_per_tile; 202 | FOR_IDX_ASYNC(bin_elem_idx, tile_bin_begin, 203 | min(tot_elem_num, tile_bin_begin + elem_num_per_tile)) { 204 | s_tile_bins[bin_elem_idx - tile_bin_begin] = d_bins[bin_elem_idx]; 205 | } 206 | __syncthreads(); 207 | 208 | FOR_IDX_ASYNC(node_idx, 0, node_num) { 209 | const float *d_node_weights = d_weights_rowmajor + node_idx * prev_node_num; 210 | for (int i = 0; 211 | i < tbl_num_per_tile && i + blockIdx.x * tbl_num_per_tile < L; ++i) { 212 | int bucket_idx = 0; 213 | for (int j = 0; j < K; ++j) { 214 | float maxv = -FLT_MAX; 215 | int hash = 0; 216 | for (int k = 0; k < bin_size; ++k) { 217 | int idx = s_tile_bins[(i * K + j) * bin_size + k]; 218 | float weight = d_node_weights[idx]; 219 | if (weight > maxv) { 220 | maxv = weight; 221 | hash = k; 222 | } 223 | } 224 | bucket_idx += hash << ((K - 1 - j) * log_bin_size); 225 | } 226 | 227 | const int glb_bucket_idx = 228 | bucket_idx + (i + blockIdx.x * tbl_num_per_tile) * bucket_num_per_tbl; 229 | const int pos = 230 | atomicAdd(d_bucket_sizes + glb_bucket_idx, 1) % bucket_capacity; 231 | d_buckets[pos + glb_bucket_idx * bucket_capacity] = node_idx; 232 | } 233 | } 234 | } 235 | 236 | // Assumption: previous layer is dense 237 | __global__ void get_hash_knl(const int *d_bins, 238 | const float *d_dense_inputs_colmajor, 239 | const int *d_bucket_sizes, const int in_node_num, 240 | const int tot_elem_num, const int L, const int K, 241 | const int bin_size, const int tbl_num_per_tile, 242 | const int batch_size, const int bucket_num_per_tbl, 243 | const int bucket_capacity, 244 | int *d_hashed_bucket_ids_colmajor, 245 | int *d_hashed_bucket_sizes_colmajor) { 246 | extern __shared__ int 
s_tile_bins[]; // K * bin_size * tbl_num_per_tile 247 | 248 | const int log_bin_size = (int)logf(bin_size); 249 | 250 | const int elem_num_per_tile = K * bin_size * tbl_num_per_tile; 251 | const int tile_bin_begin = blockIdx.x * elem_num_per_tile; 252 | FOR_IDX_ASYNC(bin_elem_idx, tile_bin_begin, 253 | min(tot_elem_num, tile_bin_begin + elem_num_per_tile)) { 254 | s_tile_bins[bin_elem_idx - tile_bin_begin] = d_bins[bin_elem_idx]; 255 | } 256 | __syncthreads(); 257 | 258 | FOR_IDX_ASYNC(col_idx, 0, batch_size) { 259 | const float *d_input_col = d_dense_inputs_colmajor + col_idx * in_node_num; 260 | const int offset = col_idx * L + blockIdx.x * tbl_num_per_tile; 261 | int *d_hashed_bucket_id_col_tile = d_hashed_bucket_ids_colmajor + offset; 262 | int *d_hashed_bucket_size_col_tile = 263 | d_hashed_bucket_sizes_colmajor + offset; 264 | for (int i = 0; 265 | i < tbl_num_per_tile && i + blockIdx.x * tbl_num_per_tile < L; ++i) { 266 | int bucket_idx = 0; 267 | for (int j = 0; j < K; ++j) { 268 | float maxv = -FLT_MAX; 269 | int hash = 0; 270 | for (int k = 0; k < bin_size; ++k) { 271 | int idx = s_tile_bins[(i * K + j) * bin_size + k]; 272 | float input_val = d_input_col[idx]; 273 | if (input_val > maxv) { 274 | maxv = input_val; 275 | hash = k; 276 | } 277 | } 278 | bucket_idx += hash << ((K - 1 - j) * log_bin_size); 279 | } 280 | // TODO: first write to shared memory 281 | d_hashed_bucket_id_col_tile[i] = bucket_idx; 282 | d_hashed_bucket_size_col_tile[i] = 283 | min(d_bucket_sizes[bucket_idx + (i + blockIdx.x * tbl_num_per_tile) * 284 | bucket_num_per_tbl], 285 | bucket_capacity); 286 | } 287 | } 288 | } 289 | 290 | __global__ void gather_buckets_knl(const int *d_hashed_bucket_ids_colmajor, 291 | const int *d_buckets, const int L, 292 | const int batch_size, 293 | const int bucket_num_per_tbl, 294 | const int bucket_capacity, 295 | CscActNodes cmprs_gathered) { 296 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 297 | if (tid >= L * batch_size) return; 298 | 299 | const int bucket_id = d_hashed_bucket_ids_colmajor[tid]; 300 | const int tbl_id = tid % L; 301 | const int bucket_begin = 302 | (bucket_id + tbl_id * bucket_num_per_tbl) * bucket_capacity; 303 | const int gathered_begin = cmprs_gathered.d_offsets[tid]; 304 | const int gathered_end = cmprs_gathered.d_offsets[tid + 1]; 305 | const int gathered_size = gathered_end - gathered_begin; 306 | 307 | // TODO: widely write 308 | for (int i = 0; i < gathered_size; ++i) { 309 | cmprs_gathered.d_nodes[gathered_begin + i] = d_buckets[bucket_begin + i]; 310 | } 311 | } 312 | -------------------------------------------------------------------------------- /src/kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include "kernel.h" 10 | #include "utils.h" 11 | 12 | #define BETA1 0.9 13 | #define BETA2 0.999 14 | #define EPS 0.00000001 15 | 16 | #define MAX_INIT 0.0 17 | 18 | __forceinline__ __device__ float warp_reduce(float val) { 19 | for (int offset = warpSize / 2; offset > 0; offset /= 2) 20 | val += __shfl_down_sync(0xFFFFFFFF, val, offset); 21 | return val; 22 | } 23 | 24 | __forceinline__ __device__ float block_reduce(float val) { 25 | __shared__ float s_sum_buff[32]; 26 | 27 | const int wid = threadIdx.x / warpSize; 28 | const int lane = threadIdx.x - wid * warpSize; 29 | 30 | val = warp_reduce(val); 31 | if (blockDim.x < warpSize) return val; 32 | 33 | if (lane == 0) { 34 | s_sum_buff[wid] = val; 35 | } 
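// The per-warp partial sums are now in s_sum_buff (one slot per warp); the barrier
// below makes those shared-memory writes visible to warp 0, which combines them
// into the block-wide sum.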
36 | __syncthreads(); 37 | 38 | if (wid == 0) { 39 | val = (threadIdx.x < blockDim.x / warpSize) ? s_sum_buff[lane] : 0.; 40 | val = warp_reduce(val); 41 | } 42 | 43 | return val; 44 | } 45 | 46 | __forceinline__ __device__ float warp_max(float val) { 47 | for (int offset = warpSize / 2; offset > 0; offset /= 2) 48 | val = max(val, __shfl_down_sync(0xFFFFFFFF, val, offset)); 49 | return val; 50 | } 51 | 52 | __forceinline__ __device__ float block_max(float val) { 53 | __shared__ float s_max_buff[32]; 54 | 55 | const int wid = threadIdx.x / warpSize; 56 | const int lane = threadIdx.x - wid * warpSize; 57 | 58 | val = warp_max(val); 59 | if (blockDim.x < warpSize) return val; 60 | 61 | if (lane == 0) { 62 | s_max_buff[wid] = val; 63 | } 64 | __syncthreads(); 65 | 66 | if (wid == 0) { 67 | val = (threadIdx.x < blockDim.x / warpSize) ? s_max_buff[lane] : MAX_INIT; 68 | val = warp_max(val); 69 | } 70 | 71 | return val; 72 | } 73 | 74 | __global__ void get_dense_acts_knl(CscActNodes csc_acts, const int node_num, 75 | const int batch_size) { 76 | const int tid = threadIdx.x + blockIdx.x * blockDim.x; 77 | if (tid < node_num * batch_size) { 78 | const int node_id = tid % node_num; 79 | csc_acts.d_nodes[tid] = node_id; 80 | 81 | if (tid <= batch_size) { 82 | csc_acts.d_offsets[tid] = tid * node_num; 83 | } 84 | } 85 | } 86 | 87 | __global__ void relu_fwd_slide_in_knl(const CscActNodes csc_inputs, 88 | const float *d_weights_colmajor, 89 | const float *d_biases, 90 | const int weight_row_num, 91 | const int max_out_num, 92 | CscActNodes csc_outputs) { 93 | extern __shared__ char smem[]; 94 | float *s_in_vals = (float *)smem; // blockDim.x 95 | int *s_in_nodes = (int *)(s_in_vals + blockDim.x); // blockDim.x 96 | float *s_out_vals = (float *)(s_in_nodes + blockDim.x); // max_out_num 97 | int *s_out_nodes = (int *)(s_out_vals + max_out_num); // max_out_num 98 | 99 | const int in_begin = csc_inputs.d_offsets[blockIdx.x]; 100 | const int in_end = csc_inputs.d_offsets[blockIdx.x + 1]; 101 | // const int in_size = in_end - in_begin; 102 | const int out_begin = csc_outputs.d_offsets[blockIdx.x]; 103 | const int out_end = csc_outputs.d_offsets[blockIdx.x + 1]; 104 | const int out_size = out_end - out_begin; 105 | 106 | assert(out_size <= max_out_num); 107 | 108 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 109 | const int out_node = csc_outputs.d_nodes[out_begin + s_out_idx]; 110 | s_out_nodes[s_out_idx] = out_node; 111 | s_out_vals[s_out_idx] = d_biases[out_node]; 112 | } 113 | // __syncthreads(); 114 | 115 | FOR_OFFSET(in_offset, in_begin, in_end) { 116 | const int in_idx = in_offset + threadIdx.x; 117 | if (in_idx < in_end) { 118 | s_in_nodes[threadIdx.x] = csc_inputs.d_nodes[in_idx]; 119 | s_in_vals[threadIdx.x] = csc_inputs.d_vals[in_idx]; 120 | } 121 | __syncthreads(); 122 | 123 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 124 | const int out_node = s_out_nodes[s_out_idx]; 125 | float psum = 0.; 126 | for (int s_in_idx = 0; 127 | s_in_idx < blockDim.x && in_offset + s_in_idx < in_end; ++s_in_idx) { 128 | const int in_node = s_in_nodes[s_in_idx]; 129 | const float in_val = s_in_vals[s_in_idx]; 130 | const float weight = 131 | d_weights_colmajor[in_node * weight_row_num + out_node]; 132 | psum += in_val * weight; 133 | } 134 | s_out_vals[s_out_idx] += psum; 135 | } 136 | __syncthreads(); 137 | } 138 | 139 | float *d_out_val_col = csc_outputs.d_vals + out_begin; 140 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 141 | d_out_val_col[s_out_idx] = max(0., s_out_vals[s_out_idx]); 142 | } 143 | } 144 | 145 | __global__ 
void softmax_fwd_bp_rowmajor_slide_in_knl( 146 | const CscActNodes csc_inputs, const float *d_weights_rowmajor, 147 | const float *d_biases, const CscActNodes cmprs_labels, 148 | const int weight_col_num, const int max_out_num, const int max_label_num, 149 | CscActNodes csc_outputs, float *d_cmprs_bp_deltas) { 150 | extern __shared__ char smem[]; 151 | float *s_in_vals = (float *)smem; // blockDim.x 152 | int *s_in_nodes = (int *)(s_in_vals + blockDim.x); // blockDim.x 153 | float *s_out_vals = (float *)(s_in_nodes + blockDim.x); // max_out_num 154 | int *s_out_nodes = (int *)(s_out_vals + max_out_num); // max_out_num 155 | int *s_labels = (int *)(s_out_nodes + max_out_num); // max_label_num 156 | 157 | const int in_begin = csc_inputs.d_offsets[blockIdx.x]; 158 | const int in_end = csc_inputs.d_offsets[blockIdx.x + 1]; 159 | const int out_begin = csc_outputs.d_offsets[blockIdx.x]; 160 | const int out_end = csc_outputs.d_offsets[blockIdx.x + 1]; 161 | const int out_size = out_end - out_begin; 162 | const int label_begin = cmprs_labels.d_offsets[blockIdx.x]; 163 | const int label_end = cmprs_labels.d_offsets[blockIdx.x + 1]; 164 | const int label_size = label_end - label_begin; 165 | 166 | assert(out_size <= max_out_num); 167 | assert(label_size <= max_label_num); 168 | 169 | FOR_IDX_ASYNC(s_label_idx, 0, label_size) { 170 | s_labels[s_label_idx] = cmprs_labels.d_nodes[label_begin + s_label_idx]; 171 | } 172 | 173 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 174 | const int out_node = csc_outputs.d_nodes[out_begin + s_out_idx]; 175 | s_out_nodes[s_out_idx] = out_node; 176 | s_out_vals[s_out_idx] = d_biases[out_node]; 177 | } 178 | 179 | FOR_OFFSET(in_offset, in_begin, in_end) { 180 | const int in_idx = in_offset + threadIdx.x; 181 | if (in_idx < in_end) { 182 | s_in_nodes[threadIdx.x] = csc_inputs.d_nodes[in_idx]; 183 | s_in_vals[threadIdx.x] = csc_inputs.d_vals[in_idx]; 184 | } 185 | __syncthreads(); 186 | 187 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 188 | const int out_node = s_out_nodes[s_out_idx]; 189 | float psum = 0.; 190 | for (int s_in_idx = 0; 191 | s_in_idx < blockDim.x && in_offset + s_in_idx < in_end; ++s_in_idx) { 192 | const int in_node = s_in_nodes[s_in_idx]; 193 | const float in_val = s_in_vals[s_in_idx]; 194 | const float weight = 195 | d_weights_rowmajor[out_node * weight_col_num + in_node]; 196 | psum += in_val * weight; 197 | } 198 | s_out_vals[s_out_idx] += psum; 199 | } 200 | __syncthreads(); 201 | } 202 | 203 | __shared__ float s_max; 204 | float thread_max = MAX_INIT; 205 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 206 | thread_max = max(thread_max, s_out_vals[s_out_idx]); 207 | } 208 | 209 | thread_max = block_max(thread_max); 210 | if (threadIdx.x == 0) s_max = thread_max; 211 | __syncthreads(); 212 | 213 | __shared__ float s_sum; 214 | float thread_sum = 0.; 215 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 216 | float val = __expf(s_out_vals[s_out_idx] - s_max); 217 | // float val = exp(s_out_vals[s_out_idx] - s_max); 218 | s_out_vals[s_out_idx] = val; 219 | thread_sum += val; 220 | } 221 | 222 | thread_sum = block_reduce(thread_sum); 223 | if (threadIdx.x == 0) s_sum = thread_sum; 224 | __syncthreads(); 225 | 226 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 227 | const int out_idx = s_out_idx + out_begin; 228 | const float val = s_out_vals[s_out_idx] / (s_sum + EPS); 229 | const int out_node = s_out_nodes[s_out_idx]; 230 | csc_outputs.d_vals[out_idx] = val; 231 | 232 | bool is_in_label = false; 233 | for (int i = 0; i < label_size; ++i) { 234 | is_in_label = is_in_label || 
(s_labels[i] == out_node); 235 | } 236 | 237 | float bp_delta = -val; 238 | if (is_in_label) bp_delta += 1.0 / label_size; 239 | bp_delta /= gridDim.x; 240 | d_cmprs_bp_deltas[out_idx] = bp_delta; 241 | } 242 | } 243 | 244 | __global__ void softmax_fwd_bp_rowmajor_slide_out_knl( 245 | const CscActNodes csc_inputs, const float *d_weights_rowmajor, 246 | const float *d_biases, const CscActNodes cmprs_labels, 247 | const int weight_col_num, const int max_in_num, const int max_label_num, 248 | CscActNodes csc_outputs, float *d_cmprs_bp_deltas) { 249 | extern __shared__ char smem[]; 250 | float *s_in_vals = (float *)smem; // max_in_num 251 | int *s_in_nodes = (int *)(s_in_vals + max_in_num); // max_in_num 252 | int *s_labels = (int *)(s_in_nodes + max_in_num); // max_label_num 253 | 254 | const int in_begin = csc_inputs.d_offsets[blockIdx.x]; 255 | const int in_end = csc_inputs.d_offsets[blockIdx.x + 1]; 256 | const int in_size = in_end - in_begin; 257 | const int out_begin = csc_outputs.d_offsets[blockIdx.x]; 258 | const int out_end = csc_outputs.d_offsets[blockIdx.x + 1]; 259 | const int label_begin = cmprs_labels.d_offsets[blockIdx.x]; 260 | const int label_end = cmprs_labels.d_offsets[blockIdx.x + 1]; 261 | const int label_size = label_end - label_begin; 262 | 263 | assert(in_size <= max_in_num); 264 | assert(label_size <= max_label_num); 265 | 266 | FOR_IDX_ASYNC(in_idx, in_begin, in_end) { 267 | const int s_in_idx = in_idx - in_begin; 268 | s_in_nodes[s_in_idx] = csc_inputs.d_nodes[in_idx]; 269 | s_in_vals[s_in_idx] = csc_inputs.d_vals[in_idx]; 270 | } 271 | 272 | FOR_IDX_ASYNC(s_label_idx, 0, label_size) { 273 | s_labels[s_label_idx] = cmprs_labels.d_nodes[label_begin + s_label_idx]; 274 | } 275 | __syncthreads(); 276 | 277 | float thread_max = MAX_INIT; 278 | FOR_IDX_ASYNC(out_idx, out_begin, out_end) { 279 | const int out_node = csc_outputs.d_nodes[out_idx]; 280 | float psum = d_biases[out_node]; 281 | for (int s_in_idx = 0; s_in_idx < in_size; ++s_in_idx) { 282 | const int in_node = s_in_nodes[s_in_idx]; 283 | const float in_val = s_in_vals[s_in_idx]; 284 | const float weight = 285 | d_weights_rowmajor[out_node * weight_col_num + in_node]; 286 | psum += in_val * weight; 287 | } 288 | csc_outputs.d_vals[out_idx] = psum; 289 | thread_max = max(thread_max, psum); 290 | } 291 | 292 | __shared__ float s_max; 293 | thread_max = block_max(thread_max); 294 | if (threadIdx.x == 0) s_max = thread_max; 295 | __syncthreads(); 296 | 297 | __shared__ float s_sum; 298 | float thread_sum = 0.; 299 | FOR_IDX_ASYNC(out_idx, out_begin, out_end) { 300 | float val = __expf(csc_outputs.d_vals[out_idx] - s_max); 301 | // float val = exp(csc_outputs.d_vals[out_idx] - s_max); 302 | csc_outputs.d_vals[out_idx] = val; 303 | thread_sum += val; 304 | } 305 | 306 | thread_sum = block_reduce(thread_sum); 307 | if (threadIdx.x == 0) s_sum = thread_sum; 308 | __syncthreads(); 309 | 310 | FOR_IDX_ASYNC(out_idx, out_begin, out_end) { 311 | const float val = csc_outputs.d_vals[out_idx] / (s_sum + EPS); 312 | const int out_node = csc_outputs.d_nodes[out_idx]; 313 | csc_outputs.d_vals[out_idx] = val; 314 | 315 | bool is_in_label = false; 316 | for (int i = 0; i < label_size; ++i) { 317 | is_in_label = is_in_label || (s_labels[i] == out_node); 318 | } 319 | 320 | float bp_delta = -val; 321 | if (is_in_label) bp_delta += 1.0 / label_size; 322 | bp_delta /= gridDim.x; 323 | d_cmprs_bp_deltas[out_idx] = bp_delta; 324 | } 325 | } 326 | 327 | __global__ void softmax_fwd_bp_rowmajor_all_sm_knl( 328 | const CscActNodes 
csc_inputs, const float *d_weights_rowmajor, 329 | const float *d_biases, const CscActNodes cmprs_labels, 330 | const int weight_col_num, const int max_in_num, const int max_out_num, 331 | const int max_label_num, CscActNodes csc_outputs, 332 | float *d_cmprs_bp_deltas) { 333 | extern __shared__ char smem[]; 334 | float *s_in_vals = (float *)smem; // max_in_num 335 | int *s_in_nodes = (int *)(s_in_vals + max_in_num); // max_in_num 336 | float *s_out_vals = (float *)(s_in_nodes + max_in_num); // max_out_num 337 | int *s_out_nodes = (int *)(s_out_vals + max_out_num); // max_out_num 338 | int *s_labels = (int *)(s_out_nodes + max_out_num); // max_label_num 339 | 340 | const int in_begin = csc_inputs.d_offsets[blockIdx.x]; 341 | const int in_end = csc_inputs.d_offsets[blockIdx.x + 1]; 342 | const int in_size = in_end - in_begin; 343 | const int out_begin = csc_outputs.d_offsets[blockIdx.x]; 344 | const int out_end = csc_outputs.d_offsets[blockIdx.x + 1]; 345 | const int out_size = out_end - out_begin; 346 | const int label_begin = cmprs_labels.d_offsets[blockIdx.x]; 347 | const int label_end = cmprs_labels.d_offsets[blockIdx.x + 1]; 348 | const int label_size = label_end - label_begin; 349 | 350 | assert(in_size <= max_in_num); 351 | assert(out_size <= max_out_num); 352 | assert(label_size <= max_label_num); 353 | 354 | FOR_IDX_ASYNC(s_label_idx, 0, label_size) { 355 | s_labels[s_label_idx] = cmprs_labels.d_nodes[label_begin + s_label_idx]; 356 | } 357 | 358 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 359 | const int out_node = csc_outputs.d_nodes[out_begin + s_out_idx]; 360 | s_out_nodes[s_out_idx] = out_node; 361 | s_out_vals[s_out_idx] = d_biases[out_node]; 362 | } 363 | 364 | FOR_IDX_ASYNC(in_idx, in_begin, in_end) { 365 | const int s_in_idx = in_idx - in_begin; 366 | s_in_nodes[s_in_idx] = csc_inputs.d_nodes[in_idx]; 367 | s_in_vals[s_in_idx] = csc_inputs.d_vals[in_idx]; 368 | } 369 | __syncthreads(); 370 | 371 | float thread_max = MAX_INIT; 372 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 373 | const int out_node = csc_outputs.d_nodes[out_begin + s_out_idx]; 374 | s_out_nodes[s_out_idx] = out_node; 375 | float psum = d_biases[out_node]; 376 | for (int s_in_idx = 0; s_in_idx < in_size; ++s_in_idx) { 377 | const int in_node = s_in_nodes[s_in_idx]; 378 | const float in_val = s_in_vals[s_in_idx]; 379 | const float weight = 380 | d_weights_rowmajor[out_node * weight_col_num + in_node]; 381 | psum += in_val * weight; 382 | } 383 | s_out_vals[s_out_idx] = psum; 384 | thread_max = max(thread_max, psum); 385 | } 386 | 387 | __shared__ float s_max; 388 | thread_max = block_max(thread_max); 389 | if (threadIdx.x == 0) s_max = thread_max; 390 | __syncthreads(); 391 | 392 | __shared__ float s_sum; 393 | float thread_sum = 0.; 394 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 395 | float val = __expf(s_out_vals[s_out_idx] - s_max); 396 | // float val = exp(s_out_vals[s_out_idx] - s_max); 397 | s_out_vals[s_out_idx] = val; 398 | thread_sum += val; 399 | } 400 | 401 | thread_sum = block_reduce(thread_sum); 402 | if (threadIdx.x == 0) s_sum = thread_sum; 403 | __syncthreads(); 404 | 405 | FOR_IDX_ASYNC(s_out_idx, 0, out_size) { 406 | const int out_idx = s_out_idx + out_begin; 407 | const float val = s_out_vals[s_out_idx] / (s_sum + EPS); 408 | const int out_node = s_out_nodes[s_out_idx]; 409 | csc_outputs.d_vals[out_idx] = val; 410 | 411 | bool is_in_label = false; 412 | for (int i = 0; i < label_size; ++i) { 413 | is_in_label = is_in_label || (s_labels[i] == out_node); 414 | } 415 | 416 | float bp_delta = -val; 
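// bp_delta is the negative cross-entropy gradient w.r.t. this logit: it ends up as
// y / label_size - p, where p is the softmax output just stored and y marks membership
// in this sample's label set; the division by gridDim.x (one block per sample)
// averages the gradient over the batch.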
417 | if (is_in_label) bp_delta += 1.0 / label_size; 418 | bp_delta /= gridDim.x; 419 | d_cmprs_bp_deltas[out_idx] = bp_delta; 420 | } 421 | } 422 | 423 | __global__ void bp_knl(const CscActNodes csc_acts, const CscActNodes csc_prev, 424 | const float *d_weights_colmajor, 425 | const float *d_cmprs_bp_deltas, const int weight_row_num, 426 | const int max_act_num, float *d_cmprs_prev_bp_deltas, 427 | float *d_adam_ts, float *d_bias_adam_ts) { 428 | extern __shared__ char smem[]; 429 | float *s_bp_deltas = (float *)smem; // max_act_num 430 | int *s_act_nodes = (int *)(s_bp_deltas + max_act_num); // max_act_num 431 | 432 | const int act_begin = csc_acts.d_offsets[blockIdx.x]; 433 | const int act_end = csc_acts.d_offsets[blockIdx.x + 1]; 434 | const int act_size = act_end - act_begin; 435 | const int prev_begin = csc_prev.d_offsets[blockIdx.x]; 436 | const int prev_end = csc_prev.d_offsets[blockIdx.x + 1]; 437 | 438 | assert(act_size <= max_act_num); 439 | 440 | FOR_IDX_ASYNC(act_idx, act_begin, act_end) { 441 | const int act_node = csc_acts.d_nodes[act_idx]; 442 | const float bp_delta = d_cmprs_bp_deltas[act_idx]; 443 | const int s_act_idx = act_idx - act_begin; 444 | s_act_nodes[s_act_idx] = act_node; 445 | s_bp_deltas[s_act_idx] = bp_delta; 446 | atomicAdd(d_bias_adam_ts + act_node, bp_delta); 447 | } 448 | __syncthreads(); 449 | 450 | FOR_IDX_ASYNC(prev_idx, prev_begin, prev_end) { 451 | const int prev_node = csc_prev.d_nodes[prev_idx]; 452 | const float prev_val = csc_prev.d_vals[prev_idx]; 453 | float prev_bp_delta = 0.; 454 | for (int s_act_idx = 0; s_act_idx < act_size; ++s_act_idx) { 455 | const int act_node = s_act_nodes[s_act_idx]; 456 | const int weight_idx = prev_node * weight_row_num + act_node; 457 | const float bp_delta = s_bp_deltas[s_act_idx]; 458 | if (prev_val > 0) { 459 | const float weight = d_weights_colmajor[weight_idx]; 460 | prev_bp_delta += bp_delta * weight; 461 | } 462 | atomicAdd(d_adam_ts + weight_idx, prev_val * bp_delta); 463 | } 464 | 465 | if (prev_val > 0) { 466 | prev_bp_delta += d_cmprs_prev_bp_deltas[prev_idx]; 467 | } 468 | d_cmprs_prev_bp_deltas[prev_idx] = prev_bp_delta; 469 | } 470 | } 471 | 472 | __global__ void bp_rowmajor_knl(const CscActNodes csc_acts, 473 | const CscActNodes csc_prev, 474 | const float *d_weights_rowmajor, 475 | const float *d_cmprs_bp_deltas, 476 | const int weight_col_num, const int max_act_num, 477 | float *d_cmprs_prev_bp_deltas, float *d_adam_ts, 478 | float *d_bias_adam_ts) { 479 | extern __shared__ char smem[]; 480 | float *s_bp_deltas = (float *)smem; // max_act_num 481 | int *s_act_nodes = (int *)(s_bp_deltas + max_act_num); // max_act_num 482 | 483 | const int act_begin = csc_acts.d_offsets[blockIdx.x]; 484 | const int act_end = csc_acts.d_offsets[blockIdx.x + 1]; 485 | const int act_size = act_end - act_begin; 486 | const int prev_begin = csc_prev.d_offsets[blockIdx.x]; 487 | const int prev_end = csc_prev.d_offsets[blockIdx.x + 1]; 488 | 489 | assert(act_size <= max_act_num); 490 | 491 | FOR_IDX_ASYNC(act_idx, act_begin, act_end) { 492 | const int act_node = csc_acts.d_nodes[act_idx]; 493 | const float bp_delta = d_cmprs_bp_deltas[act_idx]; 494 | const int s_act_idx = act_idx - act_begin; 495 | s_act_nodes[s_act_idx] = act_node; 496 | s_bp_deltas[s_act_idx] = bp_delta; 497 | atomicAdd(d_bias_adam_ts + act_node, bp_delta); 498 | } 499 | __syncthreads(); 500 | 501 | FOR_IDX_ASYNC(prev_idx, prev_begin, prev_end) { 502 | const int prev_node = csc_prev.d_nodes[prev_idx]; 503 | const float prev_val = 
csc_prev.d_vals[prev_idx]; 504 | float prev_bp_delta = 0.; 505 | for (int s_act_idx = 0; s_act_idx < act_size; ++s_act_idx) { 506 | const int act_node = s_act_nodes[s_act_idx]; 507 | const int weight_idx = act_node * weight_col_num + prev_node; 508 | const float bp_delta = s_bp_deltas[s_act_idx]; 509 | if (prev_val > 0) { 510 | const float weight = d_weights_rowmajor[weight_idx]; 511 | prev_bp_delta += bp_delta * weight; 512 | } 513 | atomicAdd(d_adam_ts + weight_idx, prev_val * bp_delta); 514 | } 515 | 516 | if (prev_val > 0) { 517 | prev_bp_delta += d_cmprs_prev_bp_deltas[prev_idx]; 518 | } 519 | d_cmprs_prev_bp_deltas[prev_idx] = prev_bp_delta; 520 | } 521 | } 522 | 523 | __global__ void bp_rowmajor_no_sm_knl(const CscActNodes csc_acts, 524 | const CscActNodes csc_prev, 525 | const float *d_weights_rowmajor, 526 | const float *d_cmprs_bp_deltas, 527 | const int weight_col_num, 528 | float *d_cmprs_prev_bp_deltas, 529 | float *d_adam_ts, float *d_bias_adam_ts) { 530 | const int act_begin = csc_acts.d_offsets[blockIdx.x]; 531 | const int act_end = csc_acts.d_offsets[blockIdx.x + 1]; 532 | const int prev_begin = csc_prev.d_offsets[blockIdx.x]; 533 | const int prev_end = csc_prev.d_offsets[blockIdx.x + 1]; 534 | 535 | FOR_IDX_ASYNC(act_idx, act_begin, act_end) { 536 | const int act_node = csc_acts.d_nodes[act_idx]; 537 | const float bp_delta = d_cmprs_bp_deltas[act_idx]; 538 | atomicAdd(d_bias_adam_ts + act_node, bp_delta); 539 | } 540 | __syncthreads(); 541 | 542 | FOR_IDX_ASYNC(prev_idx, prev_begin, prev_end) { 543 | const int prev_node = csc_prev.d_nodes[prev_idx]; 544 | const float prev_val = csc_prev.d_vals[prev_idx]; 545 | float prev_bp_delta = 0.; 546 | for (int act_idx = act_begin; act_idx < act_end; ++act_idx) { 547 | const int act_node = csc_acts.d_nodes[act_idx]; 548 | const int weight_idx = act_node * weight_col_num + prev_node; 549 | const float bp_delta = d_cmprs_bp_deltas[act_idx]; 550 | if (prev_val > 0) { 551 | const float weight = d_weights_rowmajor[weight_idx]; 552 | prev_bp_delta += bp_delta * weight; 553 | } 554 | atomicAdd(d_adam_ts + weight_idx, prev_val * bp_delta); 555 | } 556 | 557 | if (prev_val > 0) { 558 | prev_bp_delta += d_cmprs_prev_bp_deltas[prev_idx]; 559 | } 560 | d_cmprs_prev_bp_deltas[prev_idx] = prev_bp_delta; 561 | } 562 | } 563 | 564 | __global__ void bp_rowmajor_slide_knl( 565 | const CscActNodes csc_acts, const CscActNodes csc_prev, 566 | const float *d_weights_rowmajor, const float *d_cmprs_bp_deltas, 567 | const int weight_col_num, const int max_prev_num, 568 | float *d_cmprs_prev_bp_deltas, float *d_adam_ts, float *d_bias_adam_ts) { 569 | extern __shared__ char smem[]; 570 | float *s_prev_bp_deltas = (float *)smem; // max_prev_num 571 | int *s_prev_nodes = (int *)(s_prev_bp_deltas + max_prev_num); // max_prev_num 572 | float *s_prev_vals = (float *)(s_prev_nodes + max_prev_num); // max_prev_num 573 | 574 | const int act_begin = csc_acts.d_offsets[blockIdx.x]; 575 | const int act_end = csc_acts.d_offsets[blockIdx.x + 1]; 576 | const int prev_begin = csc_prev.d_offsets[blockIdx.x]; 577 | const int prev_end = csc_prev.d_offsets[blockIdx.x + 1]; 578 | const int prev_size = prev_end - prev_begin; 579 | 580 | assert(prev_size <= max_prev_num); 581 | 582 | FOR_IDX_ASYNC(s_prev_idx, 0, prev_size) { 583 | const int prev_idx = s_prev_idx + prev_begin; 584 | const float prev_val = csc_prev.d_vals[prev_idx]; 585 | s_prev_nodes[s_prev_idx] = csc_prev.d_nodes[prev_idx]; 586 | s_prev_vals[s_prev_idx] = prev_val; 587 | s_prev_bp_deltas[s_prev_idx] = 588 | 
prev_val > 0 ? d_cmprs_prev_bp_deltas[prev_idx] : 0; 589 | } 590 | __syncthreads(); 591 | 592 | FOR_IDX_SYNC(act_idx, act_begin, act_end) { 593 | int act_node; 594 | float bp_delta; 595 | if (act_idx < act_end) { 596 | act_node = csc_acts.d_nodes[act_idx]; 597 | bp_delta = d_cmprs_bp_deltas[act_idx]; 598 | atomicAdd(d_bias_adam_ts + act_node, bp_delta); 599 | } 600 | 601 | // TODO: better utilize the bandwidth 602 | for (int s_prev_idx = 0; s_prev_idx < prev_size; ++s_prev_idx) { 603 | int prev_node, weight_idx; 604 | float prev_val, thread_inc = 0; 605 | if (act_idx < act_end) { 606 | prev_node = s_prev_nodes[s_prev_idx]; 607 | prev_val = s_prev_vals[s_prev_idx]; 608 | weight_idx = act_node * weight_col_num + prev_node; 609 | if (prev_val > 0) { 610 | const float weight = d_weights_rowmajor[weight_idx]; 611 | thread_inc = bp_delta * weight; 612 | } 613 | } 614 | thread_inc = block_reduce(thread_inc); 615 | if (threadIdx.x == 0) s_prev_bp_deltas[s_prev_idx] += thread_inc; 616 | if (act_idx < act_end) { 617 | atomicAdd(d_adam_ts + weight_idx, prev_val * bp_delta); 618 | } 619 | } 620 | } 621 | __syncthreads(); 622 | 623 | FOR_IDX_ASYNC(prev_idx, prev_begin, prev_end) { 624 | d_cmprs_prev_bp_deltas[prev_idx] = s_prev_bp_deltas[prev_idx - prev_begin]; 625 | } 626 | } 627 | 628 | __global__ void bp_first_layer_knl(const CscActNodes csc_acts, 629 | const CscActNodes csc_prev, 630 | const float *d_cmprs_bp_deltas, 631 | const int weight_row_num, 632 | const int max_act_num, float *d_adam_ts, 633 | float *d_bias_adam_ts) { 634 | extern __shared__ char smem[]; 635 | float *s_bp_deltas = (float *)smem; // max_act_num 636 | int *s_act_nodes = (int *)(s_bp_deltas + max_act_num); // max_act_num 637 | 638 | const int act_begin = csc_acts.d_offsets[blockIdx.x]; 639 | const int act_end = csc_acts.d_offsets[blockIdx.x + 1]; 640 | const int act_size = act_end - act_begin; 641 | const int prev_begin = csc_prev.d_offsets[blockIdx.x]; 642 | const int prev_end = csc_prev.d_offsets[blockIdx.x + 1]; 643 | 644 | assert(act_size <= max_act_num); 645 | 646 | FOR_IDX_ASYNC(act_idx, act_begin, act_end) { 647 | const int act_node = csc_acts.d_nodes[act_idx]; 648 | const float bp_delta = d_cmprs_bp_deltas[act_idx]; 649 | const int s_act_idx = act_idx - act_begin; 650 | s_act_nodes[s_act_idx] = act_node; 651 | s_bp_deltas[s_act_idx] = bp_delta; 652 | atomicAdd(d_bias_adam_ts + act_node, bp_delta); 653 | } 654 | __syncthreads(); 655 | 656 | FOR_IDX_ASYNC(prev_idx, prev_begin, prev_end) { 657 | const int prev_node = csc_prev.d_nodes[prev_idx]; 658 | const float prev_val = csc_prev.d_vals[prev_idx]; 659 | for (int s_act_idx = 0; s_act_idx < act_size; ++s_act_idx) { 660 | const int act_node = s_act_nodes[s_act_idx]; 661 | const int weight_idx = prev_node * weight_row_num + act_node; 662 | const float bp_delta = s_bp_deltas[s_act_idx]; 663 | atomicAdd(d_adam_ts + weight_idx, prev_val * bp_delta); 664 | } 665 | } 666 | } 667 | 668 | __global__ void update_weights_knl(float *d_weights, float *d_adam_ts, 669 | float *d_adam_moms, float *d_adam_vels, 670 | const float lr, const int weight_size) { 671 | const int idx = threadIdx.x + blockIdx.x * blockDim.x; 672 | if (idx >= weight_size) return; 673 | 674 | // const float t = d_adam_ts[idx]; 675 | // d_adam_ts[idx] = 0; 676 | const float t = atomicExch(d_adam_ts + idx, 0); 677 | 678 | float mom = d_adam_moms[idx]; 679 | d_adam_moms[idx] = mom = BETA1 * mom + (1 - BETA1) * t; 680 | 681 | float vel = d_adam_vels[idx]; 682 | d_adam_vels[idx] = vel = BETA2 * vel + (1 - BETA2) * t * t; 
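// Adam's bias correction is not applied in this kernel; it is folded into the
// per-iteration learning rate (tmplr) computed on the host in main.cu, so only the
// raw (uncorrected) first and second moments are maintained here.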
683 | 684 | // d_weights[idx] += lr * mom / (sqrtf(vel) + EPS); 685 | // atomicAdd(d_weights + idx, lr * mom / (sqrtf(vel) + EPS)); 686 | 687 | // d_weights[idx] += __fdividef(lr * mom, sqrtf(vel) + EPS); 688 | atomicAdd(d_weights + idx, __fdividef(lr * mom, sqrtf(vel) + EPS)); 689 | } 690 | --------------------------------------------------------------------------------
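Note: for readers tracing the optimizer, the per-weight arithmetic in update_weights_knl combined with the bias-corrected learning rate (tmplr) from main.cu is equivalent to the host-side sketch below. This is an illustrative example only, not code from the repository; adam_step and its parameter names are hypothetical.

#include <cmath>

// Minimal CPU sketch of the Adam step performed by update_weights_knl.
// `grad_sum` plays the role of the accumulated d_adam_ts entry and `itr` is the
// 0-based global iteration counter (glb_itr in main.cu).
void adam_step(float &weight, float &mom, float &vel, const float grad_sum,
               const float lr, const int itr, const float beta1 = 0.9f,
               const float beta2 = 0.999f, const float eps = 1e-8f) {
  mom = beta1 * mom + (1.f - beta1) * grad_sum;             // first moment (d_adam_moms)
  vel = beta2 * vel + (1.f - beta2) * grad_sum * grad_sum;  // second moment (d_adam_vels)

  // Bias correction folded into the learning rate, matching tmplr in main.cu.
  const float tmplr = lr * std::sqrt(1.f - std::pow(beta2, itr + 1)) /
                      (1.f - std::pow(beta1, itr + 1));

  // Same sign convention as the kernel: the accumulated deltas are already
  // negative gradients, so the step is added to the weight.
  weight += tmplr * mom / (std::sqrt(vel) + eps);
}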