├── .gitignore ├── tests ├── test_runner.cpp ├── testbin.c └── tests.cpp ├── LICENSE.txt ├── Makefile ├── blas_enum.h ├── README.md ├── librfn.h ├── basic_python_implementation.py ├── cpu_operations.cpp ├── rfn.py ├── cpu_operations.h ├── blas_sparse_proto.h ├── librfn.cpp ├── gpu_operations.h └── gpu_operations.cu /.gitignore: -------------------------------------------------------------------------------- 1 | RFN/* 2 | *.o 3 | *.so 4 | *.npy 5 | *.pyc 6 | *.log 7 | *.pkl 8 | *.mp4 9 | .ipynb_checkpoints 10 | -------------------------------------------------------------------------------- /tests/test_runner.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include "catch.hpp" 3 | 4 | 5 | /* As explained in 6 | * https://github.com/philsquared/Catch/blob/master/docs/slow-compiles.md 7 | * This file is only here to speed up compilation 8 | * by keeping the runner-implementation in its own file 9 | */ 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | librfn: An implementation of Rectified Factor Networks 2 | Copyright (C) 2014-2017 Thomas Unterthiner 3 | Additional contributions by Thomas Adler, Balázs Bencze 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
18 | -------------------------------------------------------------------------------- /tests/testbin.c: -------------------------------------------------------------------------------- 1 | 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | #include <math.h> 6 | #include <sys/time.h> 7 | #include "../librfn.h" 8 | 9 | #ifndef M_PI 10 | #define M_PI 3.14159265358979323846 11 | #endif 12 | 13 | 14 | // random in [0, 1) 15 | static double rand_unif(void) { 16 | return (rand())/(RAND_MAX+1.0); 17 | } 18 | /* 19 | // generates random samples from a 0/1 Gaussian via Box-Muller 20 | static double rand_normal(void) { 21 | return sqrt(-2.0*log(rand_unif())) * cos(2.0*M_PI*rand_unif()); 22 | } 23 | */ 24 | 25 | float time_diff(struct timeval *t2, struct timeval *t1) { 26 | long int diff = (t2->tv_usec + 1000000 * t2->tv_sec) - (t1->tv_usec + 1000000 * t1->tv_sec); 27 | return diff / 1000000.0f; 28 | } 29 | 30 | 31 | 32 | int main(int argc, char** argv) { 33 | int n = 50000; 34 | int m = 784; 35 | int k = 2048; 36 | int n_iter = 10; 37 | int gpu_id = -1; 38 | 39 | if (argc > 1) 40 | k = atoi(argv[1]); 41 | 42 | if (argc > 2) 43 | n_iter = atoi(argv[2]); 44 | 45 | if (argc > 3) 46 | m = atoi(argv[3]); 47 | 48 | if (argc > 4) 49 | gpu_id = atoi(argv[4]); 50 | 51 | 52 | float* X = (float*) malloc(n*m*sizeof(float)); 53 | float* W = (float*) malloc(n*k*sizeof(float)); 54 | float* P = (float*) malloc(m*sizeof(float)); 55 | 56 | for (int i = 0; i < n*m; ++i) 57 | X[i] = 5.0f* rand_unif() - 0.5; 58 | for (int i = 0; i < n*k; ++i) 59 | W[i] = rand_unif() - 0.5; 60 | 61 | struct timeval t0, t1; 62 | gettimeofday(&t0, 0); 63 | train_gpu(X, W, P, n, m, k, n_iter, 0.1, 0.1, 1e-2, 0.0, 0.0, 32, gpu_id); 64 | //train_cpu(X, W, P, n, m, k, n_iter, 0.1, 0.1, 1e-2, 0.0, 0.0, 32); 65 | gettimeofday(&t1, 0); 66 | printf("time for rfn: %3.4fs\n", time_diff(&t1, &t0)); 67 | free(X); 68 | free(W); 69 | free(P); 70 | return 0; 71 | } 72 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc -std=c++11 2 | CC := gcc -std=c99 3 | CXX := g++ -std=c++11 4 | LINK := g++ 5 | USEGPU = yes 6 | DEBUG = no 7 | 8 | 9 | # BLAS/LAPACK implementation 10 | # Change these lines to switch to another BLAS/LAPACK implementation (e.g. MKL) 11 | LIBS=-llapack -lblas 12 | #INCPATH=-I/opt/intel/mkl/include/ 13 | #LIBS=-L/opt/intel/mkl/lib/intel64/ -lmkl_rt 14 | 15 | 16 | GPUINC=-I/usr/local/cuda/include 17 | GPULIB=-L/usr/local/cuda/lib64 18 | GPUSO=-lcublas -lcurand -lcuda -lcudart -lcusolver -lgomp -lcusparse 19 | 20 | 21 | ifeq ($(DEBUG), no) 22 | CFLAGS=-O3 -DNDEBUG -Wall -fPIC -march=native 23 | LDFLAGS=-O3 -flto -Wall -fPIC 24 | else 25 | CFLAGS=-g -Wall -fPIC -march=native $(INCPATH) 26 | LDFLAGS= -g -Wall -fPIC $(LIBPATH) $(LIBS) 27 | endif 28 | 29 | ifeq ($(USEGPU),yes) 30 | INCPATH+=$(GPUINC) 31 | LIBS+=$(GPULIB) $(GPUSO) 32 | else 33 | CFLAGS+=-DNOGPU 34 | endif 35 | 36 | CFLAGS+=$(INCPATH) 37 | LDFLAGS+=$(LIBPATH) $(LIBS) 38 | CXXFLAGS=$(CFLAGS) 39 | 40 | 41 | # add/remove GPU architectures as required 42 | NVCCFLAGS=--use_fast_math $(addprefix -Xcompiler , $(CXXFLAGS)) \ 43 | -gencode arch=compute_30,code=sm_35 \ 44 | -gencode arch=compute_50,code=sm_50 \ 45 | -gencode arch=compute_52,code=sm_52 \ 46 | -gencode arch=compute_61,code=sm_61 47 | 48 | 49 | SOURCES=librfn.cpp cpu_operations.cpp nist_spblas.cc 50 | OBJECTS=librfn.o cpu_operations.o nist_spblas.o 51 | 52 | ifeq ($(USEGPU),yes) 53 | SOURCES+=gpu_operations.cu 54 |
OBJECTS+=gpu_operations.o 55 | endif 56 | 57 | all: $(SOURCES) librfn.so 58 | 59 | test: gpu_operations.o cpu_operations.o tests/tests.o tests/test_runner.o 60 | g++ $(LDFLAGS) $^ -o $@ $(LIBS) 61 | ./test 62 | 63 | testbin: librfn.so tests/testbin.o 64 | gcc tests/testbin.o -o testbin $(LIBPATH) $(LDFLAGS) -L./ -lrfn 65 | 66 | librfn.so: $(OBJECTS) 67 | $(CXX) $(LDFLAGS) $^ -o $@ $(LIBS) -shared 68 | 69 | gpu_operations.o: gpu_operations.cu 70 | $(NVCC) $(NVCCFLAGS) -o $@ -c $< 71 | 72 | clean: 73 | rm -rf *.o librfn.so tests/*.o 74 | -------------------------------------------------------------------------------- /blas_enum.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAS_ENUM_H 2 | #define BLAS_ENUM_H 3 | 4 | /* Enumerated types */ 5 | 6 | enum blas_order_type { 7 | blas_rowmajor = 101, 8 | blas_colmajor = 102 }; 9 | 10 | enum blas_trans_type { 11 | blas_no_trans = 111, 12 | blas_trans = 112, 13 | blas_conj_trans = 113 }; 14 | 15 | enum blas_uplo_type { 16 | blas_upper = 121, 17 | blas_lower = 122 }; 18 | 19 | enum blas_diag_type { 20 | blas_non_unit_diag = 131, 21 | blas_unit_diag = 132 }; 22 | 23 | enum blas_side_type { 24 | blas_left_side = 141, 25 | blas_right_side = 142 }; 26 | 27 | enum blas_cmach_type { 28 | blas_base = 151, 29 | blas_t = 152, 30 | blas_rnd = 153, 31 | blas_ieee = 154, 32 | blas_emin = 155, 33 | blas_emax = 156, 34 | blas_eps = 157, 35 | blas_prec = 158, 36 | blas_underflow = 159, 37 | blas_overflow = 160, 38 | blas_sfmin = 161}; 39 | 40 | enum blas_norm_type { 41 | blas_one_norm = 171, 42 | blas_real_one_norm = 172, 43 | blas_two_norm = 173, 44 | blas_frobenius_norm = 174, 45 | blas_inf_norm = 175, 46 | blas_real_inf_norm = 176, 47 | blas_max_norm = 177, 48 | blas_real_max_norm = 178 }; 49 | 50 | enum blas_sort_type { 51 | blas_increasing_order = 181, 52 | blas_decreasing_order = 182 }; 53 | 54 | enum blas_conj_type { 55 | blas_conj = 191, 56 | blas_no_conj = 192 }; 57 | 58 | enum blas_jrot_type { 59 | blas_jrot_inner = 201, 60 | blas_jrot_outer = 202, 61 | blas_jrot_sorted = 203 }; 62 | 63 | enum blas_prec_type { 64 | blas_prec_single = 211, 65 | blas_prec_double = 212, 66 | blas_prec_indigenous = 213, 67 | blas_prec_extra = 214 }; 68 | 69 | enum blas_base_type { 70 | blas_zero_base = 221, 71 | blas_one_base = 222 }; 72 | 73 | enum blas_symmetry_type { 74 | blas_general = 231, 75 | blas_symmetric = 232, 76 | blas_hermitian = 233, 77 | blas_triangular = 234, 78 | blas_lower_triangular = 235, 79 | blas_upper_triangular = 236, 80 | blas_lower_symmetric = 237, 81 | blas_upper_symmetric = 238, 82 | blas_lower_hermitian = 239, 83 | blas_upper_hermitian = 240 }; 84 | 85 | enum blas_field_type { 86 | blas_complex = 241, 87 | blas_real = 242, 88 | blas_double_precision = 243, 89 | blas_single_precision = 244 }; 90 | 91 | enum blas_size_type { 92 | blas_num_rows = 251, 93 | blas_num_cols = 252, 94 | blas_num_nonzeros = 253 }; 95 | 96 | enum blas_handle_type{ 97 | blas_invalid_handle = 261, 98 | blas_new_handle = 262, 99 | blas_open_handle = 263, 100 | blas_valid_handle = 264}; 101 | 102 | enum blas_sparsity_optimization_type { 103 | blas_regular = 271, 104 | blas_irregular = 272, 105 | blas_block = 273, 106 | blas_unassembled = 274 }; 107 | 108 | #endif 109 | /* BLAS_ENUM_H */ 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # librfn: Rectified Factor Networks 2 | 3 | Rectified Factor 
Networks (RFNs) are an unsupervised technique that learns a non-linear, high-dimensional representation of its input. The underlying algorithm has been published in 4 | 5 | *Rectified Factor Networks*, Djork-Arné Clevert, Andreas Mayr, Thomas Unterthiner, Sepp Hochreiter, NIPS 2015. 6 | 7 | librfn is implemented in C++ and can easily be integrated into existing code bases. It also contains a high-level Python wrapper for ease of use. The library can run in either CPU or GPU mode. For larger models, the GPU mode offers substantial speedups and is the recommended mode. 8 | 9 | 10 | # Installation 11 | 12 | 1. (optional) Adjust the Makefile to your needs 13 | 2. Type `make` to start the building process 14 | 3. To use the python wrapper, just copy `rfn.py` and `librfn.so` into your working directory. 15 | 16 | 17 | # Requirements 18 | To run the GPU code, you need a CUDA 7.5 (or higher) compatible GPU. While in theory CUDA 7.0 is also supported, it contains a bug that results in a memory leak when running librfn (and your program is likely to crash with an out-of-memory error). 19 | 20 | If you do not have access to a GPU, you can disable GPU support by setting `USEGPU = no` in the Makefile. 21 | 22 | Note that librfn makes heavy use of BLAS and LAPACK, so make sure to link against a high-quality implementation (e.g. OpenBLAS or MKL) by modifying the Makefile to get optimal speed. 23 | 24 | 25 | # Usage 26 | 27 | The following code trains an RFN on MNIST and plots the resulting filters: 28 | 29 |     import numpy as np 30 |     import matplotlib.pyplot as plt 31 | 32 |     from sklearn.datasets import fetch_mldata 33 |     mnist = fetch_mldata('MNIST original') 34 |     X = mnist['data'] / 255.0 35 | 36 |     from rfn import * 37 |     W, P, Wout = train_rfn(X, 128, 500, 0.1, 0.1, 1e-1, 0.0, gpu_id=0) 38 | 39 |     # plot weights 40 |     fig, ax = plt.subplots(5, 5, figsize=(8, 8)) 41 |     for i, a in enumerate(ax.flat): 42 |         a.pcolorfast(W[i].reshape(28, 28), cmap=plt.cm.Greys_r) 43 |         a.set_ylim(28, 0) 44 |         a.grid("off") 45 |         a.set_axis_off() 46 |     fig.subplots_adjust(0, 0, 1, 1, 0, 0) 47 |     fig 48 | 49 |     # calculate hidden units and reconstructions 50 |     H = np.maximum(0, np.dot(Wout, X.T)) 51 |     R = np.dot(H.T, W) 52 | 53 |     # plot reconstructions 54 |     np.random.shuffle(R)  # shuffle samples, otherwise we only plot 0s 55 |     fig, ax = plt.subplots(5, 5, figsize=(8, 8)) 56 |     for i, a in enumerate(ax.flat): 57 |         a.pcolorfast(R[i].reshape(28, 28), cmap=plt.cm.Greys_r) 58 |         a.set_ylim(28, 0) 59 |         a.grid("off") 60 |         a.set_axis_off() 61 |     fig.subplots_adjust(0, 0, 1, 1, 0, 0) 62 |     fig 63 | 64 | 65 | # Implementation Note 66 | 67 | The RFN algorithm is based on the EM algorithm. Within the E-step, the published algorithm includes a projection procedure that can be implemented in several ways (see the RFN paper's supplemental section 9). To make sure no optimization constraints are violated during this projection, the original publication tries the simplest method first and falls back to increasingly complicated updates if the simpler methods fail (suppl. section 9.5.3). 68 | In contrast, librfn always uses the simplest/fastest projection method. This is a simplification/approximation of the original algorithm that nevertheless works very well in practice. 69 | 70 | 71 | # License 72 | librfn was developed by Thomas Unterthiner and is licensed under the [General Public License (GPL) Version 2 or higher](http://www.gnu.org/licenses/gpl-2.0.html). See ``LICENSE.txt`` for details.
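
`rfn.py` additionally exposes a scikit-learn style wrapper class, `RectifiedFactorNetwork`. A minimal sketch of its use (the hyperparameter values below are illustrative only):

    from rfn import RectifiedFactorNetwork

    rfn = RectifiedFactorNetwork(n_hidden=128, n_iter=500, gpu_id="cpu")
    rfn.fit(X)
    H = rfn.transform(X)           # hidden activations
    R = rfn.inverse_transform(H)   # reconstructions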
73 | -------------------------------------------------------------------------------- /librfn.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBRFN_H 2 | #define LIBRFN_H 3 | 4 | /* 5 | Copyright © 2015-2017 Thomas Unterthiner 6 | Additional Contributions by Thomas Adler, Balázs Bencze 7 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 8 | */ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | 15 | const int USE_CPU = 2; 16 | 17 | 18 | /** 19 | * Trains an RFN network. 20 | * 21 | * Note: All arguments are assumed to be in C-order (ie., row-major) 22 | * and in host (ie., CPU) memory. 23 | * If necessary, any transfers from and to the GPU will be 24 | * done internally by the function itself. 25 | * 26 | * @param X [n, m] data matrix, with 1 sample per row 27 | * @param W [k, m] weight matrix, expected to be pre-initialized 28 | * @param P [m, ] vector, used to store Psi 29 | * @param n number of samples 30 | * @param m number of input features 31 | * @param k number of hidden units 32 | * @param n_iter number of iterations the algorithm will run 33 | * @param learnrate learnrate 34 | * @param dropout_rate the dropout rate for hidden activations 35 | * @param input_dropout_rate the dropout rate for input units 36 | * @param seed seed for the random number generation 37 | * @param gpu_id ID of the GPU that this will run on 38 | * If this is -1 use the GPU with the most free memory 39 | * If this is -2, the CPU is used instead of the GPU 40 | * 41 | * @return 0 on success, 1 otherwise. The trained network will be stored 42 | * in the W_host and P_host variables. 43 | */ 44 | int train_rfn(const float* X, float* W, float* P, const int n, 45 | const int m, const int k, const int n_iter, int batch_size, 46 | const float etaW, const float etaP, const float minP, const float h_threshold, 47 | const float dropout_rate, const float input_noise_rate, 48 | const float l2_weightdecay, const float l1_weightdecay, 49 | const float momentum, 50 | const int noise_type, const int activation_type, const int apply_scaling, 51 | const int applyNewtonUpdate, unsigned long seed, int gpu_id); 52 | 53 | 54 | /** 55 | * Trains an RFN network. 56 | * The parameters are the same as in `int train_rfn`, except that X is encoded 57 | * as a sparse matrix in CSR format. 58 | * 59 | * Note: the number of nonzero elements of X should be stored in Xrowptr[n] 60 | */ 61 | int train_rfn_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 62 | float* W, float* P, const int n, 63 | const int m, const int k, const int n_iter, int batch_size, 64 | const float etaW, const float etaP, const float minP, const float h_threshold, 65 | const float dropout_rate, const float input_noise_rate, 66 | const float l2_weightdecay, const float l1_weightdecay, 67 | const float momentum, 68 | const int noise_type, const int activation_type, const int apply_scaling, 69 | const int applyNewtonUpdate, unsigned long seed, int gpu_id); 70 | 71 | /** 72 | * Given a trained RFN, this will calculate the weights that are used to 73 | * estimate the hidden activations. 74 | * 75 | * This needs access to the training data, as the W need to incorporate 76 | * the scaling that would otherwise be done on the hidden activations. 77 | * The scaling parameters have to be fitted on the training data's H. 78 | * 79 | * Note: All arguments are assumed to be in C-order (ie., row-major) 80 | * and in host (ie., CPU) memory. 
Any necessary transfers from and to the GPU 81 | * will be done internally by the function itself. 82 | * 83 | * @param X [n, m] training data matrix, with 1 sample per row 84 | * @param W [k, m] RFN weight matrix 85 | * @param P [m] vector, contains Psi 86 | * @param Wout [k, m] output weight matrix 87 | * @param n number of training samples 88 | * @param m number of input features 89 | * @param k number of hidden units 90 | * @param gpu_id ID of the GPU that this will run on 91 | * If this is -1 use the GPU with the most free memory 92 | * If this is -2, the CPU is used instead of the GPU 93 | */ 94 | void calculate_W(const float* X, const float* W, const float* P, float* Wout, 95 | const int n, const int m, const int k, 96 | const int activation_type, const int apply_scaling, const float h_threshold, 97 | int gpu_id); 98 | 99 | /** 100 | * Given a trained RFN, this will calculate the weights that are used to 101 | * estimate the hidden activations. 102 | * 103 | * The parameters are the same as in `void calculate_W`, except that X is encoded 104 | * as a sparse matrix in CSR format. 105 | * 106 | * Note: the number of nonzero elements of X should be stored in Xrowptr[n] 107 | */ 108 | void calculate_W_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 109 | const float* W, const float* P, float* Wout, 110 | const int n, const int m, const int k, 111 | const int activation_type, const int apply_scaling, const float h_threshold, 112 | int gpu_id); 113 | 114 | #ifdef __cplusplus 115 | } 116 | #endif 117 | 118 | #endif /* LIBRFN_H */ 119 | -------------------------------------------------------------------------------- /basic_python_implementation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Implements the RFN algorithm as easily understandable code. 4 | 5 | Copyright © 2015 Thomas Unterthiner 6 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 7 | 8 | Contains a very basic CPU and a GPU implementation that is easy to understand. 9 | This code is meant as an instructional ressource, and not suited for production 10 | runs. 11 | 12 | The GPU implementation assumes that scikits.cuda.linalg works properly 13 | (which in turn requires CULA). Also, this requires the current development 14 | version of scikits.cuda (as of 2014-08-11). 
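
A minimal usage sketch for the CPU variant (hyperparameter values are
illustrative only; X is assumed to be a 2d float32 numpy array):

    W, H, P = train_rfn_cpu(X, n_hidden=64, n_iter=100,
                            learnrateW=0.1, learnratePsi=0.1,
                            dropout_rate=0.0)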
15 | """ 16 | 17 | import time 18 | import numpy as np 19 | from scikits.cuda import linalg as la 20 | import pycuda.curandom as curand 21 | import pycuda.gpuarray as gpu 22 | import pycuda.elementwise as el 23 | import pycuda.driver as drv 24 | 25 | from pycuda.compiler import SourceModule 26 | from pycuda.tools import DeviceMemoryPool 27 | from scikits.cuda.cublas import cublasSgemv 28 | from pycuda.elementwise import ElementwiseKernel 29 | from pycuda import cumath 30 | 31 | _dropout_kernel = None 32 | _saltpepper_kernel = None 33 | _rng_state = None 34 | _rng_blocks = 128 35 | _rng_threads = 128 36 | 37 | _mempool = DeviceMemoryPool() 38 | 39 | def init_rng(seed): 40 | global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks 41 | from pycuda.characterize import sizeof 42 | ds = sizeof("curandState", "#include ") 43 | _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds) 44 | 45 | src = SourceModule( 46 | ''' 47 | #include 48 | 49 | extern "C" 50 | { 51 | __global__ void setup_rng(curandState* rng_state, const unsigned seed) 52 | { 53 | const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x; 54 | curand_init(seed, tid, 0, &rng_state[tid]); 55 | } 56 | 57 | __global__ void dropout_eltw(float* x, const unsigned size, 58 | float dropout_rate, 59 | curandState* rng_state) { 60 | const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x; 61 | const unsigned num_threads = gridDim.x*blockDim.x; 62 | curandState localState = rng_state[tid]; 63 | for (unsigned i = tid; i < size; i += num_threads) 64 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i]; 65 | rng_state[tid] = localState; 66 | } 67 | 68 | __global__ void saltpepper_eltw(float* x, const unsigned size, 69 | float dropout_rate, 70 | curandState* rng_state) { 71 | const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x; 72 | const unsigned num_threads = gridDim.x*blockDim.x; 73 | curandState localState = rng_state[tid]; 74 | for (unsigned i = tid; i < size; i += num_threads) 75 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i]; 76 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i]; 77 | rng_state[tid] = localState; 78 | } 79 | } 80 | ''', no_extern_c=True) 81 | setup_rng = src.get_function("setup_rng") 82 | setup_rng.prepare("Pi") 83 | setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1), 84 | _rng_state, np.uint32(seed)) 85 | _dropout_kernel = src.get_function("dropout_eltw") 86 | _dropout_kernel.prepare("PifP") 87 | _saltpepper_kernel = src.get_function("saltpepper_eltw") 88 | _saltpepper_kernel.prepare("PifP") 89 | 90 | 91 | def dropout(X, dropout_rate): 92 | _dropout_kernel.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1), 93 | X.gpudata, np.prod(X.shape), np.float32(dropout_rate), _rng_state) 94 | return X 95 | 96 | 97 | def saltpepper_noise(X, dropout_rate): 98 | _saltpepper_kernel.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1), 99 | X.gpudata, np.prod(X.shape), np.float32(dropout_rate), _rng_state) 100 | return X 101 | 102 | _unitvariance_step1_kernel = ElementwiseKernel( 103 | "float* X, float* mean, float* Xsq, const unsigned height", 104 | "float tmp = X[i] - mean[i % height]; Xsq[i] = tmp*tmp;") 105 | 106 | _unitvariance_step2_kernel = ElementwiseKernel( 107 | "float* work1, const unsigned k", 108 | "work1[i] = (work1[i] > 1e-7) ? 
rsqrtf(work1[i]) : 1.0;") 109 | 110 | _unitvariance_step3_kernel = ElementwiseKernel( 111 |     "float* X, float* mean, const unsigned height", 112 |     "X[i] *= mean[i % height];") 113 | 114 | def to_unit_variance(H): 115 |     ''' Scales H so that each column has a variance of 1. ''' 116 |     from scikits.cuda.misc import _global_cublas_handle as cublas_handle 117 |     ones = gpu.empty((H.shape[0], 1), np.float32, allocator=_mempool.allocate) 118 |     ones.fill(1.0) 119 |     Hsq = gpu.empty(H.shape, np.float32, allocator=_mempool.allocate) 120 |     mean = gpu.empty((1, H.shape[1]), np.float32, allocator=_mempool.allocate) 121 |     cublasSgemv(cublas_handle, "n", H.shape[1], H.shape[0], 122 |                 1.0/H.shape[0], H.gpudata, H.shape[1], ones.gpudata, 123 |                 1, 0.0, mean.gpudata, 1) 124 |     _unitvariance_step1_kernel(H, mean, Hsq, H.shape[1]) 125 |     cublasSgemv(cublas_handle, "n", Hsq.shape[1], H.shape[0], 126 |                 1.0/H.shape[0], Hsq.gpudata, H.shape[1], ones.gpudata, 127 |                 1, 0.0, mean.gpudata, 1) 128 |     _unitvariance_step2_kernel(mean, H.shape[1]) 129 |     _unitvariance_step3_kernel(H, mean, H.shape[1]) 130 |     return H 131 | 132 | 133 | def calculate_H_gpu(X, W, P): 134 |     WPW = la.add_diag(P, la.dot(W, W, "t", "n")) 135 |     tmp = la.dot(W, la.inv(WPW, overwrite=True)) 136 |     H = la.dot(X, tmp, "n", "t") 137 |     H = gpu.maximum(H, 0) 138 |     H = to_unit_variance(H) 139 |     return H, tmp 140 | 141 | 142 | def train_rfn_gpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate, input_dropout_rate, minPsi=0.1, seed=32): 143 |     k = n_hidden 144 |     n, m = X.shape 145 |     W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32) 146 |     P = np.array([0.1] * m, dtype=np.float32) 147 |     XXdiag = np.diag(np.dot(X.T, X) / n).copy()  # explicit copy to avoid numpy 1.8 warning 148 |     W = gpu.to_gpu(W, allocator=_mempool.allocate) 149 |     P = gpu.to_gpu(P, allocator=_mempool.allocate) 150 |     X = gpu.to_gpu(X, allocator=_mempool.allocate) 151 |     XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate) 152 |     I = la.eye(k, dtype=np.float32) 153 | 154 |     init_rng(seed) 155 |     t0 = time.time() 156 |     for cur_iter in range(n_iter): 157 |         H, tmp = calculate_H_gpu(X, W, P) 158 |         if dropout_rate > 0: 159 |             dropout(H, dropout_rate) 160 |         Xtmp = X 161 |         if input_dropout_rate > 0: 162 |             Xtmp = X.copy() 163 |             saltpepper_noise(Xtmp, input_dropout_rate) 164 |         U = la.dot(Xtmp, H, "t", "n") / n 165 |         S = la.dot(H, H, "t", "n") / n 166 |         S += I 167 |         S -= la.dot(tmp, W, "n", "t") 168 |         Cii = la.dot(la.dot(W, S, "t") - 2*U, W) 169 | 170 |         Sinv = la.inv(S, overwrite=True) 171 |         dW = la.dot(Sinv, U, "n", "t") - W 172 |         dP = XXdiag + la.diag(Cii) - P 173 | 174 |         W += learnrateW * dW 175 |         P += learnratePsi * dP 176 | 177 |         P = gpu.maximum(P, minPsi) 178 |         if cur_iter % 25 == 0: 179 |             print "iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0) 180 |     return W.get(), P.get() 181 | 182 | 183 | def train_rfn_cpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate): 184 |     n, m = X.shape 185 |     k = n_hidden 186 |     W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32) 187 |     P = np.array([0.1] * m) 188 |     H = np.zeros((k, n), dtype=np.float32) 189 |     C = np.dot(X.T, X) / n 190 | 191 |     t0 = time.time() 192 |     for cur_iter in range(n_iter): 193 |         I = np.eye(k, dtype=np.float32) 194 |         tmp = I + np.dot(W * 1.0/P, W.T) 195 |         tmp = np.linalg.inv(tmp) 196 |         Wout = np.dot(tmp, W) * (1.0/P) 197 |         H = np.dot(Wout, X.T) 198 | 199 |         H = np.maximum(0, H) 200 |         H /= (H.std(1) + 1e-9)[:, None] 201 |         if dropout_rate > 0: 202 |             H *= np.random.binomial(1, 1-dropout_rate,
size=H.shape).astype(np.float32) 203 | 204 | U = np.dot(X.T, H.T) / n 205 | S = (np.dot(H, H.T) + tmp) / n 206 | 207 | dW = np.dot(np.linalg.inv(S), U.T) - W 208 | Cii = C - np.dot(-2*U + np.dot(W.T, S), W) 209 | dP = np.diag(Cii) - P 210 | 211 | W += learnrateW * dW 212 | P += learnratePsi * dP 213 | 214 | P = np.maximum(P, 0.1) 215 | 216 | if cur_iter % 25 == 0: 217 | print "iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0) 218 | return W, H, P 219 | -------------------------------------------------------------------------------- /tests/tests.cpp: -------------------------------------------------------------------------------- 1 | #include "catch.hpp" 2 | #include "../cpu_operations.h" 3 | #include "../gpu_operations.h" 4 | #include 5 | 6 | #include 7 | float time_diff(struct timeval *t2, struct timeval *t1) { 8 | long int diff = (t2->tv_usec + 1000000 * t2->tv_sec) - (t1->tv_usec + 1000000 * t1->tv_sec); 9 | return diff / 1000000.0f; 10 | } 11 | 12 | 13 | 14 | using namespace std; 15 | 16 | TEST_CASE( "to_host_and_to_device", "[gpu]" ) { 17 | GPU_Operations op(6, 6, 6, 0, -1); 18 | float X_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; 19 | float* X_d = op.to_device(X_h, sizeof(X_h)); 20 | 21 | float* X2_h = (float*) malloc(sizeof(X_h)); 22 | op.copy_to_host(X_d, X2_h, sizeof(X_h)); 23 | for (size_t i = 0; i < sizeof(X_h)/sizeof(X_h[0]); ++i) { 24 | CHECK(X_h[i] == X2_h[i]); 25 | } 26 | free(X2_h); 27 | op.free(X_d); 28 | } 29 | 30 | 31 | template 32 | float* test_variance(OP& op, float* X, unsigned nrows, unsigned ncols, float* expected) { 33 | float* var = (float*) op.malloc(ncols*sizeof(X[0])); 34 | op.calculate_column_variance(X, nrows, ncols, var); 35 | float* res = (float*) malloc(ncols*sizeof(X[0])); 36 | op.copy_to_host(var, res, ncols*sizeof(var[0])); 37 | for (size_t i = 0; i < 3; ++i) { 38 | CHECK(res[i] == expected[i]); 39 | } 40 | free(res); 41 | return var; 42 | } 43 | 44 | 45 | TEST_CASE( "Calculate Variance", "[operations]" ) { 46 | GPU_Operations gpu_op(512, 512, 512, 0, -1); 47 | CPU_Operations cpu_op(512, 512, 512, 0, -1); 48 | float X_h[] = {1.0, 2.0, 3.0, 49 | 4.0, 6.0, 10.0}; 50 | float expected[] = {2.25, 4, 12.25}; 51 | float* res_h = test_variance(cpu_op, X_h, 2, 3, expected); 52 | cpu_op.free(res_h); 53 | float* X_d = gpu_op.to_device(X_h, sizeof(X_h)); 54 | float* res_d = test_variance(gpu_op, X_d, 2, 3, expected); 55 | gpu_op.free(res_d); 56 | gpu_op.free(X_d); 57 | } 58 | 59 | 60 | // the pointer-to-memberfunction thingy is pretty ugly :( 61 | template 62 | float* test_scale(OP& op, 63 | void (OP::*scalefunc)(float*, unsigned int, unsigned int, float*) const, 64 | float* X, float* s, unsigned nrows, unsigned ncols, float* expected) { 65 | float* scale = op.to_device(s, ncols*sizeof(X[0])); 66 | (op.*scalefunc)(X, nrows, ncols, scale); 67 | float* res = (float*) malloc(ncols*nrows*sizeof(X[0])); 68 | op.copy_to_host(X, res, ncols*nrows*sizeof(X[0])); 69 | for (size_t i = 0; i < nrows*ncols; ++i) { 70 | CHECK(expected[i] == res[i]); 71 | } 72 | free(res); 73 | return 0; 74 | } 75 | 76 | 77 | TEST_CASE( "Scale columns CPU", "[operations]" ) { 78 | CPU_Operations op(6, 6, 6, 0, -1); 79 | float X_h[] = {1.0, 2.0, 3.0, 80 | 4.0, 6.0, 10.0}; 81 | float s_h[] = {1.0, 2.0, 3.0}; 82 | float Exp_h[] = {1.0, 4.0, 9.0, 83 | 4.0, 12.0, 30.0}; 84 | test_scale(op, &CPU_Operations::scale_columns, X_h, s_h, 2, 3, Exp_h); 85 | } 86 | 87 | 88 | TEST_CASE( "Scale columns GPU", "[operations]" ) { 89 | GPU_Operations op(6, 6, 6, 0, -1); 90 | float X_h[] = {1.0, 2.0, 3.0, 91 | 
4.0, 6.0, 10.0}; 92 | float s_h[] = {1.0, 2.0, 3.0}; 93 | float Exp_h[] = {1.0, 4.0, 9.0, 94 | 4.0, 12.0, 30.0}; 95 | float* X_d = op.to_device(X_h, sizeof(X_h)); 96 | test_scale(op, &GPU_Operations::scale_columns, X_d, s_h, 2, 3, Exp_h); 97 | op.free(X_d); 98 | } 99 | 100 | 101 | TEST_CASE( "Scale rows CPU", "[operations]" ) { 102 | CPU_Operations op(6, 6, 6, 0, -1); 103 | float X_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 104 | 4.0, 6.0, 10.0, 1.0, 1.5}; 105 | float s_h[] = {2.0, 4.0}; 106 | float Exp_h[] = { 2.0, 4.0, 6.0, 8.0, 10.0, 107 | 16.0, 24.0, 40.0, 4.0, 6.0}; 108 | test_scale(op, &CPU_Operations::scale_rows, X_h, s_h, 2, 5, Exp_h); 109 | } 110 | 111 | 112 | TEST_CASE( "Scale rows GPU", "[operations]" ) { 113 | GPU_Operations op(6, 6, 6, 0, -1); 114 | float X_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 115 | 4.0, 6.0, 10.0, 1.0, 1.5}; 116 | float s_h[] = {2.0, 4.0}; 117 | float Exp_h[] = { 2.0, 4.0, 6.0, 8.0, 10.0, 118 | 16.0, 24.0, 40.0, 4.0, 6.0}; 119 | float* X_d = op.to_device(X_h, sizeof(X_h)); 120 | test_scale(op, &GPU_Operations::scale_rows, X_d, s_h, 2, 5, Exp_h); 121 | op.free(X_d); 122 | } 123 | 124 | 125 | TEST_CASE( "invsqrt cpu", "[operations]" ) { 126 | CPU_Operations op(6, 6, 6, 0, -1); 127 | float x_h[] = {0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 128 | float e_h[] = {1.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 129 | int n = sizeof(x_h) / sizeof(x_h[0]); 130 | for (int i = 1; i < n; ++i) 131 | e_h[i] = 1.0f / sqrt(x_h[i]); 132 | op.invsqrt(x_h, n); 133 | for (size_t i = 0; i < 3; ++i) { 134 | CHECK(abs(x_h[i] - e_h[i]) < 1e-3); 135 | } 136 | } 137 | 138 | 139 | TEST_CASE( "invsqrt gpu", "[operations]" ) { 140 | GPU_Operations op(6, 6, 6, 0, -1); 141 | float x_h[] = {0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 142 | float e_h[] = {1.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 143 | int n = sizeof(x_h) / sizeof(x_h[0]); 144 | for (int i = 1; i < n; ++i) 145 | e_h[i] = 1.0f / sqrt(x_h[i]); 146 | float* x_d = op.to_device(x_h, sizeof(x_h)); 147 | op.invsqrt(x_d, n); 148 | float* res = (float*) malloc(n*sizeof(x_h[0])); 149 | op.copy_to_host(x_d, res, n*sizeof(x_h[0])); 150 | for (size_t i = 0; i < 3; ++i) { 151 | CHECK(abs(res[i] - e_h[i]) < 1e-3); 152 | } 153 | op.free(x_d); 154 | } 155 | 156 | 157 | TEST_CASE( "filleye cpu", "[operations]" ) { 158 | unsigned n = 10; 159 | CPU_Operations op(n, n, n, 0, -1); 160 | float* x = op.malloc(n*n*sizeof(float)); 161 | op.fill_eye(x, 10); 162 | double s = 0.0; 163 | for (unsigned i = 0; i < n; ++i) { 164 | for (unsigned j = 0; j < n; ++j) { 165 | if (i == j) { 166 | CHECK(x[i*n+j] == 1.0); 167 | } else { 168 | s += abs(x[i*n+j]); 169 | } 170 | } 171 | } 172 | CHECK(s == 0.0); 173 | op.free(x); 174 | } 175 | 176 | 177 | TEST_CASE( "filleye gpu", "[operations]" ) { 178 | unsigned n = 10; 179 | CPU_Operations cpu_op(n, n, n, 0, -1); 180 | GPU_Operations op(n, n, n, 0, -1); 181 | float* x_d = op.malloc(n*n*sizeof(float)); 182 | op.fill_eye(x_d, 10); 183 | float *x = cpu_op.malloc(n*n*sizeof(float)); 184 | op.copy_to_host(x_d, x, n*n*sizeof(float)); 185 | double s = 0.0; 186 | for (unsigned i = 0; i < n; ++i) { 187 | for (unsigned j = 0; j < n; ++j) { 188 | if (i == j) { 189 | CHECK(x[i*n+j] == 1.0); 190 | } else { 191 | s += abs(x[i*n+j]); 192 | } 193 | } 194 | } 195 | CHECK(s == 0.0); 196 | op.free(x_d); 197 | } 198 | 199 | 200 | TEST_CASE( "Variance of CPU/GPU on large matrices", "[cpu_vs_gpu]" ) { 201 | unsigned n = 428; 202 | unsigned m = 554; 203 | CPU_Operations cpu_op(m, n, m, 0, -1); 204 | GPU_Operations gpu_op(m, n, m, 0, -1); 205 | 206 | float* X_h = 
cpu_op.malloc(n*m*sizeof(float)); 207 | for (unsigned i = 0; i < n*m; ++i) { 208 | X_h[i] = 10*((rand()+1.0)/(RAND_MAX+1.0)) - 5.0; 209 | } 210 | float *X_d = gpu_op.to_device(X_h, n*m*sizeof(float)); 211 | 212 | float* var_h = cpu_op.malloc(m*sizeof(float)); 213 | float* var_d = gpu_op.malloc(m*sizeof(float)); 214 | cpu_op.calculate_column_variance(X_h, n, m, var_h); 215 | gpu_op.calculate_column_variance(X_d, n, m, var_d); 216 | float* var_gpu_h = cpu_op.malloc(m*sizeof(float)); 217 | gpu_op.to_host(var_d, var_gpu_h, m*sizeof(float)); 218 | 219 | for (unsigned i = 0; i < m; ++i) 220 | CHECK(abs(var_h[i] - var_gpu_h[i]) < 1e-3); 221 | cpu_op.free(var_h); 222 | cpu_op.free(var_gpu_h); 223 | } 224 | 225 | 226 | 227 | TEST_CASE( "dgmm CPU/GPU", "[operations]" ) { 228 | unsigned n = 10; 229 | unsigned k = 10; 230 | unsigned m = 12; 231 | CPU_Operations cpu_op(m, n, k, 0, -1); 232 | GPU_Operations gpu_op(m, n, k, 0, -1); 233 | float* xh = cpu_op.malloc(m*k*sizeof(float)); 234 | float* ah = cpu_op.malloc(m*sizeof(float)); 235 | float* ch = cpu_op.malloc(m*k*sizeof(float)); 236 | for (int i = 0; i < m*n; ++i) 237 | xh[i] = 10* (rand() / RAND_MAX); 238 | for (int i = 0; i < n; ++i) 239 | ah[i] = 50* (rand() / RAND_MAX); 240 | cpu_op.dgmm("l", m, k, xh, m, ah, 1, ch, m); 241 | 242 | float* xd = gpu_op.to_device(xh, m*k*sizeof(float)); 243 | float* ad = gpu_op.to_device(ah, m*sizeof(float)); 244 | float* cd = gpu_op.to_device(ch, m*k*sizeof(float)); 245 | gpu_op.dgmm("l", m, k, xd, m, ad, 1, cd, m); 246 | 247 | float* dh = cpu_op.malloc(m*k*sizeof(float)); 248 | gpu_op.copy_to_host(cd, dh, m*k*sizeof(float)); 249 | for (unsigned i = 0; i < m*k; ++i) { 250 | CHECK(ch[i] == dh[i]); 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /cpu_operations.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include "cpu_operations.h" 8 | #include 9 | 10 | 11 | /* This is the interface for RFN's sparse matrix operations. 12 | * If you want to use the generic implementation, compile nist_spblas.cc, 13 | * If you want to use the MKL, compile mkl_sparse_impl.cpp and link to MKL. 
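 *
 * For orientation, a small CSR example (illustrative only): the 2x3 matrix
 * [[1, 0, 2], [0, 3, 0]] is stored as val = {1, 2, 3}, col = {0, 2, 1} and
 * rowptr = {0, 2, 3} (zero-based, as produced by scipy's CSR format), and is
 * wrapped via suscr_csr(2, 3, val, col, rowptr); rowptr[nrows] holds the
 * number of nonzero entries.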
*/ 14 | 15 | CPU_Operations::SparseMatrix create(int row, int col); /* empty */ 16 | CPU_Operations::SparseMatrix suscr_csr(int m, int n, float *val, int *col, int *ptr); /* from csr */ 17 | void destroy(CPU_Operations::SparseMatrix A); 18 | 19 | /* select row subset */ 20 | CPU_Operations::SparseMatrix srowsubset(CPU_Operations::SparseMatrix A, int first_row, int nrow); /* allocates new matrix */ 21 | 22 | /* column means and variances */ 23 | void scolmeans(CPU_Operations::SparseMatrix A, float *means); 24 | void scolvars(CPU_Operations::SparseMatrix A, float *vars); 25 | 26 | /* scale rows/cols */ 27 | void sscalecols(CPU_Operations::SparseMatrix A, float *s); 28 | void sscalerows(CPU_Operations::SparseMatrix A, float *s); 29 | 30 | /* set element (set to zero will delete entry) */ 31 | void ssetelement( CPU_Operations::SparseMatrix A, int row, int col, float val ); 32 | void ssetelement( CPU_Operations::SparseMatrix A, int idx, float val ); 33 | 34 | /* get element reference */ 35 | float &sgetelement( CPU_Operations::SparseMatrix A, int row, int col); 36 | float &sgetelement( CPU_Operations::SparseMatrix A, int idx ); 37 | 38 | /* get element pointer */ 39 | float *sgetelementp( CPU_Operations::SparseMatrix A, int row, int col ); 40 | float *sgetelementp( CPU_Operations::SparseMatrix A, int idx ); 41 | 42 | /* sgemm routines with sparse matrix being lhs (A) or rhs (B) of the product */ 43 | void susgemm(char sidea, char transa, char transb, int nohs, const float &alpha, CPU_Operations::SparseMatrix A, 44 | const float *B, int ldB, const float &beta, float *C, int ldC); 45 | 46 | /* checks whether A is a valid handle */ 47 | bool handle_valid(CPU_Operations::SparseMatrix A); 48 | 49 | /* debug */ 50 | namespace NIST_SPBLAS 51 | {void print(int A);} 52 | 53 | 54 | using std::max; 55 | 56 | //float* CPU_Operations::ones = 0; 57 | 58 | CPU_Operations::CPU_Operations(const int m, const int n, const int k, 59 | unsigned long seed, int gpu_id) { 60 | srand(seed); 61 | int maxsize = max(max(n, m), k); 62 | ones = malloc(maxsize*sizeof(float)); 63 | for (int i = 0; i < maxsize; ++i) 64 | ones[i] = 1.0f; 65 | 66 | var_tmp = malloc(maxsize*sizeof(float)); 67 | } 68 | 69 | 70 | CPU_Operations::~CPU_Operations() { 71 | free(ones); 72 | free(var_tmp); 73 | } 74 | 75 | CPU_Operations::SparseMatrix CPU_Operations::create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m){ 76 | return suscr_csr(n, m, (float*) Xvals, (int*) Xcols, (int*) Xrowptr); 77 | } 78 | 79 | CPU_Operations::SparseMatrix CPU_Operations::get_batch(SparseMatrix X, int ldx, int batch_num, int batch_size) { 80 | return srowsubset(X, batch_num * batch_size, batch_size); 81 | } 82 | 83 | void CPU_Operations::scale_rows(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const { 84 | sscalerows(X, s); 85 | } 86 | 87 | 88 | static void colmeans(const float* X, float* means, const int nrows, const int ncols) { 89 | memset(means, 0, ncols*sizeof(float)); 90 | for (int i = 0; i < nrows; ++i) { 91 | for (int j = 0; j < ncols; ++j) { 92 | means[j] += X[i*ncols+j]; 93 | } 94 | } 95 | for (int j = 0; j < ncols; ++j) 96 | means[j] /= nrows; 97 | } 98 | 99 | 100 | void CPU_Operations::dgmm(const char* mode, const int m, const int n, const float* A, 101 | int lda, const float* x, int incx, float* C, int ldc) const { 102 | if (mode[0] == 'l' || mode[0] == 'L') { 103 | for (int i = 0; i < n; ++i) { 104 | for (int j = 0; j < m; ++j) 105 | C[i*ldc+j] = A[i*lda+j] * x[j]; 106 | } 107 | } else { 108 | 
for (int i = 0; i < n; ++i) { 109 | for (int j = 0; j < m; ++j) 110 | C[i*ldc+j] = A[i*lda+j] * x[i]; 111 | } 112 | } 113 | } 114 | 115 | 116 | void CPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 117 | const SparseMatrix a, const int lda, const float *b, const int ldb, const float beta, float *c, 118 | const int ldc) const { 119 | /* The gemm interface is understood as a column-major routine. The sparse implementation, 120 | * however, is row-major, so we need to compute B^T * A^T = C^T instead of A * B = C. The 121 | * transposition is implicitly performed by A, B and C being column-major. */ 122 | susgemm('r', transa[0], transb[0], n, alpha, a, b, ldb, beta, c, ldc); 123 | } 124 | 125 | void CPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 126 | const float *a, const int lda, const SparseMatrix b, const int ldb, const float beta, float *c, 127 | const int ldc) const { 128 | susgemm('l', transb[0], transa[0], m, alpha, b, a, lda, beta, c, ldc); 129 | } 130 | 131 | CPU_Operations::SparseMatrix CPU_Operations::memcpy_matrix(SparseMatrix &dest, SparseMatrix src, int nrows_to_copy, int src_ncol, int first_row = 0) const { 132 | free(dest); 133 | return dest = srowsubset(src, first_row, nrows_to_copy); 134 | } 135 | 136 | void CPU_Operations::free(SparseMatrix a) const { 137 | if (handle_valid(a)) 138 | destroy(a); 139 | } 140 | 141 | CPU_Operations::SparseMatrix CPU_Operations::malloc_matrix(int rows, int cols, SparseMatrix dummy) { 142 | return create(rows, cols); 143 | } 144 | 145 | void CPU_Operations::calculate_column_variance(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* variances) { 146 | memset(variances, 0, ncols * sizeof(float)); 147 | scolvars(X, variances); 148 | } 149 | 150 | void CPU_Operations::scale_columns(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const { 151 | sscalecols(X, s); 152 | } 153 | 154 | void CPU_Operations::dropout(SparseMatrix X, const unsigned size, const float dropout_rate) const { 155 | assert(0.0f <= dropout_rate && dropout_rate <= 1.0f); 156 | for (unsigned i = 0; i < size; ++i) 157 | /* TODO: write a routine sgetlement that leaves X const */ 158 | if (rand_unif() < dropout_rate) { 159 | float *v = sgetelementp(X, i); 160 | 161 | if (v != NULL) 162 | *v = 0.f; 163 | } 164 | } 165 | 166 | void CPU_Operations::add_saltpepper_noise(SparseMatrix X, const unsigned size, const float noise_rate) const { 167 | assert(0.0f <= noise_rate && noise_rate <= 1.0f); 168 | for (unsigned i = 0; i < size; ++i) { 169 | if (rand_unif() < noise_rate) { 170 | float *v = sgetelementp(X, i); 171 | 172 | if (v != NULL) 173 | *v = (rand_unif() < 0.5 ? 
0.0f : 1.0f); 174 | } 175 | } 176 | } 177 | 178 | /* gauss noise makes no sense on sparse matrices */ 179 | void CPU_Operations::add_gauss_noise(SparseMatrix X, const unsigned size, const float noise_rate) const { 180 | assert(0.0 <= noise_rate); 181 | for (unsigned i = 0; i < size; ++i) { 182 | float *v = sgetelementp(X, i); 183 | 184 | if (v != NULL) 185 | *v += rand_normal() * noise_rate; 186 | } 187 | } 188 | 189 | 190 | void CPU_Operations::calculate_column_variance(const float* X, const unsigned nrows, 191 | const unsigned ncols, float* variances) { 192 | colmeans(X, var_tmp, nrows, ncols); 193 | memset(variances, 0, ncols*sizeof(float)); 194 | for (unsigned i = 0; i < nrows; ++i) { 195 | for (unsigned j = 0; j < ncols; ++j) { 196 | const float x = X[i*ncols+j] - var_tmp[j]; 197 | variances[j] += x*x; 198 | } 199 | } 200 | 201 | for (unsigned j = 0; j < ncols; ++j) { 202 | variances[j] /= nrows; 203 | } 204 | } 205 | 206 | 207 | void CPU_Operations::invsqrt(float* s, const unsigned n) const { 208 | for (unsigned j = 0; j < n; ++j) { 209 | if (s[j] == 0) 210 | s[j] = 1.0f; 211 | else 212 | s[j] = 1.0 / sqrtf(s[j]); 213 | } 214 | } 215 | 216 | void CPU_Operations::scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const { 217 | for (unsigned i = 0; i < nrows; ++i) { 218 | for (unsigned j = 0; j < ncols; ++j) { 219 | X[i*ncols+j] *= s[j]; 220 | } 221 | } 222 | } 223 | 224 | void CPU_Operations::scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const { 225 | for (unsigned i = 0; i < nrows; ++i) { 226 | for (unsigned j = 0; j < ncols; ++j) { 227 | X[i*ncols+j] *= s[i]; 228 | } 229 | } 230 | } 231 | 232 | 233 | /// Prints a column major matrix. 234 | void CPU_Operations::printMatrixCM(const float* a, int n, int m, const char* fmt) { 235 | const char* format = fmt == 0 ? "%1.3f " : fmt; 236 | for (int i = 0; i < n; ++i) { 237 | for (int j =0 ; j < m; ++j) 238 | printf(format, a[i + j*n]); 239 | printf("\n"); 240 | } 241 | printf("\n"); 242 | } 243 | 244 | 245 | /// Prints a row-major matrix 246 | void CPU_Operations::printMatrixRM(const float* a, int n, int m, const char* fmt) { 247 | const char* format = fmt == 0 ? "%1.3f " : fmt; 248 | for (int i = 0; i < n; ++i) { 249 | for (int j =0 ; j < m; ++j) 250 | printf(format, a[i*m + j]); 251 | printf("\n"); 252 | } 253 | } 254 | 255 | void CPU_Operations::printMatrixCM(const SparseMatrix a, int n, int m, const char *fmt) { 256 | NIST_SPBLAS::print(a); 257 | } 258 | 259 | void CPU_Operations::printMatrixRM(const SparseMatrix a, int n, int m, const char *fmt) { 260 | NIST_SPBLAS::print(a); 261 | } 262 | -------------------------------------------------------------------------------- /rfn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' 3 | Python wrapper for librfn. 
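
A minimal usage sketch (hyperparameter values are illustrative only; X is an
(n_samples, n_features) array):

    import numpy as np
    from rfn import train_rfn

    W, Psi, Wout = train_rfn(X, n_hidden=128, n_iter=500, etaW=0.1, etaP=0.1,
                             minP=1e-2, dropout_rate=0.0, gpu_id="cpu")
    H = np.maximum(0, np.dot(Wout, X.T))   # hidden activations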
4 | 5 | Copyright © 2015-2017 Thomas Unterthiner 6 | Additional Contributions by Thomas Adler, Balázs Bencze 7 | 8 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 9 | ''' 10 | 11 | import os 12 | import time 13 | import ctypes as ct 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | import warnings 17 | from scipy import sparse 18 | 19 | 20 | import sys 21 | if sys.version_info < (3,): 22 | range = xrange 23 | 24 | 25 | _curdir = os.path.dirname(os.path.realpath(__file__)) 26 | _librfn = ct.cdll.LoadLibrary(os.path.join(_curdir, 'librfn.so')) 27 | _default_gpu_id = -1 28 | _use_cpu_id = -2 29 | 30 | _librfn.calculate_W.argtypes = [ 31 | np.ctypeslib.ndpointer(np.float32), 32 | np.ctypeslib.ndpointer(np.float32), 33 | np.ctypeslib.ndpointer(np.float32), 34 | np.ctypeslib.ndpointer(np.float32), 35 | ct.c_int, ct.c_int, ct.c_int, 36 | ct.c_int, ct.c_int, ct.c_float, 37 | ct.c_int] 38 | 39 | 40 | _librfn.train_rfn.restype = ct.c_int 41 | _librfn.train_rfn.argtypes = [ 42 | np.ctypeslib.ndpointer(np.float32), 43 | np.ctypeslib.ndpointer(np.float32), 44 | np.ctypeslib.ndpointer(np.float32), 45 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 46 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, 47 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, ct.c_float, 48 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 49 | ct.c_int 50 | ] 51 | 52 | 53 | _librfn.calculate_W_sparse.argtypes = [ 54 | np.ctypeslib.ndpointer(np.float32), 55 | np.ctypeslib.ndpointer(np.int32), 56 | np.ctypeslib.ndpointer(np.int32), 57 | np.ctypeslib.ndpointer(np.float32), 58 | np.ctypeslib.ndpointer(np.float32), 59 | np.ctypeslib.ndpointer(np.float32), 60 | ct.c_int, ct.c_int, ct.c_int, 61 | ct.c_int, ct.c_int, ct.c_float, 62 | ct.c_int] 63 | 64 | 65 | _librfn.train_rfn_sparse.restype = ct.c_int 66 | _librfn.train_rfn_sparse.argtypes = [ 67 | np.ctypeslib.ndpointer(np.float32), 68 | np.ctypeslib.ndpointer(np.int32), 69 | np.ctypeslib.ndpointer(np.int32), 70 | np.ctypeslib.ndpointer(np.float32), 71 | np.ctypeslib.ndpointer(np.float32), 72 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 73 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, 74 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, ct.c_float, 75 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 76 | ct.c_int 77 | ] 78 | 79 | 80 | _input_noise_types = {"dropout": 1, "saltpepper": 2, "gaussian": 3} 81 | _activation_types = {"linear": 0, "relu": 1, "leaky": 2, "sigmoid": 3, "tanh": 4} 82 | 83 | def train_rfn(X, n_hidden, n_iter, etaW, etaP, minP, dropout_rate, 84 | input_noise_rate=0.0, startP=0.1, startW=None, 85 | l2_weightdecay=0.0, l1_weightdecay=0.0, 86 | input_noise_type="saltpepper", activation="relu", 87 | h_threshold=0.0, momentum=0.0, applyNewtonUpdate=True, 88 | batch_size=-1, seed=None, gpu_id="default"): 89 | '''Trains a Rectified Factor Network (RFN). 90 | 91 | Trains an RFN as explained in 92 | "Rectified Factor Networks", Clevert et al., NIPS 2015 93 | 94 | Parameters 95 | ---------- 96 | X : array-like, shape = (n_samples, n_features) 97 | Input samples 98 | 99 | n_hidden : int 100 | Number of latent variables to estimate 101 | 102 | n_iter : int 103 | Number of iterations to run the algorithm 104 | 105 | etaW : float 106 | Learning rate of the W parameter 107 | 108 | etaP : float 109 | Learning rate of the Psi parameter 110 | (It's probably save to set this to the same value as etaW) 111 | 112 | minP : float 113 | Minimal value for Psi. 
Should be in 1e-8 - 1e-1 114 | 115 | dropout_rate : float in [0, 1) 116 | Dropout rate for the latent variables 117 | 118 | input_noise_rate : float 119 | Noise/dropout rate for input variables 120 | 121 | startW : array-like, shape = (n_hidden, n_features) 122 | Optional pre-initialized weights parameters. Useful if one wants to 123 | continue training of an old result. 124 | 125 | l2_weightdecay : float 126 | L2 penalty for weight decay 127 | 128 | l2_weightdecay : float 129 | L1 penalty for weight decay 130 | 131 | input_noise_type : one of 'dropout', 'saltpepper' or 'gaussian' 132 | Type of input noise 133 | 134 | activation : one of ('linear', 'relu', 'leaky', 'sigmoid', 'tanh') 135 | Activation function for hidden/latent variables. 136 | 137 | h_threshold : float 138 | Threshhold for rectifying/leaky activations 139 | 140 | momentum : float 141 | Momentum term for learning 142 | 143 | applyNewtonUpdate : boolean 144 | Whether to use a Newton update (default) or a Gradient Descent step. 145 | 146 | batch_size : int 147 | If > 2, this will activate mini-batch learning instead of full 148 | batch learning. 149 | 150 | seed : int 151 | Seed for the random number generator 152 | 153 | gpu_id : int or "cpu" 154 | ID of the gpu device to use. If set to "cpu", the calculations will 155 | be performed on the CPU instead. 156 | 157 | 158 | Returns 159 | ------- 160 | A tuple of three elements: 161 | 162 | W : array-like, shape = (n_hidden, n_features) 163 | The weight matrix W used in the paper, used to transform the 164 | hidden/latent variables back to visibles. 165 | Psi : array-like, shape = (n_features, ) 166 | Variance of each input feature dimension (Psi in the paper's formulas) 167 | Wout : array-like, shape = (n_hidden, n_features) 168 | Weight matrix needed to transform the visible variables back into 169 | hidden variables. 
Normally this is done via 170 | `H = np.maximum(0, np.dot(Wout, X.T))` 171 | ''' 172 | 173 | if seed is None: 174 | # should be different for each call on each process 175 | seed = np.uint32(hash(os.getpid() + time.time()) % 4294967295) 176 | if gpu_id == "default": 177 | gpu_id = _default_gpu_id 178 | elif gpu_id == "cpu": 179 | gpu_id = _use_cpu_id 180 | 181 | rng = np.random.RandomState(seed) 182 | if startW is None: 183 | W = rng.normal(scale=0.01, size=(n_hidden, X.shape[1])).astype(np.float32) 184 | else: 185 | W = startW 186 | if isinstance(startP, np.ndarray): 187 | P = startP 188 | else: 189 | P = np.array([startP] * X.shape[1], dtype=np.float32) 190 | 191 | Wout = np.empty((W.shape[0], W.shape[1]), np.float32) 192 | 193 | if sparse.issparse(X): 194 | X = X.tocsr().astype(np.float32) 195 | _librfn.train_rfn_sparse(X.data, X.indices, X.indptr, W, P, X.shape[0], X.shape[1], n_hidden, n_iter, 196 | batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 197 | l2_weightdecay, l1_weightdecay, momentum, _input_noise_types[input_noise_type], 198 | _activation_types[activation], 1, applyNewtonUpdate, seed, gpu_id) 199 | _librfn.calculate_W_sparse(X.data, X.indices, X.indptr, W, P, Wout, 200 | X.shape[0], X.shape[1], W.shape[0], 201 | _activation_types[activation], 1, h_threshold, 202 | gpu_id) 203 | else: 204 | X = X.astype(np.float32, order="C") 205 | _librfn.train_rfn(X, W, P, X.shape[0], X.shape[1], n_hidden, n_iter, 206 | batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 207 | l2_weightdecay, l1_weightdecay, momentum, _input_noise_types[input_noise_type], 208 | _activation_types[activation], 1, applyNewtonUpdate, seed, gpu_id) 209 | _librfn.calculate_W(X, W, P, Wout, 210 | X.shape[0], X.shape[1], W.shape[0], 211 | _activation_types[activation], 1, h_threshold, 212 | gpu_id) 213 | 214 | return W, P, Wout 215 | 216 | 217 | from sklearn.base import BaseEstimator, TransformerMixin 218 | class RectifiedFactorNetwork(BaseEstimator, TransformerMixin): 219 | '''Implements a sklearn interface for RFN.''' 220 | def __init__(self, n_hidden=128, n_iter=50, etaW=0.1, etaP=0.1, minP=1e-2, dropout_rate=0.0, 221 | input_noise_rate=0.0, startP=0.1, startW=None, 222 | l2_weightdecay=0.0, l1_weightdecay=0.0, 223 | input_noise_type="saltpepper", activation="relu", 224 | h_threshold=0.0, momentum=0.0, applyNewtonUpdate=True, 225 | batch_size=-1, seed=None, gpu_id="default"): 226 | self.n_hidden = n_hidden 227 | self.n_iter = n_iter 228 | self.etaW = etaW 229 | self.etaP = etaP 230 | self.minP = minP 231 | self.dropout_rate = dropout_rate 232 | self.input_noise_rate = input_noise_rate 233 | self.startP = startP 234 | self.startW = startW 235 | self.l2_weightdecay = l2_weightdecay 236 | self.l1_weightdecay = l1_weightdecay 237 | self.input_noise_type = input_noise_type 238 | self.activation = activation 239 | self.h_threshold = h_threshold 240 | self.momentum = momentum 241 | self.applyNewtonUpdate = applyNewtonUpdate 242 | self.batch_size = batch_size 243 | self.seed = seed 244 | self.gpu_id = gpu_id 245 | 246 | def fit(self, x, y=None): 247 | res = train_rfn(x, self.n_hidden, self.n_iter, self.etaW, self.etaP, self.minP, self.dropout_rate, 248 | self.input_noise_rate, self.startP, self.startW, 249 | self.l2_weightdecay, self.l1_weightdecay, 250 | self.input_noise_type, self.activation, 251 | self.h_threshold, self.momentum, self.applyNewtonUpdate, 252 | self.batch_size, self.seed, self.gpu_id) 253 | self.w, self.psi, self.wout = res 254 | return self 255 | 256 | 
def transform(self, x): 257 | h = np.maximum(np.dot(x, self.wout.T), 0) 258 | s = h.std(1) 259 | s[s < 1e-6] = 1 260 | h /= s[:, None] ## TODO: should I really scale the h? 261 | return h 262 | 263 | def inverse_transform(self, h): 264 | r = np.dot(h, self.w) 265 | return r 266 | -------------------------------------------------------------------------------- /cpu_operations.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #ifndef M_PI 14 | #define M_PI 3.14159265358979323846 15 | #endif 16 | 17 | #include /* for typeid */ 18 | 19 | 20 | extern "C" { 21 | extern void sgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const float *alpha, 22 | const float *a, const int *lda, const float *b, const int *ldb, const float *beta, float *c, const int *ldc); 23 | 24 | extern void ssymm_(const char *side, const char *uplo, const int *m, const int *n, const float *alpha, const float *a, 25 | const int *lda, const float *b, const int *ldb, const float *beta, float *c, const int *ldc); 26 | 27 | extern void saxpy_(const int *n, const float *alpha, const float *dx, const int *incx, float *dy, const int *incy); 28 | extern int spotrf_(const char *uplo, int *n, float *a, int * lda, int *info); 29 | extern int spotrs_(const char *uplo, int *n, int *nrhs, float * a, int *lda, float *b, int *ldb, int *info); 30 | extern int sposv_(const char *uplo, int *n, int *nrhs, float * a, int *lda, float *b, int *ldb, int *info); 31 | extern int spotri_(const char *uplo, int *n, float *a, int *lda, int *info); 32 | } 33 | 34 | using std::cos; 35 | using std::log; 36 | using std::sqrt; 37 | 38 | using std::rand; 39 | using std::srand; 40 | 41 | // random in (0, 1] 42 | inline double rand_unif(void) { 43 | return (rand() + 1.0) / (RAND_MAX + 1.0); 44 | } 45 | 46 | // generates random samples from a 0/1 Gaussian via Box-Mueller 47 | inline double rand_normal(void) { 48 | return sqrt(-2.0 * log(rand_unif())) * cos(2.0 * M_PI * rand_unif()); 49 | } 50 | 51 | 52 | inline double rand_exp(double lambda) /* inversion sampling */ 53 | { 54 | return -log(1 - rand_unif()) / lambda; 55 | } 56 | 57 | class CPU_Operations { 58 | float* var_tmp; 59 | 60 | public: 61 | 62 | float* ones; 63 | 64 | typedef int SparseMatrix; 65 | 66 | static SparseMatrix create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m); 67 | 68 | 69 | template 70 | T init_invalid(void) { 71 | return (typeid(T) == typeid(SparseMatrix) ? 
(T) -1 : (T) 0); 72 | } 73 | 74 | CPU_Operations(const int m, const int n, const int k, unsigned long seed, int gpu_id); 75 | ~CPU_Operations(); 76 | 77 | float* to_device(const float* src, const int size) const { 78 | return (float*) src; 79 | } 80 | 81 | SparseMatrix to_device(SparseMatrix src, const int size) const { 82 | return src; 83 | } 84 | 85 | float* to_host(const float* src, float* dest, const int size) const { 86 | return dest; 87 | } 88 | 89 | float* copy_to_host(const float* src, float* dst, size_t size) const { 90 | memcpy(dst, src, size); 91 | return dst; 92 | } 93 | 94 | float* get_batch(const float* X, int ncol, int batch_num, int batch_size) { 95 | /* return pointer */ 96 | return (float*) &X[batch_num * batch_size * ncol]; 97 | } 98 | 99 | SparseMatrix get_batch(SparseMatrix X, int ldx, int batch_num, int batch_size); 100 | 101 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 102 | const float *a, const int lda, const float *b, const int ldb, const float beta, float *c, 103 | const int ldc) const { 104 | sgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 105 | } 106 | 107 | 108 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 109 | const SparseMatrix a, const int lda, const float *b, const int ldb, const float beta, float *c, 110 | const int ldc) const; 111 | 112 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 113 | const float *a, const int lda, const SparseMatrix b, const int ldb, const float beta, float *c, 114 | const int ldc) const; 115 | 116 | void dgmm(const char* mode, const int m, const int n, const float* A, int lda, const float* x, int incx, float* C, 117 | int ldc) const; 118 | 119 | void symm(const char *side, const char *uplo, const int m, const int n, const float alpha, const float *a, 120 | const int lda, const float *b, const int ldb, const float beta, float *c, const int ldc) const { 121 | ssymm_(side, uplo, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 122 | } 123 | 124 | void axpy(const int n, const float alpha, const float *dx, const int incx, float *dy, const int incy) const { 125 | saxpy_(&n, &alpha, dx, &incx, dy, &incy); 126 | } 127 | 128 | int posv(const char* uplo, int n, int nrhs, float* a, int lda, float* b, int ldb) const { 129 | int info; 130 | int retval = sposv_(uplo, &n, &nrhs, a, &lda, b, &ldb, &info); 131 | 132 | if (info != 0) 133 | printf("info: %d\n", info); 134 | 135 | assert(!info); 136 | 137 | return retval; 138 | } 139 | 140 | int potrf(const char *uplo, int n, float* a, int lda) const { 141 | int info; 142 | int retval = spotrf_(uplo, &n, a, &lda, &info); 143 | assert(!info); 144 | return retval; 145 | } 146 | 147 | int potrs(const char *uplo, int n, int nrhs, float* a, int lda, float *b, int ldb, int *info) const { 148 | return spotrs_(uplo, &n, &nrhs, a, &lda, b, &ldb, info); 149 | } 150 | 151 | int potri(const char *uplo, int n, float *a, int lda) const { 152 | int info; 153 | int retval = spotri_(uplo, &n, a, &lda, &info); 154 | assert(!info); 155 | return retval; 156 | } 157 | 158 | void* memset(void* dest, int ch, size_t count) const { 159 | return std::memset(dest, ch, count); 160 | } 161 | 162 | float* memcpy(void* dest, const void *src, size_t count) const { 163 | return (float*) std::memcpy(dest, src, count); 164 | } 165 | 166 | float *memcpy_matrix(float *dest, float *src, int nrows_to_copy, int 
src_ncol, int first_row = 0) const { 167 | return memcpy(dest, &src[first_row * src_ncol], nrows_to_copy * src_ncol * sizeof(float)); 168 | } 169 | 170 | SparseMatrix memcpy_matrix(SparseMatrix &dest, SparseMatrix src, int nrows_to_copy, int src_ncol, int first_row) const; 171 | 172 | void free(void* ptr) const { 173 | if (ptr != 0) 174 | std::free(ptr); 175 | } 176 | 177 | void free(SparseMatrix a) const; 178 | 179 | void free_sparse(void *ptr) { 180 | } 181 | 182 | void free_sparse(SparseMatrix a) { 183 | free(a); 184 | } 185 | 186 | void free_devicememory(void* ptr) const { 187 | ; 188 | } 189 | 190 | void free_devicememory(SparseMatrix X) const { 191 | } 192 | 193 | template 194 | T malloc_matrix(int rows, int cols) { 195 | return malloc_matrix(rows, cols, init_invalid()); 196 | } 197 | 198 | SparseMatrix malloc_matrix(int rows, int cols, SparseMatrix dummy); 199 | 200 | float *malloc_matrix(int rows, int cols, float *dummy) { 201 | return malloc(rows * cols * sizeof(float)); 202 | } 203 | 204 | float* malloc(size_t size) const { 205 | return (float*) std::malloc(size); 206 | } 207 | 208 | void maximum(float* x, const float value, const int size) const { 209 | for (int i = 0; i < size; ++i) 210 | x[i] = fmaxf(x[i], value); 211 | } 212 | 213 | void leaky_relu(float* x, const float value, const int size) { 214 | for (int i = 0; i < size; ++i) 215 | x[i] = (x[i] < 0.0f) ? x[i] * value : x[i]; 216 | } 217 | 218 | void sigmoid(float* x, const int size) const { 219 | for (int i = 0; i < size; ++i) { 220 | x[i] = 1 / (1 + expf(-x[i])); 221 | } 222 | } 223 | 224 | void tanh(float* x, const int size) const { 225 | for (int i = 0; i < size; ++i) { 226 | x[i] = tanhf(x[i]); 227 | } 228 | } 229 | 230 | void fill_eye(float* a, int n) const { 231 | memset(a, 0, n * n * sizeof(float)); 232 | for (int i = 0; i < n; ++i) 233 | a[i * n + i] = 1.0f; 234 | } 235 | 236 | void fill(float* X, const int size, const float value) const { 237 | for (int i = 0; i < size; ++i) { 238 | X[i] = value; 239 | } 240 | } 241 | 242 | void calculate_column_variance(const float* X, const unsigned nrows, const unsigned ncols, float* variances); 243 | void calculate_column_variance(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* variances); 244 | 245 | void invsqrt(float* s, const unsigned n) const; 246 | 247 | void scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 248 | void scale_columns(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const; 249 | 250 | void scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 251 | void scale_rows(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const; 252 | 253 | void dropout(float* X, const unsigned size, const float dropout_rate) const { 254 | assert(0.0f <= dropout_rate && dropout_rate <= 1.0f); 255 | for (unsigned i = 0; i < size; ++i) 256 | X[i] = rand_unif() < dropout_rate ? 0.0f : X[i]; 257 | } 258 | 259 | void dropout(SparseMatrix X, const unsigned size, const float dropout_rate) const; 260 | 261 | void add_saltpepper_noise(float* X, const unsigned size, const float noise_rate) const { 262 | assert(0.0f <= noise_rate && noise_rate <= 1.0f); 263 | for (unsigned i = 0; i < size; ++i) { 264 | if (rand_unif() < noise_rate) { 265 | X[i] = rand_unif() < 0.5 ? 
0.0f : 1.0f; 266 | } 267 | } 268 | } 269 | 270 | void add_saltpepper_noise(SparseMatrix X, const unsigned size, const float noise_rate) const; 271 | 272 | void add_gauss_noise(float* X, const unsigned size, const float noise_rate) const { 273 | assert(0.0 <= noise_rate); 274 | for (unsigned i = 0; i < size; ++i) 275 | X[i] += rand_normal() * noise_rate; 276 | } 277 | 278 | /* gauss noise makes no sense on sparse matrices */ 279 | void add_gauss_noise(SparseMatrix X, const unsigned size, const float noise_rate) const; 280 | 281 | void invert(float* X, const unsigned size) const { 282 | for (unsigned i = 0; i < size; ++i) 283 | X[i] = 1.0f / X[i]; 284 | } 285 | 286 | void soft_threshold(float* x, const float alpha, const int size) const { 287 | float f; 288 | for (int i = 0; i < size; ++i) { 289 | f = x[i]; 290 | x[i] = f > 0 ? fmaxf(0., f - alpha) : fminf(0., f + alpha); 291 | } 292 | } 293 | 294 | // Useful for debugging 295 | static void printMatrixCM(const float* a, int n, int m, const char* fmt); 296 | static void printMatrixCM(const SparseMatrix a, int n, int m, const char *fmt); 297 | 298 | static void printMatrixRM(const float* a, int n, int m, const char* fmt); 299 | static void printMatrixRM(const SparseMatrix a, int n, int m, const char *fmt); 300 | 301 | void prints(const float* f, unsigned l) const {} 302 | 303 | void printsu(const int* f, unsigned l) const {} 304 | 305 | void printm(const char* name, const SparseMatrix a, int n, int m) const { 306 | printf("%s\n", name); 307 | printMatrixCM(a, n, m, 0); 308 | } 309 | 310 | void printm(const char* name, const float* a, int n, int m) const { 311 | printf("%s\n", name); 312 | printMatrixCM(a, n, m, 0); 313 | } 314 | }; 315 | -------------------------------------------------------------------------------- /blas_sparse_proto.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAS_SPARSE_PROTO_H 2 | #define BLAS_SPARSE_PROTO_H 3 | 4 | typedef int blas_sparse_matrix; 5 | 6 | 7 | /* Level 1 Computational Routines */ 8 | 9 | void BLAS_susdot( enum blas_conj_type conj, int nz, const float *x, 10 | const int *indx, const float *y, int incy, float *r, 11 | enum blas_base_type index_base ); 12 | void BLAS_dusdot( enum blas_conj_type conj, int nz, const double *x, 13 | const int *indx, const double *y, int incy, double *r, 14 | enum blas_base_type index_base ); 15 | void BLAS_cusdot( enum blas_conj_type conj, int nz, const void *x, 16 | const int *indx, const void *y, int incy, void *r, 17 | enum blas_base_type index_base ); 18 | void BLAS_zusdot( enum blas_conj_type conj, int nz, const void *x, 19 | const int *indx, const void *y, int incy, void *r, 20 | enum blas_base_type index_base ); 21 | 22 | void BLAS_susaxpy( int nz, float alpha, const float *x, const int *indx, 23 | float *y, int incy, enum blas_base_type index_base ); 24 | void BLAS_dusaxpy( int nz, double alpha, const double *x, const int *indx, 25 | double *y, int incy, enum blas_base_type index_base ); 26 | void BLAS_cusaxpy( int nz, const void *alpha, const void *x, const int *indx, 27 | void *y, int incy, enum blas_base_type index_base ); 28 | void BLAS_zusaxpy( int nz, const void *alpha, const void *x, const int *indx, 29 | void *y, int incy, enum blas_base_type index_base ); 30 | 31 | void BLAS_susga( int nz, const float *y, int incy, float *x, const int *indx, 32 | enum blas_base_type index_base ); 33 | void BLAS_dusga( int nz, const double *y, int incy, double *x, const int *indx, 34 | enum blas_base_type index_base ); 35 | void 
BLAS_cusga( int nz, const void *y, int incy, void *x, const int *indx, 36 | enum blas_base_type index_base ); 37 | void BLAS_zusga( int nz, const void *y, int incy, void *x, const int *indx, 38 | enum blas_base_type index_base ); 39 | 40 | void BLAS_susgz( int nz, float *y, int incy, float *x, const int *indx, 41 | enum blas_base_type index_base ); 42 | void BLAS_dusgz( int nz, double *y, int incy, double *x, const int *indx, 43 | enum blas_base_type index_base ); 44 | void BLAS_cusgz( int nz, void *y, int incy, void *x, const int *indx, 45 | enum blas_base_type index_base ); 46 | void BLAS_zusgz( int nz, void *y, int incy, void *x, const int *indx, 47 | enum blas_base_type index_base ); 48 | 49 | void BLAS_sussc( int nz, const float *x, float *y, int incy, const int *indx, 50 | enum blas_base_type index_base ); 51 | void BLAS_dussc( int nz, const double *x, double *y, int incy, const int *indx, 52 | enum blas_base_type index_base ); 53 | void BLAS_cussc( int nz, const void *x, void *y, int incy, const int *indx, 54 | enum blas_base_type index_base ); 55 | void BLAS_zussc( int nz, const void *x, void *y, int incy, const int *indx, 56 | enum blas_base_type index_base ); 57 | 58 | /* Level 2 Computational Routines */ 59 | 60 | int BLAS_susmv( enum blas_trans_type transa, float alpha, 61 | blas_sparse_matrix A, const float *x, int incx, float *y, int incy ); 62 | int BLAS_dusmv( enum blas_trans_type transa, double alpha, 63 | blas_sparse_matrix A, const double *x, int incx, double *y, int incy ); 64 | int BLAS_cusmv( enum blas_trans_type transa, const void *alpha, 65 | blas_sparse_matrix A, const void *x, int incx, void *y, int incy ); 66 | int BLAS_zusmv( enum blas_trans_type transa, const void *alpha, 67 | blas_sparse_matrix A, const void *x, int incx, void *y, int incy ); 68 | 69 | int BLAS_sussv( enum blas_trans_type transt, float alpha, 70 | blas_sparse_matrix T, float *x, int incx ); 71 | int BLAS_dussv( enum blas_trans_type transt, double alpha, 72 | blas_sparse_matrix T, double *x, int incx ); 73 | int BLAS_cussv( enum blas_trans_type transt, const void *alpha, 74 | blas_sparse_matrix T, void *x, int incx ); 75 | int BLAS_zussv( enum blas_trans_type transt, const void *alpha, 76 | blas_sparse_matrix T, void *x, int incx ); 77 | 78 | /* Level 3 Computational Routines */ 79 | 80 | int BLAS_susmm( enum blas_order_type order, enum blas_trans_type transa, 81 | int nrhs, float alpha, blas_sparse_matrix A, const float *b, int ldb, 82 | float *c, int ldc ); 83 | int BLAS_dusmm( enum blas_order_type order, enum blas_trans_type transa, 84 | int nrhs, double alpha, blas_sparse_matrix A, const double *b, 85 | int ldb, double *c, int ldc ); 86 | int BLAS_cusmm( enum blas_order_type order, enum blas_trans_type transa, 87 | int nrhs, const void *alpha, blas_sparse_matrix A, const void *b, 88 | int ldb, void *c, int ldc ); 89 | int BLAS_zusmm( enum blas_order_type order, enum blas_trans_type transa, 90 | int nrhs, const void *alpha, blas_sparse_matrix A, const void *b, 91 | int ldb, void *c, int ldc ); 92 | 93 | int BLAS_sussm( enum blas_order_type order, enum blas_trans_type transt, 94 | int nrhs, float alpha, int t, float *b, int ldb ); 95 | int BLAS_dussm( enum blas_order_type order, enum blas_trans_type transt, 96 | int nrhs, double alpha, int t, double *b, int ldb ); 97 | int BLAS_cussm( enum blas_order_type order, enum blas_trans_type transt, 98 | int nrhs, const void *alpha, int t, void *b, int ldb ); 99 | int BLAS_zussm( enum blas_order_type order, enum blas_trans_type transt, 100 | int nrhs, 
const void *alpha, int t, void *b, int ldb ); 101 | 102 | /* Handle Management Routines */ 103 | 104 | /* Creation Routines */ 105 | 106 | blas_sparse_matrix BLAS_suscr_begin( int m, int n ); 107 | blas_sparse_matrix BLAS_duscr_begin( int m, int n ); 108 | blas_sparse_matrix BLAS_cuscr_begin( int m, int n ); 109 | blas_sparse_matrix BLAS_zuscr_begin( int m, int n ); 110 | 111 | 112 | blas_sparse_matrix BLAS_suscr_block_begin( int Mb, int Nb, int k, int l ); 113 | blas_sparse_matrix BLAS_duscr_block_begin( int Mb, int Nb, int k, int l ); 114 | blas_sparse_matrix BLAS_cuscr_block_begin( int Mb, int Nb, int k, int l ); 115 | blas_sparse_matrix BLAS_zuscr_block_begin( int Mb, int Nb, int k, int l ); 116 | 117 | blas_sparse_matrix BLAS_suscr_variable_block_begin( int Mb, int Nb, 118 | const int *k, const int *l ); 119 | blas_sparse_matrix BLAS_duscr_variable_block_begin( int Mb, int Nb, 120 | const int *k, const int *l ); 121 | blas_sparse_matrix BLAS_cuscr_variable_block_begin( int Mb, int Nb, 122 | const int *k, const int *l ); 123 | blas_sparse_matrix BLAS_zuscr_variable_block_begin( int Mb, int Nb, 124 | const int *k, const int *l ); 125 | 126 | 127 | /* Insertion Routines */ 128 | 129 | int BLAS_suscr_insert_entry( blas_sparse_matrix A, float val, int i, int j ); 130 | int BLAS_duscr_insert_entry( blas_sparse_matrix A, double val, int i, int j ); 131 | int BLAS_cuscr_insert_entry( blas_sparse_matrix A, const void *val, int i, int j ); 132 | int BLAS_zuscr_insert_entry( blas_sparse_matrix A, const void *val, int i, int j ); 133 | 134 | int BLAS_suscr_insert_entries( blas_sparse_matrix A, int nz, const float *val, 135 | const int *indx, const int *jndx ); 136 | int BLAS_duscr_insert_entries( blas_sparse_matrix A, int nz, const double *val, 137 | const int *indx, const int *jndx ); 138 | int BLAS_cuscr_insert_entries( blas_sparse_matrix A, int nz, const void *val, 139 | const int *indx, const int *jndx ); 140 | int BLAS_zuscr_insert_entries( blas_sparse_matrix A, int nz, const void *val, 141 | const int *indx, const int *jndx ); 142 | 143 | int BLAS_suscr_insert_col( blas_sparse_matrix A, int j, int nz, 144 | const float *val, const int *indx ); 145 | int BLAS_duscr_insert_col( blas_sparse_matrix A, int j, int nz, 146 | const double *val, const int *indx ); 147 | int BLAS_cuscr_insert_col( blas_sparse_matrix A, int j, int nz, 148 | const void *val, const int *indx ); 149 | int BLAS_zuscr_insert_col( blas_sparse_matrix A, int j, int nz, 150 | const void *val, const int *indx ); 151 | 152 | int BLAS_suscr_insert_row( blas_sparse_matrix A, int i, int nz, 153 | const float *val, const int *indx ); 154 | int BLAS_duscr_insert_row( blas_sparse_matrix A, int i, int nz, 155 | const double *val, const int *indx ); 156 | int BLAS_cuscr_insert_row( blas_sparse_matrix A, int i, int nz, 157 | const void *val, const int *indx ); 158 | int BLAS_zuscr_insert_row( blas_sparse_matrix A, int i, int nz, 159 | const void *val, const int *indx ); 160 | 161 | int BLAS_suscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 162 | const float *val, const int row_stride, 163 | const int col_stride, const int *indx, 164 | const int *jndx ); 165 | int BLAS_duscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 166 | const double *val, const int row_stride, 167 | const int col_stride, const int *indx, 168 | const int *jndx ); 169 | int BLAS_cuscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 170 | const void *val, const int row_stride, 171 | const int col_stride, const int 
*indx, 172 | const int *jndx ); 173 | int BLAS_zuscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 174 | const void *val, const int row_stride, 175 | const int col_stride, const int *indx, 176 | const int *jndx ); 177 | 178 | int BLAS_suscr_insert_block( blas_sparse_matrix A, const float *val, 179 | int row_stride, int col_stride, int i, int j ); 180 | int BLAS_duscr_insert_block( blas_sparse_matrix A, const double *val, 181 | int row_stride, int col_stride, int i, int j ); 182 | int BLAS_cuscr_insert_block( blas_sparse_matrix A, const void *val, 183 | int row_stride, int col_stride, int i, int j ); 184 | int BLAS_zuscr_insert_block( blas_sparse_matrix A, const void *val, 185 | int row_stride, int col_stride, int i, int j ); 186 | 187 | /* Completion of Construction Routines */ 188 | 189 | int BLAS_suscr_end( blas_sparse_matrix A ); 190 | int BLAS_duscr_end( blas_sparse_matrix A ); 191 | int BLAS_cuscr_end( blas_sparse_matrix A ); 192 | int BLAS_zuscr_end( blas_sparse_matrix A ); 193 | 194 | /* Matrix Property Routines */ 195 | 196 | int BLAS_usgp( blas_sparse_matrix A, int pname ); 197 | 198 | int BLAS_ussp( blas_sparse_matrix A, int pname ); 199 | 200 | /* Destruction Routine */ 201 | 202 | int BLAS_usds( blas_sparse_matrix A ); 203 | 204 | /* custom */ 205 | blas_sparse_matrix create(int row, int col); 206 | void destroy(blas_sparse_matrix A); 207 | 208 | /* column means and variances */ 209 | void scolmeans(blas_sparse_matrix A, float *means); 210 | void scolvars(blas_sparse_matrix A, float *vars); 211 | void srowmeans(blas_sparse_matrix A, float *means); 212 | void srowvars(blas_sparse_matrix A, float *vars); 213 | 214 | /* scale rows/cols */ 215 | void sscalecols(blas_sparse_matrix A, float *s); 216 | void sscalerows(blas_sparse_matrix A, float *s); 217 | 218 | /* select row subset */ 219 | blas_sparse_matrix srowsubset(blas_sparse_matrix A, int first_row, int nrow); 220 | 221 | /* construct from CSR data */ 222 | blas_sparse_matrix suscr_csr(int m, int n, float *x, int *row, int *col); 223 | 224 | /* set element (set to zero will delete entry) */ 225 | void ssetelement( blas_sparse_matrix A, int row, int col, float val ); 226 | void ssetelement( blas_sparse_matrix A, int idx, float val ); 227 | 228 | /* get element reference */ 229 | float &sgetelement( blas_sparse_matrix A, int row, int col); 230 | float &sgetelement( blas_sparse_matrix A, int idx ); 231 | 232 | /* sgemm routines with sparse matrix being lhs (A) or rhs (B) of the product */ 233 | void susgemm(enum blas_side_type sidea, enum blas_trans_type transa, enum blas_trans_type transb, 234 | int nohs, const float &alpha, blas_sparse_matrix A, const float *B, int ldB, const float &beta, float *C, int ldC); 235 | void susgemma(enum blas_order_type order, enum blas_trans_type transa, enum blas_trans_type transb, 236 | int nrhs, const float &alpha, blas_sparse_matrix A, const float *B, int ldB, const float &beta, 237 | float *C, int ldC); 238 | void susgemmb(enum blas_order_type order, enum blas_trans_type transa, enum blas_trans_type transb, 239 | int nlhs, const float &alpha, const float *A, int ldA, blas_sparse_matrix B, const float &beta, 240 | float *C, int ldC); 241 | 242 | /* checks whether A is a valid handle */ 243 | bool handle_valid(blas_sparse_matrix A); 244 | 245 | /* debug */ 246 | namespace NIST_SPBLAS 247 | {void print(int A);} 248 | 249 | #include 250 | 251 | #define dbg_printf(...) 
do { \ 252 | printf(__VA_ARGS__); \ 253 | fflush(stdout); \ 254 | } while (0) 255 | 256 | #endif 257 | /* BLAS_SPARSE_PROTO_H */ 258 | -------------------------------------------------------------------------------- /librfn.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include "librfn.h" 8 | #include 9 | #include 10 | #include "cpu_operations.h" 11 | 12 | #ifndef NOGPU 13 | #include "gpu_operations.h" 14 | #endif 15 | 16 | float time_diff(struct timeval *t2, struct timeval *t1) { 17 | long int diff = (t2->tv_usec + 1000000 * t2->tv_sec) - (t1->tv_usec + 1000000 * t1->tv_sec); 18 | return diff / 1000000.0f; 19 | } 20 | 21 | template 22 | int calculate_W_impl_invertMxM(OP& op, const float* W, const float* P, float* Wout, 23 | const int k, const int m, 24 | float* WWPchol, float* WWPinv) { 25 | op.gemm("n","t", m, m, k, 1.0f, W, m, W, m, 0.0f, WWPchol, m); 26 | op.axpy(m, 1.0f, P, 1, WWPchol, m+1); 27 | op.fill_eye(WWPinv, m); 28 | op.posv("u", m, m, WWPchol, m, WWPinv, m); 29 | op.gemm("t", "n", m, k, m, 1.0f, WWPinv, m, W, m, 0.0f, Wout, m); 30 | return 0; 31 | } 32 | 33 | 34 | //better option if m > k ( = W is tall), involves k*k inverse 35 | template 36 | int calculate_W_impl_invertKxK(OP& op, const float* W, const float* Pinv, float* Wout, 37 | const int k, const int m, 38 | float* Wtmp, float* WPWchol, float* WPWinv) { 39 | op.dgmm("l", m, k, W, m, Pinv, 1, Wtmp, m); 40 | op.gemm("t", "n", k, k, m, 1.0f, W, m, Wtmp, m, 0.0f, WPWchol, k); 41 | op.axpy(k, 1.0f, op.ones, 1, WPWchol, k+1); 42 | op.fill_eye(WPWinv, k); 43 | op.posv("u", k, k, WPWchol, k, WPWinv, k); 44 | op.gemm("n", "t", m, k, k, 1.0f, Wtmp, m, WPWinv, k, 0.0f, Wout, m); 45 | return 0; 46 | } 47 | 48 | 49 | // if isMoreHiddensThanFeatures is true, we will calculate the m*m inverse, otherwise the k*k one 50 | template 51 | int train(XTypeConst X_host, float* W_host, float* P_host, const int n, const int m, 52 | const int k, const int n_iter, int batch_size, const float etaW, const float etaP, 53 | const float minP, const float h_threshold, 54 | const float dropout_rate, const float input_noise_rate, 55 | const float l2_weightdecay, const float l1_weightdecay, const float momentum, 56 | const int input_noise_type, const int activation_type, const int apply_scaling, 57 | const int applyNewtonUpdate, unsigned long seed, int gpu_id) { 58 | if (batch_size == 1) { 59 | printf ("batch_size == 1 not supported, switching to full batch mode"); 60 | } 61 | 62 | OP op(n, m, k, seed, gpu_id); 63 | XType X = op.to_device(X_host, m*n*sizeof(float)); 64 | float* W = op.to_device(W_host, k*m*sizeof(float)); 65 | float* P = op.to_device(P_host, m*sizeof(float)); 66 | if (batch_size < 2) // no mini-batches, one batch=full dataset 67 | batch_size = n; 68 | int n_batches = n / batch_size; 69 | float* XCov_diag = op.malloc(m*sizeof(float)); 70 | 71 | float* H = op.malloc(k*batch_size*sizeof(float)); 72 | float* Wout = op.malloc(k*m*sizeof(float)); 73 | float* variance_H = op.malloc(k*sizeof(float)); 74 | float* S = op.malloc(k*k*sizeof(float)); 75 | float* Schol = op.malloc(k*k*sizeof(float)); 76 | float* U = op.malloc(m*k*sizeof(float)); 77 | float* Sinv = op.malloc(k*k*sizeof(float)); 78 | float* dW = op.malloc(m*k*sizeof(float)); 79 | float* C = op.malloc(m*m*sizeof(float)); 80 | 81 | XType Xtmp = op.template 
init_invalid(); 82 | if (input_noise_rate > 0.0f) 83 | { 84 | Xtmp = op.template malloc_matrix(batch_size, m); 85 | } 86 | 87 | // which matrices of the following we use depends on which inverse we use 88 | float* WWPchol = 0; 89 | float* WWPinv = 0; 90 | float* WPWchol = 0; 91 | float* WPWinv = 0; 92 | float* Wtmp = 0; 93 | if (isMoreHiddensThanFeatures) { 94 | WWPchol = op.malloc(m*m*sizeof(float)); 95 | WWPinv = op.malloc(m*m*sizeof(float)); 96 | } else { 97 | WPWchol = op.malloc(k*k*sizeof(float)); 98 | WPWinv = op.malloc(k*k*sizeof(float)); 99 | Wtmp = op.malloc(m*k*sizeof(float)); 100 | } 101 | float* dP = op.malloc(m*sizeof(float)); 102 | 103 | if (!dP) { // We've run out of memory somewhere 104 | op.free(dP); 105 | op.free(C); 106 | op.free(dW); 107 | op.free(Sinv); 108 | op.free(U); 109 | op.free(Schol); 110 | op.free(S); 111 | op.free(variance_H); 112 | op.free(Wout); 113 | op.free(H); 114 | op.free(WWPinv); 115 | op.free(WWPchol); 116 | op.free(WPWchol); 117 | op.free(WPWinv); 118 | op.free(Wtmp); 119 | op.free(Xtmp); 120 | op.free(XCov_diag); 121 | return -1; 122 | } 123 | struct timeval t0, t1; 124 | gettimeofday(&t0, 0); 125 | 126 | if (n == batch_size) 127 | op.calculate_column_variance(X, batch_size, m, XCov_diag); 128 | 129 | for (int cur_iter = 0; cur_iter < n_iter; ++cur_iter) { 130 | if (cur_iter % 25 == 0) { 131 | gettimeofday(&t1, 0); 132 | printf("epoch: %4d (time: %6.2fs)\n", cur_iter, time_diff(&t1, &t0)); 133 | } 134 | for (int cur_batch = 0; cur_batch < n_batches; ++cur_batch) { 135 | if (isMoreHiddensThanFeatures) { 136 | calculate_W_impl_invertMxM(op, W, P, Wout, k, m, WWPchol, WWPinv); 137 | } else { 138 | op.invert(P, m); // TODO: something better than inverting P twice, 139 | /* how about inverting P once into distinct mem? */ 140 | calculate_W_impl_invertKxK(op, W, P, Wout, k, m, Wtmp, WPWchol, WPWinv); 141 | op.invert(P, m); 142 | } 143 | 144 | XType Xnoise; 145 | 146 | if (input_noise_type && input_noise_rate > 0.0f) { 147 | op.memcpy_matrix(Xtmp, X, batch_size, m, cur_batch); 148 | switch(input_noise_type) { 149 | case 1: // dropout noise 150 | op.dropout(Xtmp, batch_size*m, input_noise_rate); 151 | break; 152 | case 2: // salt&pepper noise 153 | op.add_saltpepper_noise(Xtmp, batch_size*m, input_noise_rate); 154 | break; 155 | case 3: // gauss noise 156 | op.add_gauss_noise(Xtmp, batch_size*m, input_noise_rate); 157 | break; 158 | default: 159 | printf("invalid noise type"); 160 | assert(false); 161 | } 162 | Xnoise = Xtmp; 163 | } else { 164 | /* in case of sparse X, this is a copy operation! 
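(for dense X, get_batch simply returns a pointer into the existing matrix, while the
sparse overloads allocate a fresh per-batch structure -- see get_batch in
gpu_operations.h -- which is why Xnoise is released again via free_sparse below
whenever no input noise was applied)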
*/ 165 | Xnoise = op.get_batch(X, m, cur_batch, batch_size); 166 | } 167 | 168 | op.gemm("t", "n", k, batch_size, m, 1.0f, Wout, m, Xnoise, m, 0.0f, H, k); 169 | 170 | if (!(input_noise_type && input_noise_rate > 0.0f)) 171 | { 172 | /* free matrix only if it's sparse */ 173 | op.free_sparse(Xnoise); 174 | } 175 | 176 | switch (activation_type) { 177 | case 1: op.maximum(H, h_threshold, batch_size*k); break; 178 | case 2: op.leaky_relu(H, h_threshold, batch_size*k); break; 179 | case 3: op.sigmoid(H, batch_size*k); break; 180 | case 4: op.tanh(H, batch_size*k); break; 181 | default: 182 | printf("invalid activation type"); 183 | assert(false); 184 | } 185 | 186 | if (apply_scaling) { 187 | op.calculate_column_variance(H, batch_size, k, variance_H); 188 | op.invsqrt(variance_H, k); 189 | op.scale_columns(H, batch_size, k, variance_H); 190 | } 191 | if (dropout_rate > 0.0f) { 192 | op.dropout(H, batch_size*k, dropout_rate); 193 | } 194 | op.gemm("n", "t", k, k, batch_size, 1.0f/batch_size, H, k, H, k, 0.0f, S, k); 195 | if (isMoreHiddensThanFeatures) { 196 | op.gemm("t", "n", k, k, m, -1.0f, Wout, m, W, m, 1.0f, S, k); 197 | op.axpy(k, 1.0f, op.ones, 0, S, k+1); 198 | } else { 199 | op.axpy(k*k, 1.0f, WPWinv, 1, S, 1); 200 | } 201 | XType XBatch = op.get_batch(X, m, cur_batch, batch_size); 202 | 203 | op.gemm("n", "t", m, k, batch_size, 1.0f/batch_size, XBatch, m, H, k, 0.0f, U, m); 204 | 205 | if (applyNewtonUpdate) { 206 | op.axpy(k, 1e-10, op.ones, 0, S, k+1); 207 | op.memcpy(Schol, S, k*k*sizeof(float)); 208 | op.fill_eye(Sinv, k); 209 | op.posv("u", k, k, Schol, k, Sinv, k); 210 | op.gemm("n", "n", m, k, k, 1.0f, U, m, Sinv, k, momentum, dW, m); 211 | op.axpy(m*k, -(1.0f+l2_weightdecay), W, 1, dW, 1); 212 | } else { 213 | op.gemm("n", "n", m, k, k, -1.0f, W, m, S, k, momentum, dW, m); 214 | op.axpy(m*k, 1.0f, U, 1, dW, 1); 215 | 216 | if (l2_weightdecay > 0.0f) { 217 | op.axpy(m*k, -l2_weightdecay, W, 1, dW, 1); 218 | } 219 | } 220 | 221 | op.gemm("n", "n", m, k, k, 1.0f, W, m, S, k, -2.0f, U, m); 222 | op.gemm("n", "t", m, m, k, 1.0f, U, m, W, m, 0.0f, C, m); 223 | 224 | if (batch_size < n) { 225 | op.calculate_column_variance(XBatch, batch_size, m, dP); 226 | } else { 227 | op.memcpy(dP, XCov_diag, m*sizeof(float)); 228 | } 229 | 230 | op.free_sparse(XBatch); 231 | 232 | op.axpy(m, 1.0f, C, m+1, dP, 1); 233 | op.axpy(m, -1.0f, P, 1, dP, 1); 234 | 235 | op.axpy(m, etaP/n_batches, dP, 1, P, 1); 236 | op.axpy(m*k, etaW/n_batches, dW, 1, W, 1); 237 | 238 | op.maximum(P, minP, m); 239 | 240 | if (l1_weightdecay > 0.0f) { 241 | op.soft_threshold(W, l1_weightdecay, m*k); 242 | } 243 | } 244 | } 245 | op.free(dP); 246 | op.free(C); 247 | op.free(dW); 248 | op.free(Sinv); 249 | op.free(U); 250 | op.free(Schol); 251 | op.free(S); 252 | op.free(H); 253 | op.free(variance_H); 254 | op.free(Wout); 255 | op.free(WWPinv); 256 | op.free(WWPchol); 257 | op.free(WPWchol); 258 | op.free(WPWinv); 259 | op.free(Wtmp); 260 | op.free(Xtmp); 261 | op.free(XCov_diag); 262 | op.free_devicememory(X); 263 | 264 | op.to_host(W, W_host, m*k*sizeof(float)); 265 | op.to_host(P, P_host, m*sizeof(float)); 266 | return 0; 267 | } 268 | 269 | 270 | template 271 | void calculate_W(XTypeConst X_host, const float* W_host, const float* P_host, 272 | float* Wout_host, const int n, const int m, const int k, 273 | const int activation_type, const int apply_scaling, 274 | const float h_threshold, int gpu_id) { 275 | OP op(n, m, k, 0, gpu_id); 276 | float* P_copy = (float*) malloc(m*sizeof(float)); 277 | memcpy(P_copy, P_host, 
m*sizeof(float)); // we might need to invert P 278 | float* Wout = op.to_device(Wout_host, k*m*sizeof(float)); 279 | float* W = op.to_device(W_host, k*m*sizeof(float)); 280 | float* P = op.to_device(P_copy, m*sizeof(float)); 281 | XType X = op.to_device(X_host, n*m*sizeof(float)); 282 | float* H = op.malloc(n*k*sizeof(float)); 283 | float* variance_H = op.malloc(k*sizeof(float)); 284 | 285 | if (k > m) { 286 | float* WWPchol = op.malloc(m*m*sizeof(float)); 287 | float* WWPinv = op.malloc(m*m*sizeof(float)); 288 | calculate_W_impl_invertMxM(op, W, P, Wout, k, m, WWPchol, WWPinv); 289 | op.free(WWPchol); 290 | op.free(WWPinv); 291 | } else { 292 | op.invert(P, m); 293 | float* WPWchol = op.malloc(k*k*sizeof(float)); 294 | float* WPWinv = op.malloc(k*k*sizeof(float)); 295 | float* Wtmp = op.malloc(m*k*sizeof(float)); 296 | calculate_W_impl_invertKxK(op, W, P, Wout, k, m, Wtmp, WPWchol, WPWinv); 297 | op.free(Wtmp); 298 | op.free(WPWinv); 299 | op.free(WPWchol); 300 | op.invert(P, m); 301 | } 302 | 303 | op.gemm("t", "n", k, n, m, 1.0f, Wout, m, X, m, 0.0f, H, k); 304 | 305 | switch (activation_type) { 306 | case 1: op.maximum(H, h_threshold, n*k); break; 307 | case 2: op.leaky_relu(H, h_threshold, n*k); break; 308 | case 3: op.sigmoid(H, n*k); break; 309 | case 4: op.tanh(H, n*k); break; 310 | default: 311 | printf("invalid noise type"); 312 | assert(false); 313 | } 314 | 315 | if (apply_scaling){ 316 | op.calculate_column_variance(H, n, k, variance_H); 317 | op.invsqrt(variance_H, k); 318 | op.scale_rows(Wout, k, m, variance_H); 319 | } 320 | 321 | op.free(variance_H); 322 | op.free(H); 323 | op.to_host(Wout, Wout_host, k*m*sizeof(float)); 324 | op.free_devicememory(W); 325 | op.free_devicememory(P); 326 | op.free_devicememory(X); 327 | free(P_copy); 328 | } 329 | 330 | 331 | extern "C" { 332 | 333 | int train_rfn(const float* X, float* W, float* P, const int n, 334 | const int m, const int k, const int n_iter, int batch_size, 335 | const float etaW, const float etaP, const float minP, const float h_threshold, 336 | const float dropout_rate, const float input_noise_rate, 337 | const float l2_weightdecay, const float l1_weightdecay, 338 | const float momentum, 339 | const int input_noise_type, const int activation_type, const int apply_scaling, 340 | const int applyNewtonUpdate, unsigned long seed, const int gpu_id) { 341 | 342 | if (gpu_id == USE_CPU) { 343 | if (k > m) { 344 | return train(X, W, P, n, m, k, 345 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 346 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 347 | } else { 348 | return train(X, W, P, n, m, k, 349 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 350 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 351 | } 352 | } else { 353 | #ifndef NOGPU 354 | if (k > m) { 355 | return train(X, W, P, n, m, k, 356 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 357 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 358 | } else { 359 | return train(X, W, P, n, m, k, 360 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 361 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 362 | } 363 | #else 364 | 
fprintf(stderr, "librfn was compiled without GPU support"); 365 | #endif 366 | } 367 | } 368 | 369 | 370 | int train_rfn_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 371 | float* W, float* P, const int n, const int m, 372 | const int k, const int n_iter, int batch_size, const float etaW, 373 | const float etaP, const float minP, const float h_threshold, 374 | const float dropout_rate, const float input_noise_rate, 375 | const float l2_weightdecay, const float l1_weightdecay, 376 | const float momentum, 377 | const int input_noise_type, const int activation_type, const int apply_scaling, 378 | const int applyNewtonUpdate, unsigned long seed, const int gpu_id) { 379 | if (gpu_id == USE_CPU) { 380 | const CPU_Operations::SparseMatrix X = CPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 381 | int retval = 0; 382 | if (k > m) { 383 | retval = train((CPU_Operations::SparseMatrix) X, W, P, n, m, k, 384 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 385 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 386 | } else { 387 | retval = train((CPU_Operations::SparseMatrix) X, W, P, n, m, k, 388 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 389 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 390 | } 391 | //destroy(X); 392 | return retval; 393 | } else { 394 | #ifndef NOGPU 395 | const GPU_Operations::SparseMatrix X = GPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 396 | if (k > m) { 397 | return train(&X, W, P, n, m, k, 398 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 399 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 400 | } else { 401 | return train(&X, W, P, n, m, k, 402 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 403 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 404 | } 405 | #else 406 | fprintf(stderr, "librfn was compiled without GPU support"); 407 | #endif 408 | } 409 | } 410 | 411 | 412 | void calculate_W(const float* X, const float* W, const float* P, float* Wout, 413 | const int n, const int m, const int k, const int activation_type, 414 | const int apply_scaling, const float h_threshold, int gpu_id) { 415 | if (gpu_id == USE_CPU) { 416 | return calculate_W(X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, gpu_id); 417 | } else { 418 | #ifndef NOGPU 419 | return calculate_W(X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, -1); 420 | #else 421 | fprintf(stderr, "librfn was compiled without GPU support"); 422 | #endif 423 | } 424 | } 425 | 426 | 427 | void calculate_W_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 428 | const float* W, const float* P, float* Wout, 429 | const int n, const int m, const int k, const int activation_type, 430 | const int apply_scaling, const float h_threshold, int gpu_id) { 431 | if (gpu_id == USE_CPU) { 432 | const CPU_Operations::SparseMatrix X = CPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 433 | calculate_W(X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, -1); 434 | //destroy(X); 435 | } else { 436 | #ifndef NOGPU 437 | const 
GPU_Operations::SparseMatrix X = GPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 438 | calculate_W(&X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, gpu_id); 439 | #else 440 | fprintf(stderr, "librfn was compiled without GPU support"); 441 | #endif 442 | } 443 | } 444 | 445 | } 446 | -------------------------------------------------------------------------------- /gpu_operations.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include /* for typeid */ 20 | 21 | 22 | using std::fprintf; 23 | 24 | inline cublasFillMode_t uplo_to_cublas(const char* uplo) { 25 | return tolower(uplo[0]) == 'l' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; 26 | } 27 | 28 | inline cusparseOperation_t op_to_cusparse(const char* op) { 29 | return tolower(op[0]) == 't' ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 30 | } 31 | 32 | static const char* cusparseErrorString(cusparseStatus_t error) { 33 | switch (error) { 34 | case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; 35 | case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; 36 | case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; 37 | case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; 38 | case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; 39 | case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; 40 | case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; 41 | case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; 42 | case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 43 | case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; 44 | default: return ""; 45 | } 46 | } 47 | 48 | static const char* cublasErrorString(cublasStatus_t error) { 49 | switch (error) { 50 | case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; 51 | case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; 52 | case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; 53 | case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; 54 | case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; 55 | case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; 56 | case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; 57 | case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; 58 | #if CUDA_VERSION >= 6000 59 | case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; 60 | #endif 61 | default: return ""; 62 | } 63 | } 64 | 65 | #ifndef DNDEBUG 66 | 67 | #define CUDA_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); } 68 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { 69 | if (code != cudaSuccess) { 70 | fprintf(stderr, "CUDA Error: %s %s:%d\n", cudaGetErrorString(code), file, line); 71 | if (abort) 72 | exit(code); 73 | } 74 | } 75 | 76 | #define CUBLAS_CALL(ans) { cublasAssert((ans), __FILE__, __LINE__); } 77 | inline void 
cublasAssert(cublasStatus_t code, const char *file, int line) { 78 | //printf("%d (%s:%d)\n", code, file, line); 79 | if (code != CUBLAS_STATUS_SUCCESS) { 80 | fprintf(stderr, "CUBLAS Error: %s %s:%d\n", cublasErrorString(code), file, line); 81 | exit(code); 82 | } 83 | } 84 | 85 | #define CUSPARSE_CALL(ans) { cusparseAssert((ans), __FILE__, __LINE__); } 86 | inline void cusparseAssert(cusparseStatus_t code, const char *file, int line) { 87 | // printf("%d (%s:%d)\n", code, file, line); 88 | if (code != CUSPARSE_STATUS_SUCCESS) { 89 | fprintf(stderr, "CUSPARSE Error: %s %s:%d\n", cusparseErrorString(code), file, line); 90 | exit(code); 91 | } 92 | } 93 | 94 | static const char* cusolverErrorString(cusolverStatus_t error) { 95 | switch (error) { 96 | case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS"; 97 | case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; 98 | case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; 99 | case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; 100 | case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; 101 | case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; 102 | case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; 103 | case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 104 | default: return ""; 105 | } 106 | } 107 | 108 | #define CUSOLVER_CALL(ans) { cusolverAssert((ans), __FILE__, __LINE__); } 109 | inline void cusolverAssert(cusolverStatus_t code, const char *file, int line) { 110 | //printf("%d (%s:%d)\n", code, file, line); 111 | if (code != CUSOLVER_STATUS_SUCCESS) { 112 | fprintf(stderr, "CUBLAS Error: %s %s:%d\n", cusolverErrorString(code), file, line); 113 | exit(code); 114 | } 115 | } 116 | 117 | #else 118 | #define CUBLAS_CALL(ans) (ans) 119 | #define CUDA_CALL(ans) (ans) 120 | #define CUSOLVER_CALL(ans) (ans) 121 | #define CUSPARSE_CALL(ans) (ans) 122 | #endif 123 | 124 | #define MAX_STREAMS 16 125 | 126 | 127 | 128 | class GPU_Operations { 129 | cublasHandle_t handle; 130 | curandState* rng_state; 131 | cusolverDnHandle_t cudense_handle; 132 | cusparseHandle_t cusparse_handle; 133 | std::map buffer_map; // keeps track of buffers allocated for potrf 134 | int* devinfo; // cuSOLVER error reporting 135 | cudaStream_t streams[MAX_STREAMS]; 136 | cusparseMatDescr_t descr; 137 | 138 | 139 | 140 | public: 141 | float* ones; 142 | 143 | struct SparseMatrix { 144 | float *values; 145 | int *columns; 146 | int *rowPointers; 147 | int m; // number of rows 148 | int nnz; // number of nonzero elements 149 | }; 150 | 151 | const SparseMatrix INVALID = { 152 | (float*)-1, (int*)-1, (int*)-1, 0, 0 153 | }; 154 | 155 | static SparseMatrix create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m); 156 | 157 | 158 | GPU_Operations(int n, int m, int k, unsigned long seed, int gpu_id); 159 | ~GPU_Operations(); 160 | 161 | float* to_device(const float* src, size_t size) const; 162 | int* to_device(const int* src, size_t size) const; 163 | SparseMatrix* to_device(const SparseMatrix* src, size_t size) const; 164 | 165 | float* to_host(float* src, float* dst, size_t size) const { 166 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 167 | free(src); 168 | return dst; 169 | } 170 | 171 | int* to_host(int* src, int* dst, size_t size) const { 172 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 173 | 
free(src); 174 | return dst; 175 | } 176 | 177 | float* copy_to_host(const float* src, float* dst, size_t size) const { 178 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 179 | return dst; 180 | } 181 | 182 | int* copy_to_host(const int* src, int* dst, size_t size) const { 183 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 184 | return dst; 185 | } 186 | 187 | void set_stream(unsigned iterator) const { 188 | unsigned stream_id = iterator % MAX_STREAMS; 189 | CUBLAS_CALL(cublasSetStream_v2(handle, streams[stream_id])); 190 | } 191 | 192 | void synchronize_stream(unsigned iterator) const { 193 | unsigned stream_id = iterator % MAX_STREAMS; 194 | CUDA_CALL(cudaStreamSynchronize(streams[stream_id])); 195 | } 196 | 197 | void synchronize_all_streams() const { 198 | for (unsigned i = 0; i < MAX_STREAMS; ++i) { 199 | synchronize_stream(i); 200 | } 201 | } 202 | 203 | void default_stream() const { 204 | CUBLAS_CALL(cublasSetStream_v2(handle, NULL)); 205 | } 206 | 207 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 208 | const float *a, const int lda, const float *b, const int ldb, const float beta, float *c, 209 | const int ldc) const { 210 | cublasOperation_t ta = tolower(transa[0]) == 'n' ? CUBLAS_OP_N : CUBLAS_OP_T; 211 | cublasOperation_t tb = tolower(transb[0]) == 'n' ? CUBLAS_OP_N : CUBLAS_OP_T; 212 | CUBLAS_CALL(cublasSgemm(handle, ta, tb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); 213 | } 214 | 215 | void gemm(const char *transa, const char *transb, const int m, 216 | const int n, const int k, const float alpha, 217 | const SparseMatrix* a, const int lda, const float *b, 218 | const int ldb, const float beta, float *c, 219 | const int ldc); 220 | 221 | void gemm(const char *transa, const char *transb, const int m, 222 | const int n, const int k, const float alpha, const float *a, 223 | const int lda, const SparseMatrix* b, const int ldb, 224 | const float beta, float *c, const int ldc); 225 | 226 | void dgmm(const char* mode, const int m, const int n, const float* A, 227 | int lda, const float* x, int incx, float* C, 228 | int ldc) const { 229 | cublasSideMode_t lr = mode[0] == 'l' ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; 230 | CUBLAS_CALL(cublasSdgmm(handle, lr, m, n, A, lda, x, incx, C, ldc)); 231 | } 232 | 233 | void symm(const char *side, const char *uplo, const int m, const int n, 234 | const float alpha, const float *a, const int lda, const float *b, 235 | const int ldb, const float beta, float *c, const int ldc) const { 236 | cublasSideMode_t s = tolower(side[0]) == 'l' ? 
CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; 237 | cublasFillMode_t ul = uplo_to_cublas(uplo); 238 | CUBLAS_CALL(cublasSsymm(handle, s, ul, m, n, &alpha,a, lda, b, ldb, &beta, c, ldc)); 239 | } 240 | 241 | void axpy(const int n, const float alpha, const float* x, const int incx, float *y, const int incy) const { 242 | CUBLAS_CALL(cublasSaxpy(handle, n, &alpha, x, incx, y, incy)); 243 | } 244 | 245 | int potrf(const char *uplo, int n, float* a, int lda) { 246 | cublasFillMode_t ul = uplo_to_cublas(uplo); 247 | int bufsize = 0; 248 | int info = 0; 249 | CUSOLVER_CALL(cusolverDnSpotrf_bufferSize(cudense_handle, ul, n, a, lda, &bufsize)); 250 | 251 | float* buffer = (float*) get_buffer(bufsize * sizeof(float)); 252 | 253 | CUSOLVER_CALL(cusolverDnSpotrf(cudense_handle, ul, n, a, lda, buffer, bufsize, devinfo)); 254 | CUDA_CALL(cudaMemcpy(&info, devinfo, sizeof(int), cudaMemcpyDeviceToHost)); 255 | return info; 256 | } 257 | 258 | void* get_buffer(size_t bufsize) { 259 | // See if we already have a buffer of correct size, otherwise allocate 260 | void* buffer = 0; 261 | auto it = buffer_map.find(bufsize); 262 | if (it != buffer_map.end()) { 263 | buffer = it->second; 264 | } else { 265 | buffer = malloc(bufsize); 266 | buffer_map[bufsize] = buffer; 267 | } 268 | return buffer; 269 | } 270 | 271 | int potrs(const char *uplo, int n, int nrhs, float * a, int lda, float *b, int ldb) const { 272 | int info; 273 | cublasFillMode_t ul = uplo_to_cublas(uplo); 274 | CUSOLVER_CALL(cusolverDnSpotrs(cudense_handle, ul, n, nrhs, a, lda, b, ldb, devinfo)); 275 | CUDA_CALL(cudaMemcpy(&info, devinfo, sizeof(info), cudaMemcpyDeviceToHost)); 276 | return info; 277 | } 278 | 279 | int posv(const char *uplo, int n, int nrhs, float * a, int lda, float *b, int ldb) { 280 | int info = potrf(uplo, n, a, lda); 281 | if (info == 0) 282 | info = potrs(uplo, n, nrhs, a, lda, b, ldb); 283 | return info; 284 | } 285 | 286 | void* memset(void* dest, int ch, size_t count) const { 287 | CUDA_CALL(cudaMemset(dest, ch, count)); 288 | return dest; 289 | } 290 | 291 | float* memcpy(void* dest, const void *src, size_t count) const { 292 | CUDA_CALL(cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice)); 293 | return 0; 294 | } 295 | 296 | void free(void* ptr) const { 297 | if (ptr != 0 && ptr != &INVALID) { 298 | CUDA_CALL(cudaFree(ptr)); 299 | } 300 | } 301 | 302 | void free_devicememory(void* ptr) const { 303 | if (ptr != 0) { 304 | CUDA_CALL(cudaFree(ptr)); 305 | } 306 | } 307 | 308 | void free_devicememory(SparseMatrix* matrix) { 309 | if (matrix != 0 && matrix != &INVALID) { 310 | free(matrix->columns); 311 | free(matrix->values); 312 | free(matrix->rowPointers); 313 | std::free(matrix); 314 | } 315 | } 316 | 317 | float* malloc(size_t size) const { 318 | float* retval = 0; 319 | cudaError_t err = cudaMalloc(&retval, size); 320 | CUDA_CALL(err); 321 | if (err != cudaSuccess) { 322 | fprintf(stderr, "cudaMalloc failed\n"); 323 | retval = 0; 324 | } 325 | return retval; 326 | } 327 | 328 | int* malloci(size_t size) const { 329 | int* retval = 0; 330 | cudaError_t err = cudaMalloc(&retval, size); 331 | CUDA_CALL(err); 332 | if (err != cudaSuccess) { 333 | fprintf(stderr, "cudaMalloc failed\n"); 334 | retval = 0; 335 | } 336 | return retval; 337 | } 338 | 339 | void fill_eye(float* X, unsigned n) const; 340 | void fill(float* X, const unsigned size, const float value) const; 341 | void maximum(float* x, const float value, const unsigned size) const; 342 | void leaky_relu(float* x, const float value, const unsigned size) const; 343 | 
void tanh(float* x, const unsigned size) const; 344 | void sigmoid(float* x, const unsigned size) const; 345 | void soft_threshold(float* x, const float alpha, const int size) const; 346 | void invsqrt(float* s, const unsigned n) const; 347 | 348 | void invert(float* X, const unsigned size) const; 349 | 350 | void calculate_column_variance(const float* X, const unsigned nrows, const unsigned ncols, float* variances) const; 351 | void scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 352 | void scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 353 | void dropout(float* X, const unsigned size, const float dropout_rate) const; 354 | void add_saltpepper_noise(float* X, const unsigned size, const float noise_rate) const; 355 | void add_gauss_noise(float* X, const unsigned size, const float noise_rate) const; 356 | 357 | void calculate_column_variance(const SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* variances); 358 | void scale_columns(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const; 359 | void scale_rows(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const; 360 | void dropout(SparseMatrix* X, const unsigned size, const float dropout_rate) const; 361 | void add_saltpepper_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const; 362 | void add_gauss_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const; 363 | 364 | template 365 | T init_invalid(void) { 366 | return (typeid(T) == typeid(SparseMatrix*) ? (T) &INVALID : (T) 0); 367 | } 368 | 369 | template 370 | T malloc_matrix(int rows, int cols) { 371 | return malloc_matrix(rows, cols, init_invalid()); 372 | } 373 | 374 | SparseMatrix* malloc_matrix(int rows, int cols, SparseMatrix* dummy) { 375 | SparseMatrix* matrix = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 376 | return matrix; 377 | } 378 | 379 | float* malloc_matrix(int rows, int cols, float *dummy) { 380 | return (float*) malloc(rows * cols * sizeof(float)); 381 | } 382 | 383 | float *memcpy_matrix(float *dest, float *src, int nrows_to_copy, int src_ncol, int first_row = 0) const { 384 | return memcpy(dest, &src[first_row * src_ncol], nrows_to_copy * src_ncol * sizeof(float)); 385 | } 386 | 387 | SparseMatrix* memcpy_matrix(SparseMatrix* dest, SparseMatrix* src, int nrows_to_copy, int src_ncol, int first_row = 0) const { 388 | int fromIndex = 0; 389 | int toIndex = 0; 390 | CUDA_CALL(cudaMemcpy(&fromIndex, &src->rowPointers[first_row], sizeof(int), cudaMemcpyDeviceToHost)); 391 | CUDA_CALL(cudaMemcpy(&toIndex , &src->rowPointers[first_row + nrows_to_copy], sizeof(int), cudaMemcpyDeviceToHost)); 392 | 393 | dest->nnz = (toIndex - fromIndex); 394 | dest->m = nrows_to_copy; 395 | 396 | dest->values = malloc(dest->nnz * sizeof(float)); 397 | dest->columns = malloci(dest->nnz * sizeof(int)); 398 | dest->rowPointers = malloci((nrows_to_copy + 1) * sizeof(int)); 399 | 400 | memcpy(dest->values, &src->values[fromIndex], dest->nnz * sizeof(float)); 401 | memcpy(dest->columns, &src->columns[fromIndex], dest->nnz * sizeof(int)); 402 | memcpy(dest->rowPointers, &src->rowPointers[first_row], (nrows_to_copy + 1) * sizeof(int)); 403 | subtract_first_element(dest->rowPointers, nrows_to_copy + 1); 404 | 405 | return dest; 406 | } 407 | 408 | void subtract_first_element(int* a, unsigned len) const; 409 | 410 | void free_sparse(void *ptr) { 411 | } 412 | 413 | void free_sparse(SparseMatrix* a) { 414 | // see get batch 
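// get_batch() aliases values/columns of the parent matrix and only allocates the
// SparseMatrix struct itself plus a fresh rowPointers array, so those are the only
// allocations that need to be released here.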
415 | if (handle_valid(a)) { 416 | free(a->rowPointers); 417 | std::free(a); 418 | } 419 | } 420 | 421 | bool handle_valid(SparseMatrix* a) { 422 | return a != &INVALID; 423 | } 424 | 425 | float* get_batch(const float* X, int ncol, int batch_num, int batch_size) { 426 | /* return pointer */ 427 | return (float*) &X[batch_num * batch_size * ncol]; 428 | } 429 | 430 | SparseMatrix* get_batch(SparseMatrix* X, int ncol, int batch_num, int batch_size) { 431 | // ncol can be ignored 432 | // batch_size number of rows 433 | int from = batch_num * batch_size; 434 | int nrows = batch_size; 435 | 436 | SparseMatrix* dest = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 437 | int fromIndex = 0; 438 | int toIndex = 0; 439 | CUDA_CALL(cudaMemcpy(&fromIndex, &X->rowPointers[from], sizeof(int), cudaMemcpyDeviceToHost)); 440 | CUDA_CALL(cudaMemcpy(&toIndex , &X->rowPointers[from + nrows], sizeof(int), cudaMemcpyDeviceToHost)); 441 | 442 | dest->nnz = (toIndex - fromIndex); 443 | dest->m = nrows; 444 | dest->values = &X->values[fromIndex]; 445 | dest->columns = &X->columns[fromIndex]; 446 | dest->rowPointers = malloci((nrows + 1) * sizeof(int)); 447 | memcpy(dest->rowPointers, &X->rowPointers[from], (nrows + 1) * sizeof(int)); 448 | subtract_first_element(dest->rowPointers, nrows + 1); 449 | return dest; 450 | } 451 | 452 | SparseMatrix* transpose(const SparseMatrix* x, int ncol) { 453 | SparseMatrix* t = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 454 | t->values = //(float*) get_buffer(x->nnz * sizeof(float)); 455 | malloc(x->nnz * sizeof(float)); 456 | t->columns = //(int*) get_buffer(x->nnz * sizeof(int)); 457 | malloci(x->nnz * sizeof(int)); 458 | t->rowPointers = //(int*) get_buffer((ncol + 1) * sizeof(int)); 459 | malloci((ncol + 1) * sizeof(int)); 460 | t->nnz = x->nnz; 461 | t->m = ncol; 462 | CUSPARSE_CALL(cusparseScsr2csc(cusparse_handle, x->m, ncol, x->nnz, x->values, x->rowPointers, x->columns, t->values, 463 | t->columns, t->rowPointers, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); 464 | 465 | return t; 466 | } 467 | 468 | // Useful for debugging 469 | void printm(const char* name, const SparseMatrix *a, int n, int m) const { 470 | printf("%s\n", name); 471 | printMatrixSPM(a, n, m, 0); 472 | } 473 | 474 | void printm(const char* name, const float* a, int n, int m) const { 475 | printf("%s\n", name); 476 | printMatrixRM(a, n, m, 0); 477 | } 478 | 479 | void printMatrixCM(const float* a, int n, int m, const char* fmt) const; 480 | void printMatrixRM(const float* a, int n, int m, const char* fmt) const; 481 | 482 | void printMatrixSP(const SparseMatrix* a, const char* fmt) const; 483 | void printMatrixRM(const SparseMatrix* a, int n, int m, const char* fmt) const { 484 | printMatrixSPM(a, n, m, fmt); 485 | } 486 | 487 | void printMatrixSPM(const SparseMatrix* a, int n, int m, const char* fmt) const; 488 | 489 | void prints(const float* f, unsigned l) const { 490 | float* src = (float*) std::malloc(l * sizeof(float)); 491 | copy_to_host(f, src, l * sizeof(float)); 492 | for (unsigned i = 0; i < l; ++i) { 493 | printf("%f ", src[i]); 494 | } 495 | printf("\n"); 496 | std::free(src); 497 | } 498 | 499 | void printsu(const int* f, unsigned l) const { 500 | int* src = (int*) std::malloc(l * sizeof(int)); 501 | copy_to_host(f, src, l * sizeof(int)); 502 | for (unsigned i = 0; i < l; ++i) { 503 | printf("%d ", src[i]); 504 | } 505 | printf("\n"); 506 | std::free(src); 507 | } 508 | }; 509 | -------------------------------------------------------------------------------- 
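Note on the kernels in gpu_operations.cu below: they all use the same grid-stride loop, with the launch geometry picked by get_grid_sizes. The following is only a sketch of how such a kernel is typically launched; launch_maximum is a hypothetical helper introduced here for illustration, the library's own wrapper is the GPU_Operations::maximum member defined in gpu_operations.cu.

// Sketch (assumption, not library code): launch an element-wise kernel over `size` elements.
void launch_maximum(float* d_x, float value, unsigned size) {
    int blocks, threads;
    get_grid_sizes(size, &blocks, &threads);              // heuristic taken from PyCUDA (defined below)
    maximum_eltw<<<blocks, threads>>>(d_x, value, size);  // each thread strides over the whole array
    CUDA_CALL(cudaGetLastError());                        // surface launch errors in debug builds
}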
/gpu_operations.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "gpu_operations.h" 13 | 14 | static const int RNG_THREADS = 128; 15 | static const int RNG_BLOCKS = 128; 16 | 17 | // taken from PyCUDA 18 | void get_grid_sizes(int problemsize, int* blocks, int* threads) { 19 | int min_threads = 32; 20 | int max_threads = 256; 21 | int max_blocks = 384; 22 | 23 | if (problemsize < min_threads) { 24 | *blocks = 1; 25 | *threads = min_threads; 26 | } else if (problemsize < max_blocks * min_threads) { 27 | *blocks = (problemsize + min_threads - 1) / min_threads; 28 | *threads = min_threads; 29 | } else if (problemsize < max_blocks * max_threads) { 30 | *blocks = max_blocks; 31 | int grp = (problemsize + min_threads - 1) / min_threads; 32 | *threads = ((grp + max_blocks - 1) / max_blocks) * min_threads; 33 | } else { 34 | *blocks = max_blocks; 35 | *threads = max_threads; 36 | } 37 | } 38 | 39 | __global__ void setup_rng(curandState* rng_state, unsigned long seed) { 40 | const int tid = blockIdx.x * blockDim.x + threadIdx.x; 41 | curand_init(seed, tid, 0, &rng_state[tid]); 42 | } 43 | 44 | __global__ void dropout_eltw(float* x, const unsigned size, const float dropout_rate, curandState* rng_state) { 45 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 46 | const unsigned num_threads = gridDim.x * blockDim.x; 47 | curandState localState = rng_state[tid]; 48 | for (unsigned i = tid; i < size; i += num_threads) 49 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i]; 50 | rng_state[tid] = localState; 51 | } 52 | 53 | __global__ void saltpepper_noise_eltw(float* x, const unsigned size, const float noise_rate, curandState* rng_state) { 54 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 55 | const unsigned num_threads = gridDim.x * blockDim.x; 56 | curandState localState = rng_state[tid]; 57 | for (unsigned i = tid; i < size; i += num_threads) 58 | if (curand_uniform(&localState) < noise_rate) { 59 | x[i] = (curand_uniform(&localState) < 0.5f) ? 0.0f : 1.0f; 60 | } 61 | rng_state[tid] = localState; 62 | 63 | } 64 | 65 | __global__ void gauss_noise_eltw(float* x, const unsigned size, const float noise_rate, curandState* rng_state) { 66 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 67 | const unsigned num_threads = gridDim.x * blockDim.x; 68 | curandState localState = rng_state[tid]; 69 | for (unsigned i = tid; i < size; i += num_threads) 70 | x[i] += curand_normal(&localState) * noise_rate; 71 | rng_state[tid] = localState; 72 | 73 | } 74 | 75 | __global__ void leaky_relu_eltw(float* x, const float value, const unsigned size) { 76 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 77 | const unsigned num_threads = gridDim.x * blockDim.x; 78 | for (unsigned i = tid; i < size; i += num_threads) { 79 | x[i] = (x[i] < 0.0f) ? 
x[i] * value : x[i]; 80 | } 81 | } 82 | 83 | __global__ void maximum_eltw(float* x, const float value, const unsigned size) { 84 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 85 | const unsigned num_threads = gridDim.x * blockDim.x; 86 | for (unsigned i = tid; i < size; i += num_threads) { 87 | x[i] = fmaxf(x[i], value); 88 | } 89 | } 90 | 91 | __global__ void sigmoid_eltw(float* x, const unsigned size) { 92 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 93 | const unsigned num_threads = gridDim.x * blockDim.x; 94 | for (unsigned i = tid; i < size; i += num_threads) { 95 | x[i] = 1 / (1 + __expf(-x[i])); 96 | } 97 | } 98 | 99 | __global__ void tanh_eltw(float* x, const unsigned size) { 100 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 101 | const unsigned num_threads = gridDim.x * blockDim.x; 102 | for (unsigned i = tid; i < size; i += num_threads) { 103 | x[i] = tanhf(x[i]); 104 | } 105 | } 106 | 107 | __global__ void softthreshold_eltw(float* x, float alpha, const unsigned size) { 108 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 109 | const unsigned num_threads = gridDim.x * blockDim.x; 110 | for (unsigned i = tid; i < size; i += num_threads) { 111 | const float f = x[i]; 112 | x[i] = f > 0 ? fmaxf(0., f - alpha) : fminf(0., f + alpha); 113 | } 114 | } 115 | 116 | __global__ void fill_eltw(float* x, const unsigned size, const float value) { 117 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 118 | const unsigned num_threads = gridDim.x * blockDim.x; 119 | for (unsigned i = tid; i < size; i += num_threads) { 120 | x[i] = value; 121 | } 122 | } 123 | 124 | __global__ void invert_eltw(float* x, const unsigned size) { 125 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 126 | const unsigned num_threads = gridDim.x * blockDim.x; 127 | for (unsigned i = tid; i < size; i += num_threads) { 128 | x[i] = 1.0f / x[i]; 129 | } 130 | } 131 | 132 | __global__ void col_variance_kernel(const float* X, float* var, const unsigned nrows, const unsigned ncols) { 133 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 134 | const unsigned num_threads = blockDim.x * gridDim.x; 135 | for (unsigned i = tid; i < ncols; i += num_threads) { 136 | var[i] = 0.0; 137 | for (unsigned j = 0; j < nrows; ++j) { 138 | var[i] += X[j * ncols + i]; 139 | } 140 | float m = var[i] / nrows; 141 | var[i] = 0.0; 142 | for (unsigned j = 0; j < nrows; ++j) { 143 | float tmp = X[j * ncols + i] - m; 144 | var[i] += tmp * tmp; 145 | } 146 | var[i] /= nrows; 147 | } 148 | } 149 | 150 | __global__ void invsqrt_eltw(float* x, const unsigned k) { 151 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 152 | const unsigned num_threads = blockDim.x * gridDim.x; 153 | for (unsigned i = tid; i < k; i += num_threads) { 154 | x[i] = (x[i] > 1e-7) ? 
rsqrtf(x[i]) : 1.0; 155 | } 156 | } 157 | 158 | __global__ void scale_columns_kernel(float* X, float* a, const unsigned nrows, const unsigned ncols) { 159 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 160 | const unsigned num_threads = blockDim.x * gridDim.x; 161 | for (unsigned i = tid; i < ncols * nrows; i += num_threads) { 162 | X[i] *= a[i % ncols]; 163 | } 164 | } 165 | 166 | __global__ void scale_rows_kernel(float* X, float* a, const unsigned nrows, const unsigned ncols) { 167 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 168 | const unsigned num_threads = blockDim.x * gridDim.x; 169 | for (unsigned i = tid; i < ncols * nrows; i += num_threads) { 170 | X[i] *= a[i / ncols]; 171 | } 172 | } 173 | 174 | __global__ void subtract_first_kernel(int* x, const unsigned len) { 175 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 176 | const unsigned num_threads = blockDim.x * gridDim.x; 177 | const unsigned elem = x[0]; 178 | for (unsigned i = tid; i < len; i += num_threads) { 179 | x[i] -= elem; 180 | } 181 | } 182 | 183 | __global__ void sparse_col_variance_kernel(const GPU_Operations::SparseMatrix X, float* var, const unsigned nrows, 184 | const unsigned ncols) { 185 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 186 | const unsigned num_threads = blockDim.x * gridDim.x; 187 | for (unsigned i = tid; i < ncols; i += num_threads) { 188 | var[i] = 0.0; 189 | for (unsigned j = 0; j < X.nnz; ++j) { 190 | if (X.columns[j] == i) { 191 | var[i] += X.values[j]; 192 | } 193 | } 194 | float m = var[i] / nrows; 195 | var[i] = 0.0; 196 | unsigned nonzero_per_column = 0; 197 | for (unsigned j = 0; j < X.nnz; ++j) { 198 | if (X.columns[j] == i) { 199 | float tmp = X.values[j] - m; 200 | var[i] += tmp * tmp; 201 | nonzero_per_column++; 202 | } 203 | } 204 | var[i] += (nrows - nonzero_per_column) * (m * m); 205 | var[i] /= nrows; 206 | } 207 | } 208 | 209 | __global__ void sparse_row_variance_kernel(const GPU_Operations::SparseMatrix X, float* var, const unsigned nrows, 210 | const unsigned ncols) { 211 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 212 | const unsigned num_threads = blockDim.x * gridDim.x; 213 | for (unsigned i = tid; i < nrows; i += num_threads) { 214 | var[i] = 0.0; 215 | int from = X.rowPointers[i]; 216 | int to = X.rowPointers[i + 1]; 217 | for (int j = from; j < to; ++j) { 218 | var[i] += X.values[j]; 219 | } 220 | float m = var[i] / ncols; 221 | var[i] = 0.0; 222 | for (int j = from; j < to; ++j) { 223 | float tmp = X.values[j] - m; 224 | var[i] += tmp * tmp; 225 | } 226 | var[i] += (ncols - to + from) * (m * m); 227 | var[i] /= ncols; 228 | } 229 | } 230 | 231 | __global__ void sparse_scale_columns_kernel(GPU_Operations::SparseMatrix X, float* a, const unsigned nrows, const unsigned ncols) { 232 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 233 | const unsigned num_threads = blockDim.x * gridDim.x; 234 | for (unsigned i = tid; i < X.nnz; i += num_threads) { 235 | X.values[i] *= a[X.columns[i]]; 236 | } 237 | } 238 | 239 | __global__ void sparse_scale_rows_kernel(GPU_Operations::SparseMatrix X, float* a) { 240 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 241 | const unsigned num_threads = blockDim.x * gridDim.x; 242 | for (unsigned i = tid; i < X.m; i += num_threads) { 243 | for (unsigned j = X.rowPointers[i]; j < X.rowPointers[i + 1]; ++j) { 244 | X.values[j] *= a[i]; 245 | } 246 | } 247 | } 248 | 249 | GPU_Operations::GPU_Operations(const int n, const int m, const int k, 
unsigned long seed, int gpu_id) {
250 |     // if no GPU was specified, try to pick the best one automatically
251 |     if (gpu_id < 0) {
252 |         gpu_id = 0;
253 |         int num_devices, device;
254 |         cudaGetDeviceCount(&num_devices);
255 |         if (num_devices > 1) {
256 |             size_t max_freememory = 0;
257 |             for (device = 0; device < num_devices; device++) {
258 |                 size_t free, total;
259 |                 cudaSetDevice(device);
260 |                 cudaMemGetInfo(&free, &total);
261 |                 cudaDeviceProp prop;
262 |                 cudaGetDeviceProperties(&prop, device);
263 |                 //printf("Found device %d (%s) with %d MiB of free memory\n",
264 |                 //       device, prop.name, free / (1024l*1024l));
265 |                 if (free > max_freememory) {
266 |                     max_freememory = free;
267 |                     gpu_id = device;
268 |                 }
269 |                 cudaDeviceReset();
270 |             }
271 |         }
272 |     }
273 |     assert(gpu_id >= 0);
274 |     cudaSetDevice(gpu_id);
275 | 
276 |     // the following call does not work if the current process has already
277 |     // called into librfn previously. Then, this call will return
278 |     // cudaErrorSetOnActiveProcess. Resetting the device won't work either,
279 |     // because then the subsequent cublasCreate call will just fail with
280 |     // CUBLAS_STATUS_NOT_INITIALIZED. I don't know why any of this is happening
281 |     //CUDA_CALL(cudaSetDeviceFlags(cudaDeviceScheduleYield));
282 | 
283 |     cublasStatus_t status = cublasCreate(&handle);
284 |     if (status != CUBLAS_STATUS_SUCCESS) {
285 |         const char* errmsg = cublasErrorString(status);
286 |         fprintf(stderr, "CUBLAS initialization error: %s\n", errmsg);
287 |         cudaDeviceReset();
288 |         throw std::runtime_error(errmsg);
289 |     }
290 |     CUSOLVER_CALL(cusolverDnCreate(&cudense_handle));
291 |     CUDA_CALL(cudaMalloc(&rng_state, RNG_BLOCKS * RNG_THREADS * sizeof(curandState)));
292 |     setup_rng<<<RNG_BLOCKS, RNG_THREADS>>>(rng_state, seed);
293 |     int ones_size = n > k ? n : k;
294 |     ones = (float*) malloc(ones_size * sizeof(float));
295 |     fill(ones, ones_size, 1.0f);
296 |     CUDA_CALL(cudaMalloc(&devinfo, sizeof(int)));
297 | 
298 |     cusparseStatus_t sp_status = cusparseCreate(&cusparse_handle);
299 |     if (sp_status != CUSPARSE_STATUS_SUCCESS) {
300 |         fprintf(stderr, "cuSparse: %d\n", sp_status);
301 |         cudaDeviceReset();
302 |         throw std::runtime_error("cuSparse error");
303 |     }
304 | 
305 |     for (int i = 0; i < MAX_STREAMS; i++) {
306 |         CUDA_CALL(cudaStreamCreate(&streams[i]));
307 |     }
308 | 
309 |     CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
310 |     CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
311 |     CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
312 | }
313 | 
314 | GPU_Operations::~GPU_Operations() {
315 |     free(devinfo);
316 |     free(ones);
317 |     for (auto i : buffer_map) {
318 |         free(i.second);
319 |     }
320 |     CUSOLVER_CALL(cusolverDnDestroy(cudense_handle));
321 |     CUBLAS_CALL(cublasDestroy(handle));
322 |     for (int i = 0; i < MAX_STREAMS; i++) {
323 |         CUDA_CALL(cudaStreamSynchronize(streams[i]));
324 |         CUDA_CALL(cudaStreamDestroy(streams[i]));
325 |     }
326 |     CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
327 | }
328 | 
329 | GPU_Operations::SparseMatrix GPU_Operations::create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m){
330 |     SparseMatrix X = {(float*) Xvals, (int*) Xcols, (int*) Xrowptr, n, Xrowptr[n]};
331 |     return X;
332 | }
333 | 
334 | 
335 | float* GPU_Operations::to_device(const float* src, size_t size) const {
336 |     float* dst = 0;
337 |     CUDA_CALL(cudaMalloc(&dst, size));
338 |     CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
339 |     return dst;
340 | }
341 | 
342 | int* GPU_Operations::to_device(const int* src, size_t size) const {
343 |     int* dst = 0;
344 |     CUDA_CALL(cudaMalloc(&dst, size));
345 |     CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
346 |     return dst;
347 | }
348 | 
349 | GPU_Operations::SparseMatrix* GPU_Operations::to_device(const SparseMatrix* src, size_t size) const {
350 |     SparseMatrix* dst = (SparseMatrix*) std::malloc(sizeof(SparseMatrix));
351 | 
352 |     dst->values = to_device(src->values, src->nnz * sizeof(float));
353 |     dst->columns = to_device(src->columns, src->nnz * sizeof(int));
354 |     dst->rowPointers = to_device(src->rowPointers, (src->m + 1) * sizeof(int));
355 |     dst->m = src->m;
356 |     dst->nnz = src->nnz;
357 | 
358 |     return dst;
359 | }
360 | 
361 | void GPU_Operations::fill(float* X, const unsigned size, const float value) const {
362 |     int threads, blocks;
363 |     get_grid_sizes(size, &threads, &blocks);
364 |     fill_eltw<<<blocks, threads>>>(X, size, value);
365 |     assert(!cudaGetLastError());
366 | }
367 | 
368 | void GPU_Operations::dropout(float* X, const unsigned size, const float dropout_rate) const {
369 |     dropout_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X, size, dropout_rate, rng_state);
370 |     assert(!cudaGetLastError());
371 | }
372 | 
373 | void GPU_Operations::add_gauss_noise(float* X, const unsigned size, const float noise_rate) const {
374 |     gauss_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X, size, noise_rate, rng_state);
375 |     assert(!cudaGetLastError());
376 | }
377 | 
378 | void GPU_Operations::add_saltpepper_noise(float* X, const unsigned size, const float noise_rate) const {
379 |     saltpepper_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X, size, noise_rate, rng_state);
380 |     assert(!cudaGetLastError());
381 | }
382 | 
383 | void GPU_Operations::invert(float* X, const unsigned size) const {
384 |     int threads, blocks;
385 |     get_grid_sizes(size, &threads, &blocks);
386 |     invert_eltw<<<blocks, threads>>>(X, size);
387 |     assert(!cudaGetLastError());
388 | }
389 | 
390 | void GPU_Operations::maximum(float* x, const float value, const unsigned size) const {
391 |     int threads, blocks;
392 |     get_grid_sizes(size, &threads, &blocks);
393 |     maximum_eltw<<<blocks, threads>>>(x, value, size);
394 |     assert(!cudaGetLastError());
395 | }
396 | 
397 | void GPU_Operations::leaky_relu(float* x, const float value, const unsigned size) const {
398 |     int threads, blocks;
399 |     get_grid_sizes(size, &threads, &blocks);
400 |     leaky_relu_eltw<<<blocks, threads>>>(x, value, size);
401 |     assert(!cudaGetLastError());
402 | }
403 | 
404 | void GPU_Operations::sigmoid(float* x, const unsigned size) const {
405 |     int threads, blocks;
406 |     get_grid_sizes(size, &threads, &blocks);
407 |     sigmoid_eltw<<<blocks, threads>>>(x, size);
408 |     assert(!cudaGetLastError());
409 | }
410 | 
411 | void GPU_Operations::tanh(float* x, const unsigned size) const {
412 |     int threads, blocks;
413 |     get_grid_sizes(size, &threads, &blocks);
414 |     tanh_eltw<<<blocks, threads>>>(x, size);
415 |     assert(!cudaGetLastError());
416 | }
417 | 
418 | void GPU_Operations::soft_threshold(float* x, const float alpha, const int size) const {
419 |     int threads, blocks;
420 |     get_grid_sizes(size, &threads, &blocks);
421 |     softthreshold_eltw<<<blocks, threads>>>(x, alpha, size);
422 |     assert(!cudaGetLastError());
423 | }
424 | 
425 | void GPU_Operations::fill_eye(float* X, unsigned n) const {
426 |     memset(X, 0, n * n * sizeof(float));
427 |     axpy(n, 1.0f, ones, 0, X, n + 1);
428 | }
429 | 
430 | void GPU_Operations::calculate_column_variance(const float* X, const unsigned nrows, const unsigned ncols,
431 |         float* variance) const {
432 |     int threads, blocks;
433 |     get_grid_sizes(ncols, &threads, &blocks);
434 |     col_variance_kernel<<<blocks, threads>>>(X, variance, nrows, ncols);
435 | }
436 | 
437 | void GPU_Operations::invsqrt(float* s, const unsigned n) const {
438 |     int t, b;
439 |     get_grid_sizes(n, &t, &b);
440 |     invsqrt_eltw<<<b, t>>>(s, n);
441 | }
442 | 
443 | void GPU_Operations::scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const {
444 | 
445 |     int threads, blocks;
446 |     get_grid_sizes(ncols * nrows, &threads, &blocks);
447 |     scale_columns_kernel<<<blocks, threads>>>(X, s, nrows, ncols);
448 | }
449 | 
450 | void GPU_Operations::scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const {
451 |     int threads, blocks;
452 |     get_grid_sizes(ncols * nrows, &threads, &blocks);
453 |     scale_rows_kernel<<<blocks, threads>>>(X, s, nrows, ncols);
454 | }
455 | 
456 | void GPU_Operations::subtract_first_element(int* a, unsigned len) const {
457 |     int threads, blocks;
458 |     get_grid_sizes(len, &threads, &blocks);
459 |     subtract_first_kernel<<<blocks, threads>>>(a, len);
460 | }
461 | 
462 | void GPU_Operations::calculate_column_variance(const SparseMatrix* X, const unsigned nrows, const unsigned ncols,
463 |         float* variance) {
464 |     int threads, blocks;
465 |     SparseMatrix* x_transpose = transpose(X, ncols);
466 |     get_grid_sizes(nrows, &threads, &blocks);
467 |     sparse_row_variance_kernel<<<blocks, threads>>>(*x_transpose, variance, ncols, nrows);
468 |     free(x_transpose->columns);
469 |     free(x_transpose->values);
470 |     free(x_transpose->rowPointers);
471 |     std::free(x_transpose);
472 | 
473 | }
474 | 
475 | void GPU_Operations::scale_columns(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const {
476 | 
477 |     int threads, blocks;
478 |     get_grid_sizes(X->nnz, &threads, &blocks);
479 |     sparse_scale_columns_kernel<<<blocks, threads>>>(*X, s, nrows, ncols);
480 | }
481 | 
482 | void GPU_Operations::scale_rows(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const {
483 |     int threads, blocks;
484 |     get_grid_sizes(X->m, &threads, &blocks);
485 |     sparse_scale_rows_kernel<<<blocks, threads>>>(*X, s);
486 | }
487 | 
488 | void GPU_Operations::dropout(SparseMatrix* X, const unsigned size, const float dropout_rate) const {
489 |     dropout_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X->values, size, dropout_rate, rng_state);
490 |     assert(!cudaGetLastError());
491 | }
492 | 
493 | void GPU_Operations::add_gauss_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const {
494 |     gauss_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X->values, size, noise_rate, rng_state);
495 |     assert(!cudaGetLastError());
496 | }
497 | 
498 | void GPU_Operations::add_saltpepper_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const {
499 |     saltpepper_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X->values, size, noise_rate, rng_state);
500 |     assert(!cudaGetLastError());
501 | }
502 | 
503 | void GPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha,
504 |         const SparseMatrix* a, const int lda, const float *b, const int ldb, const float beta, float *c,
505 |         const int ldc) {
506 |     cusparseOperation_t opA = op_to_cusparse(transa);
507 |     cusparseOperation_t opB = op_to_cusparse(transb);
508 | 
509 |     SparseMatrix* row_major_a = transpose(a, opA != CUSPARSE_OPERATION_NON_TRANSPOSE ? k : m);
510 | 
511 |     int ncol_a = k;
512 |     if (opA != CUSPARSE_OPERATION_NON_TRANSPOSE) {
513 |         ncol_a = a->m;
514 |     }
515 | 
516 |     CUSPARSE_CALL(cusparseScsrmm2(cusparse_handle, opA, opB, row_major_a->m, n, ncol_a,
517 |         row_major_a->nnz, &alpha, descr, row_major_a->values, row_major_a->rowPointers, row_major_a->columns, b, ldb, &beta, c, ldc));
518 | 
519 | 
520 |     free(row_major_a->columns);
521 |     free(row_major_a->values);
522 |     free(row_major_a->rowPointers);
523 |     std::free(row_major_a);
524 | }
525 | 
526 | /*void GPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k,
527 |         const float alpha, const float *a, const int lda, const SparseMatrix* b, const int ldb,
528 |         const float beta, float *c, const int ldc) {
529 |     cusparseOperation_t opA = op_to_cusparse(transa);
530 |     cusparseOperation_t opB = op_to_cusparse(transb);
531 |     SparseMatrix* b_trans;
532 | 
533 |     if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) {
534 |         b_trans = transpose(b, n);
535 |     } else {
536 |         b_trans = (SparseMatrix*) std::malloc(sizeof(SparseMatrix));
537 |         b_trans->values = b->values;
538 |         b_trans->columns = b->columns;
539 |         b_trans->rowPointers = b->rowPointers;
540 |         b_trans->m = b->m;
541 |         b_trans->nnz = b->nnz;
542 |     }
543 | 
544 |     int m_a = m; // number of rows of A
545 |     int n_a = k; // number of columns of A
546 |     if (opA != CUSPARSE_OPERATION_NON_TRANSPOSE) {
547 |         m_a = k;
548 |         n_a = m;
549 |     }
550 | 
551 |     int bufsize;
552 |     CUSPARSE_CALL(cusparseSgemvi_bufferSize(cusparse_handle, opA, m_a, n_a, b_trans->nnz, &bufsize));
553 |     void* buffer = get_buffer(bufsize);
554 | 
555 |     int* row_pointers = (int*) std::malloc((b_trans->m + 1) * sizeof(int));
556 |     copy_to_host(b_trans->rowPointers, row_pointers, (b_trans->m + 1) * sizeof(int));
557 | 
558 |     for(unsigned r = 0; r < b_trans->m; ++r) {
559 |         int row_pointer = row_pointers[r];
560 |         int nnz = row_pointers[r + 1] - row_pointer;
561 | 
562 |         set_stream(r);
563 | 
564 |         if (nnz == 0) {
565 |             CUBLAS_CALL(cublasSscal_v2(handle, n, &beta, &c[r * ldc], 1));
566 |         } else if (nnz > 0) {
567 |             CUSPARSE_CALL(cusparseSgemvi(cusparse_handle, opA, m_a, n_a, &alpha, a, lda, nnz,
568 |                 &b_trans->values[row_pointer], &b_trans->columns[row_pointer], &beta, &c[r * ldc], CUSPARSE_INDEX_BASE_ZERO,
buffer)); 569 | } else { 570 | printf("Internal error"); 571 | exit(1); 572 | } 573 | } 574 | 575 | synchronize_all_streams(); 576 | default_stream(); 577 | 578 | if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) { 579 | free(b_trans->values); 580 | free(b_trans->columns); 581 | free(b_trans->rowPointers); 582 | } 583 | std::free(b_trans); 584 | std::free(row_pointers); 585 | }*/ 586 | 587 | void GPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, 588 | const float alpha, const float *a, const int lda, const SparseMatrix* b, const int ldb, 589 | const float beta, float *c, const int ldc) { 590 | cusparseOperation_t opA = op_to_cusparse(transa); 591 | cusparseOperation_t opB = op_to_cusparse(transb); 592 | SparseMatrix* b2; 593 | float alpha_t = 1.0f; 594 | float beta_t = 0.0f; 595 | 596 | //3) 597 | int b2_ncol = 0; 598 | if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) { 599 | b2 = transpose(b, n); 600 | b2_ncol = b->m; 601 | } else { 602 | b2 = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 603 | b2->values = b->values; 604 | b2->columns = b->columns; 605 | b2->rowPointers = b->rowPointers; 606 | b2->m = b->m; 607 | b2->nnz = b->nnz; 608 | b2_ncol = k; 609 | } 610 | //4) 611 | float* c2 = (float*) get_buffer(m*n * sizeof(float)); 612 | memcpy(c2, c, m*n * sizeof(float)); 613 | if (beta != 0.0f) { 614 | CUBLAS_CALL(cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, n, m, &alpha_t, c, ldc, &beta_t, NULL, 0, c2, ldc)); 615 | } 616 | 617 | // 4.5 618 | cusparseOperation_t opA2; 619 | if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 620 | opA2 = CUSPARSE_OPERATION_TRANSPOSE; 621 | } else { 622 | opA2 = CUSPARSE_OPERATION_NON_TRANSPOSE; 623 | } 624 | 625 | //5) 626 | CUSPARSE_CALL(cusparseScsrmm2(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, opA2, b2->m, m, b2_ncol, b2->nnz, &alpha, descr, 627 | b2->values, b2->rowPointers, b2->columns, a, lda, &beta, c2, b2->m)); 628 | 629 | //6 630 | CUBLAS_CALL(cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha_t, c2, b2->m, &beta_t, (float*)0, b2->m, c, ldc)); 631 | if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) { 632 | free(b2->columns); 633 | free(b2->values); 634 | free(b2->rowPointers); 635 | } 636 | std::free(b2); 637 | 638 | } 639 | 640 | // Debugging 641 | void GPU_Operations::printMatrixRM(const float* a, int n, int m, const char* fmt) const { 642 | const char* format = fmt == 0 ? "%1.3f " : fmt; 643 | size_t size = n * m * sizeof(float); 644 | float* tmp = (float*) std::malloc(size); 645 | CUDA_CALL(cudaMemcpy(tmp, a, size, cudaMemcpyDeviceToHost)); 646 | for (int i = 0; i < n; ++i) { 647 | for (int j = 0; j < m; ++j) 648 | printf(format, tmp[i * m + j]); 649 | printf("\n"); 650 | } 651 | printf("\n"); 652 | std::free(tmp); 653 | } 654 | 655 | void GPU_Operations::printMatrixCM(const float* a, int n, int m, const char* fmt) const { 656 | const char* format = fmt == 0 ? "%1.3f " : fmt; 657 | size_t size = n * m * sizeof(float); 658 | float* tmp = (float*) std::malloc(size); 659 | CUDA_CALL(cudaMemcpy(tmp, a, size, cudaMemcpyDeviceToHost)); 660 | for (int i = 0; i < n; ++i) { 661 | for (int j = 0; j < m; ++j) 662 | printf(format, tmp[i + j * n]); 663 | printf("\n"); 664 | } 665 | printf("\n"); 666 | std::free(tmp); 667 | } 668 | 669 | void GPU_Operations::printMatrixSP(const SparseMatrix *a, const char* fmt) const { 670 | const char* format = fmt == 0 ? 
"%1.3f " : fmt; 671 | size_t size_values = a->nnz * sizeof(float); 672 | size_t size_columns = a->nnz * sizeof(int); 673 | size_t size_pointers = (a->m + 1)* sizeof(int); 674 | 675 | float* tmp_vals = (float*) std::malloc(size_values); 676 | int* tmp_cols = (int*) std::malloc(size_columns); 677 | int* tmp_pointers = (int*) std::malloc(size_pointers); 678 | 679 | CUDA_CALL(cudaMemcpy(tmp_vals, a->values, size_values, cudaMemcpyDeviceToHost)); 680 | CUDA_CALL(cudaMemcpy(tmp_cols, a->columns, size_columns, cudaMemcpyDeviceToHost)); 681 | CUDA_CALL(cudaMemcpy(tmp_pointers, a->rowPointers, size_pointers, cudaMemcpyDeviceToHost)); 682 | 683 | printf("values: "); 684 | for (int i = 0; i < a->nnz; i++) { 685 | printf(format, tmp_vals[i]); 686 | } 687 | printf("\npointers: "); 688 | for (int i = 0; i < a->m + 1; i++) { 689 | printf("%d ", tmp_pointers[i]); 690 | } 691 | printf("\ncolumns: "); 692 | for (int i = 0; i < a->nnz; i++) { 693 | printf("%d ", tmp_cols[i]); 694 | } 695 | printf("\n"); 696 | std::free(tmp_vals); 697 | std::free(tmp_cols); 698 | std::free(tmp_pointers); 699 | } 700 | 701 | void GPU_Operations::printMatrixSPM(const SparseMatrix *a, int n, int m, const char* fmt) const { 702 | const char* format = fmt == 0 ? "%1.3f " : fmt; 703 | size_t size_values = a->nnz * sizeof(float); 704 | size_t size_columns = a->nnz * sizeof(int); 705 | size_t size_pointers = (a->m + 1)* sizeof(int); 706 | 707 | float* tmp_vals = (float*) std::malloc(size_values); 708 | int* tmp_cols = (int*) std::malloc(size_columns); 709 | int* tmp_pointers = (int*) std::malloc(size_pointers); 710 | 711 | CUDA_CALL(cudaMemcpy(tmp_vals, a->values, size_values, cudaMemcpyDeviceToHost)); 712 | CUDA_CALL(cudaMemcpy(tmp_cols, a->columns, size_columns, cudaMemcpyDeviceToHost)); 713 | CUDA_CALL(cudaMemcpy(tmp_pointers, a->rowPointers, size_pointers, cudaMemcpyDeviceToHost)); 714 | 715 | for (int i = 0; i < n; i++) { 716 | int rowPointer = tmp_pointers[i]; 717 | int nnz = tmp_pointers[i + 1] - rowPointer; 718 | int found = 0; 719 | for (int j = 0; j < m; j++) { 720 | if (found < nnz) { 721 | if (j == tmp_cols[rowPointer + found]) { 722 | printf(format, tmp_vals[rowPointer + found]); 723 | found++; 724 | } else { 725 | printf(format, 0.0f); 726 | } 727 | } else { 728 | printf(format, 0.0f); 729 | } 730 | } 731 | printf("\n"); 732 | } 733 | printf("\n"); 734 | std::free(tmp_vals); 735 | std::free(tmp_cols); 736 | std::free(tmp_pointers); 737 | } 738 | --------------------------------------------------------------------------------