├── .gitignore ├── tests ├── test_runner.cpp ├── testbin.c └── tests.cpp ├── LICENSE.txt ├── Makefile ├── blas_enum.h ├── README.md ├── librfn.h ├── basic_python_implementation.py ├── cpu_operations.cpp ├── rfn.py ├── cpu_operations.h ├── blas_sparse_proto.h ├── librfn.cpp ├── gpu_operations.h └── gpu_operations.cu /.gitignore: -------------------------------------------------------------------------------- 1 | RFN/* 2 | *.o 3 | *.so 4 | *.npy 5 | *.pyc 6 | *.log 7 | *.pkl 8 | *.mp4 9 | .ipynb_checkpoints 10 | -------------------------------------------------------------------------------- /tests/test_runner.cpp: -------------------------------------------------------------------------------- 1 | #define CATCH_CONFIG_MAIN 2 | #include "catch.hpp" 3 | 4 | 5 | /* As explained in 6 | * https://github.com/philsquared/Catch/blob/master/docs/slow-compiles.md 7 | * This file is only here to speed up compilation 8 | * by keeping the runner-implementation in its own file 9 | */ 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | librfn: An implementation of Rectified Factor Networks 2 | Copyright (C) 2014-2017 Thomas Unterthiner 3 | Additional contributions by Thomas Adler, Balázs Bencze 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along 16 | with this program; if not, write to the Free Software Foundation, Inc., 17 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
18 | -------------------------------------------------------------------------------- /tests/testbin.c: -------------------------------------------------------------------------------- 1 | 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | #include <string.h> 5 | #include <math.h> 6 | #include <sys/time.h> 7 | #include "../librfn.h" 8 | 9 | #ifndef M_PI 10 | #define M_PI 3.14159265358979323846 11 | #endif 12 | 13 | 14 | // random in [0, 1) 15 | static double rand_unif(void) { 16 | return (rand())/(RAND_MAX+1.0); 17 | } 18 | /* 19 | // generates random samples from a 0/1 Gaussian via Box-Muller 20 | static double rand_normal(void) { 21 | return sqrt(-2.0*log(rand_unif())) * cos(2.0*M_PI*rand_unif()); 22 | } 23 | */ 24 | 25 | float time_diff(struct timeval *t2, struct timeval *t1) { 26 | long int diff = (t2->tv_usec + 1000000 * t2->tv_sec) - (t1->tv_usec + 1000000 * t1->tv_sec); 27 | return diff / 1000000.0f; 28 | } 29 | 30 | 31 | 32 | int main(int argc, char** argv) { 33 | int n = 50000; 34 | int m = 784; 35 | int k = 2048; 36 | int n_iter = 10; 37 | int gpu_id = -1; 38 | 39 | if (argc > 1) 40 | k = atoi(argv[1]); 41 | 42 | if (argc > 2) 43 | n_iter = atoi(argv[2]); 44 | 45 | if (argc > 3) 46 | m = atoi(argv[3]); 47 | 48 | if (argc > 4) 49 | gpu_id = atoi(argv[4]); 50 | 51 | 52 | float* X = (float*) malloc(n*m*sizeof(float)); 53 | float* W = (float*) malloc(n*k*sizeof(float)); 54 | float* P = (float*) malloc(m*sizeof(float)); 55 | 56 | for (int i = 0; i < n*m; ++i) 57 | X[i] = 5.0f* rand_unif() - 0.5; 58 | for (int i = 0; i < n*k; ++i) 59 | W[i] = rand_unif() - 0.5; 60 | 61 | struct timeval t0, t1; 62 | gettimeofday(&t0, 0); 63 | train_gpu(X, W, P, n, m, k, n_iter, 0.1, 0.1, 1e-2, 0.0, 0.0, 32, gpu_id); 64 | //train_cpu(X, W, P, n, m, k, n_iter, 0.1, 0.1, 1e-2, 0.0, 0.0, 32); 65 | gettimeofday(&t1, 0); 66 | printf("time for rfn: %3.4fs\n", time_diff(&t1, &t0)); 67 | free(X); 68 | free(W); 69 | free(P); 70 | return 0; 71 | } 72 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | NVCC := nvcc -std=c++11 2 | CC := gcc -std=c99 3 | CXX := g++ -std=c++11 4 | LINK := g++ 5 | USEGPU = yes 6 | DEBUG = no 7 | 8 | 9 | # BLAS/LAPACK implementation 10 | # Change these lines to switch to another BLAS/LAPACK implementation (e.g. MKL) 11 | LIBS=-llapack -lblas 12 | #INCPATH=-I/opt/intel/mkl/include/ 13 | #LIBS=-L/opt/intel/mkl/lib/intel64/ -lmkl_rt 14 | 15 | 16 | GPUINC=-I/usr/local/cuda/include 17 | GPULIB=-L/usr/local/cuda/lib64 18 | GPUSO=-lcublas -lcurand -lcuda -lcudart -lcusolver -lgomp -lcusparse 19 | 20 | 21 | ifeq ($(DEBUG), no) 22 | CFLAGS=-O3 -DNDEBUG -Wall -fPIC -march=native 23 | LDFLAGS=-O3 -flto -Wall -fPIC 24 | else 25 | CFLAGS=-g -Wall -fPIC -march=native $(INCPATH) 26 | LDFLAGS= -g -Wall -fPIC $(LIBPATH) $(LIBS) 27 | endif 28 | 29 | ifeq ($(USEGPU),yes) 30 | INCPATH+=$(GPUINC) 31 | LIBS+=$(GPULIB) $(GPUSO) 32 | else 33 | CFLAGS+=-DNOGPU 34 | endif 35 | 36 | CFLAGS+=$(INCPATH) 37 | LDFLAGS+=$(LIBPATH) $(LIBS) 38 | CXXFLAGS=$(CFLAGS) 39 | 40 | 41 | # add/remove GPU architectures as required 42 | NVCCFLAGS=--use_fast_math $(addprefix -Xcompiler , $(CXXFLAGS)) \ 43 | -gencode arch=compute_30,code=sm_35 \ 44 | -gencode arch=compute_50,code=sm_50 \ 45 | -gencode arch=compute_52,code=sm_52 \ 46 | -gencode arch=compute_61,code=sm_61 47 | 48 | 49 | SOURCES=librfn.cpp cpu_operations.cpp nist_spblas.cc 50 | OBJECTS=librfn.o cpu_operations.o nist_spblas.o 51 | 52 | ifeq ($(USEGPU),yes) 53 | SOURCES+=gpu_operations.cu 54 |
OBJECTS+=gpu_operations.o 55 | endif 56 | 57 | all: $(SOURCES) librfn.so 58 | 59 | test: gpu_operations.o cpu_operations.o tests/tests.o tests/test_runner.o 60 | g++ $(LDFLAGS) $^ -o $@ $(LIBS) 61 | ./test 62 | 63 | testbin: librfn.so tests/testbin.o 64 | gcc tests/testbin.o -o testbin $(LIBPATH) $(LDFLAGS) -L./ -lrfn 65 | 66 | librfn.so: $(OBJECTS) 67 | $(CXX) $(LDFLAGS) $^ -o $@ $(LIBS) -shared 68 | 69 | gpu_operations.o: gpu_operations.cu 70 | $(NVCC) $(NVCCFLAGS) -o $@ -c $< 71 | 72 | clean: 73 | rm -rf *.o librfn.so tests/*.o 74 | -------------------------------------------------------------------------------- /blas_enum.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAS_ENUM_H 2 | #define BLAS_ENUM_H 3 | 4 | /* Enumerated types */ 5 | 6 | enum blas_order_type { 7 | blas_rowmajor = 101, 8 | blas_colmajor = 102 }; 9 | 10 | enum blas_trans_type { 11 | blas_no_trans = 111, 12 | blas_trans = 112, 13 | blas_conj_trans = 113 }; 14 | 15 | enum blas_uplo_type { 16 | blas_upper = 121, 17 | blas_lower = 122 }; 18 | 19 | enum blas_diag_type { 20 | blas_non_unit_diag = 131, 21 | blas_unit_diag = 132 }; 22 | 23 | enum blas_side_type { 24 | blas_left_side = 141, 25 | blas_right_side = 142 }; 26 | 27 | enum blas_cmach_type { 28 | blas_base = 151, 29 | blas_t = 152, 30 | blas_rnd = 153, 31 | blas_ieee = 154, 32 | blas_emin = 155, 33 | blas_emax = 156, 34 | blas_eps = 157, 35 | blas_prec = 158, 36 | blas_underflow = 159, 37 | blas_overflow = 160, 38 | blas_sfmin = 161}; 39 | 40 | enum blas_norm_type { 41 | blas_one_norm = 171, 42 | blas_real_one_norm = 172, 43 | blas_two_norm = 173, 44 | blas_frobenius_norm = 174, 45 | blas_inf_norm = 175, 46 | blas_real_inf_norm = 176, 47 | blas_max_norm = 177, 48 | blas_real_max_norm = 178 }; 49 | 50 | enum blas_sort_type { 51 | blas_increasing_order = 181, 52 | blas_decreasing_order = 182 }; 53 | 54 | enum blas_conj_type { 55 | blas_conj = 191, 56 | blas_no_conj = 192 }; 57 | 58 | enum blas_jrot_type { 59 | blas_jrot_inner = 201, 60 | blas_jrot_outer = 202, 61 | blas_jrot_sorted = 203 }; 62 | 63 | enum blas_prec_type { 64 | blas_prec_single = 211, 65 | blas_prec_double = 212, 66 | blas_prec_indigenous = 213, 67 | blas_prec_extra = 214 }; 68 | 69 | enum blas_base_type { 70 | blas_zero_base = 221, 71 | blas_one_base = 222 }; 72 | 73 | enum blas_symmetry_type { 74 | blas_general = 231, 75 | blas_symmetric = 232, 76 | blas_hermitian = 233, 77 | blas_triangular = 234, 78 | blas_lower_triangular = 235, 79 | blas_upper_triangular = 236, 80 | blas_lower_symmetric = 237, 81 | blas_upper_symmetric = 238, 82 | blas_lower_hermitian = 239, 83 | blas_upper_hermitian = 240 }; 84 | 85 | enum blas_field_type { 86 | blas_complex = 241, 87 | blas_real = 242, 88 | blas_double_precision = 243, 89 | blas_single_precision = 244 }; 90 | 91 | enum blas_size_type { 92 | blas_num_rows = 251, 93 | blas_num_cols = 252, 94 | blas_num_nonzeros = 253 }; 95 | 96 | enum blas_handle_type{ 97 | blas_invalid_handle = 261, 98 | blas_new_handle = 262, 99 | blas_open_handle = 263, 100 | blas_valid_handle = 264}; 101 | 102 | enum blas_sparsity_optimization_type { 103 | blas_regular = 271, 104 | blas_irregular = 272, 105 | blas_block = 273, 106 | blas_unassembled = 274 }; 107 | 108 | #endif 109 | /* BLAS_ENUM_H */ 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # librfn: Rectified Factor Networks 2 | 3 | Rectified Factor 
Networks (RFNs) are an unsupervised technique that learns a non-linear, high-dimensional representation of its input. The underlying algorithm has been published in 4 | 5 | *Rectified Factor Networks*, Djork-Arné Clevert, Andreas Mayr, Thomas Unterthiner, Sepp Hochreiter, NIPS 2015. 6 | 7 | librfn is implemented in C++ and can easily be integrated into existing code bases. It also contains a high-level Python wrapper for ease of use. The library can run in either CPU or GPU mode. For larger models, the GPU mode offers substantial speedups and is the recommended mode. 8 | 9 | 10 | # Installation 11 | 12 | 1. (optional) Adjust the Makefile to your needs 13 | 2. Type `make` to start the building process 14 | 3. To use the python wrapper, just copy `rfn.py` and `librfn.so` into your working directory. 15 | 16 | 17 | # Requirements 18 | To run the GPU code, you need a CUDA 7.5 (or higher) compatible GPU. While in theory CUDA 7.0 is also supported, it contains a bug that results in a memory leak when running librfn (and your program is likely to crash with an out-of-memory error). 19 | 20 | If you do not have access to a GPU, you can disable GPU support by setting `USEGPU = no` in the Makefile. 21 | 22 | Note that librfn makes heavy use of BLAS and LAPACK, so make sure to link against a high-quality implementation (e.g. OpenBLAS or MKL) by modifying the Makefile to get optimal speed. 23 | 24 | 25 | # Usage 26 | 27 | The following code trains an RFN on MNIST and plots the resulting filters: 28 | 29 |     import numpy as np 30 |     import matplotlib.pyplot as plt 31 | 32 |     from sklearn.datasets import fetch_mldata 33 |     mnist = fetch_mldata('MNIST original') 34 |     X = mnist['data'] / 255.0 35 | 36 |     from rfn import * 37 |     W, P, Wout = train_rfn(X, 128, 500, 0.1, 0.1, 1e-1, 0.0, gpu_id=0) 38 | 39 |     # plot weights 40 |     fig, ax = plt.subplots(5, 5, figsize=(8, 8)) 41 |     for i, a in enumerate(ax.flat): 42 |         a.pcolorfast(W[i].reshape(28, 28), cmap=plt.cm.Greys_r) 43 |         a.set_ylim(28, 0) 44 |         a.grid("off") 45 |         a.set_axis_off() 46 |     fig.subplots_adjust(0, 0, 1, 1, 0, 0) 47 |     fig 48 | 49 |     # calculate hidden units and reconstructions 50 |     H = np.maximum(0, np.dot(Wout, X.T)) 51 |     R = np.dot(H.T, W) 52 | 53 |     # plot reconstructions 54 |     np.random.shuffle(R)  # shuffle samples, otherwise we only plot 0s 55 |     fig, ax = plt.subplots(5, 5, figsize=(8, 8)) 56 |     for i, a in enumerate(ax.flat): 57 |         a.pcolorfast(R[i].reshape(28, 28), cmap=plt.cm.Greys_r) 58 |         a.set_ylim(28, 0) 59 |         a.grid("off") 60 |         a.set_axis_off() 61 |     fig.subplots_adjust(0, 0, 1, 1, 0, 0) 62 |     fig 63 | 64 | 65 | # Implementation Note 66 | 67 | The RFN algorithm is based on the EM algorithm. Within the E-step, the published algorithm includes a projection procedure that can be implemented in several ways (see the RFN paper's supplemental section 9). To make sure no optimization constraints are violated during this projection, the original publication tries the simplest method first and falls back to increasingly complicated updates if the simpler methods fail (suppl. section 9.5.3). 68 | In contrast, librfn always uses the simplest/fastest projection method. This is a simplification/approximation of the original algorithm that nevertheless works very well in practice. 69 | 70 | 71 | # License 72 | librfn was developed by Thomas Unterthiner and is licensed under the [General Public License (GPL) Version 2 or higher](http://www.gnu.org/licenses/gpl-2.0.html). See ``LICENSE.txt`` for details.
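
`rfn.py` additionally exposes a scikit-learn style wrapper class, `RectifiedFactorNetwork`. A minimal sketch of its use (the hyperparameter values below are illustrative only):

    from rfn import RectifiedFactorNetwork

    rfn = RectifiedFactorNetwork(n_hidden=128, n_iter=500, gpu_id="cpu")
    rfn.fit(X)
    H = rfn.transform(X)           # hidden activations
    R = rfn.inverse_transform(H)   # reconstructions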
73 | -------------------------------------------------------------------------------- /librfn.h: -------------------------------------------------------------------------------- 1 | #ifndef LIBRFN_H 2 | #define LIBRFN_H 3 | 4 | /* 5 | Copyright © 2015-2017 Thomas Unterthiner 6 | Additional Contributions by Thomas Adler, Balázs Bencze 7 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 8 | */ 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | 14 | 15 | const int USE_CPU = 2; 16 | 17 | 18 | /** 19 | * Trains an RFN network. 20 | * 21 | * Note: All arguments are assumed to be in C-order (ie., row-major) 22 | * and in host (ie., CPU) memory. 23 | * If necessary, any transfers from and to the GPU will be 24 | * done internally by the function itself. 25 | * 26 | * @param X [n, m] data matrix, with 1 sample per row 27 | * @param W [k, m] weight matrix, expected to be pre-initialized 28 | * @param P [m, ] vector, used to store Psi 29 | * @param n number of samples 30 | * @param m number of input features 31 | * @param k number of hidden units 32 | * @param n_iter number of iterations the algorithm will run 33 | * @param learnrate learnrate 34 | * @param dropout_rate the dropout rate for hidden activations 35 | * @param input_dropout_rate the dropout rate for input units 36 | * @param seed seed for the random number generation 37 | * @param gpu_id ID of the GPU that this will run on 38 | * If this is -1 use the GPU with the most free memory 39 | * If this is -2, the CPU is used instead of the GPU 40 | * 41 | * @return 0 on success, 1 otherwise. The trained network will be stored 42 | * in the W_host and P_host variables. 43 | */ 44 | int train_rfn(const float* X, float* W, float* P, const int n, 45 | const int m, const int k, const int n_iter, int batch_size, 46 | const float etaW, const float etaP, const float minP, const float h_threshold, 47 | const float dropout_rate, const float input_noise_rate, 48 | const float l2_weightdecay, const float l1_weightdecay, 49 | const float momentum, 50 | const int noise_type, const int activation_type, const int apply_scaling, 51 | const int applyNewtonUpdate, unsigned long seed, int gpu_id); 52 | 53 | 54 | /** 55 | * Trains an RFN network. 56 | * The parameters are the same as in `int train_rfn`, except that X is encoded 57 | * as a sparse matrix in CSR format. 58 | * 59 | * Note: the number of nonzero elements of X should be stored in Xrowptr[n] 60 | */ 61 | int train_rfn_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 62 | float* W, float* P, const int n, 63 | const int m, const int k, const int n_iter, int batch_size, 64 | const float etaW, const float etaP, const float minP, const float h_threshold, 65 | const float dropout_rate, const float input_noise_rate, 66 | const float l2_weightdecay, const float l1_weightdecay, 67 | const float momentum, 68 | const int noise_type, const int activation_type, const int apply_scaling, 69 | const int applyNewtonUpdate, unsigned long seed, int gpu_id); 70 | 71 | /** 72 | * Given a trained RFN, this will calculate the weights that are used to 73 | * estimate the hidden activations. 74 | * 75 | * This needs access to the training data, as the W need to incorporate 76 | * the scaling that would otherwise be done on the hidden activations. 77 | * The scaling parameters have to be fitted on the training data's H. 78 | * 79 | * Note: All arguments are assumed to be in C-order (ie., row-major) 80 | * and in host (ie., CPU) memory. 
Any necessary transfers from and to the GPU 81 | * will be done internally by the function itself. 82 | * 83 | * @param X [n, m] training data matrix, with 1 sample per row 84 | * @param W [k, m] RFN weight matrix 85 | * @param P [m] vector, contains Psi 86 | * @param Wout [k, m] output weight matrix 87 | * @param n number of training samples 88 | * @param m number of input features 89 | * @param k number of hidden units 90 | * @param gpu_id ID of the GPU that this will run on 91 | * If this is -1 use the GPU with the most free memory 92 | * If this is -2, the CPU is used instead of the GPU 93 | */ 94 | void calculate_W(const float* X, const float* W, const float* P, float* Wout, 95 | const int n, const int m, const int k, 96 | const int activation_type, const int apply_scaling, const float h_threshold, 97 | int gpu_id); 98 | 99 | /** 100 | * Given a trained RFN, this will calculate the weights that are used to 101 | * estimate the hidden activations. 102 | * 103 | * The parameters are the same as in `void calculate_W`, except that X is encoded 104 | * as a sparse matrix in CSR format. 105 | * 106 | * Note: the number of nonzero elements of X should be stored in Xrowptr[n] 107 | */ 108 | void calculate_W_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 109 | const float* W, const float* P, float* Wout, 110 | const int n, const int m, const int k, 111 | const int activation_type, const int apply_scaling, const float h_threshold, 112 | int gpu_id); 113 | 114 | #ifdef __cplusplus 115 | } 116 | #endif 117 | 118 | #endif /* LIBRFN_H */ 119 | -------------------------------------------------------------------------------- /basic_python_implementation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | """ 3 | Implements the RFN algorithm as easily understandable code. 4 | 5 | Copyright © 2015 Thomas Unterthiner 6 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 7 | 8 | Contains a very basic CPU and a GPU implementation that is easy to understand. 9 | This code is meant as an instructional ressource, and not suited for production 10 | runs. 11 | 12 | The GPU implementation assumes that scikits.cuda.linalg works properly 13 | (which in turn requires CULA). Also, this requires the current development 14 | version of scikits.cuda (as of 2014-08-11). 
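
A minimal usage sketch for the CPU variant (hyperparameter values are
illustrative only; X is assumed to be a 2d float32 numpy array):

    W, H, P = train_rfn_cpu(X, n_hidden=64, n_iter=100,
                            learnrateW=0.1, learnratePsi=0.1,
                            dropout_rate=0.0)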
15 | """ 16 | 17 | import time 18 | import numpy as np 19 | from scikits.cuda import linalg as la 20 | import pycuda.curandom as curand 21 | import pycuda.gpuarray as gpu 22 | import pycuda.elementwise as el 23 | import pycuda.driver as drv 24 | 25 | from pycuda.compiler import SourceModule 26 | from pycuda.tools import DeviceMemoryPool 27 | from scikits.cuda.cublas import cublasSgemv 28 | from pycuda.elementwise import ElementwiseKernel 29 | from pycuda import cumath 30 | 31 | _dropout_kernel = None 32 | _saltpepper_kernel = None 33 | _rng_state = None 34 | _rng_blocks = 128 35 | _rng_threads = 128 36 | 37 | _mempool = DeviceMemoryPool() 38 | 39 | def init_rng(seed): 40 | global _dropout_kernel, _saltpepper_kernel, _rng_state, _rng_threads, _rng_blocks 41 | from pycuda.characterize import sizeof 42 | ds = sizeof("curandState", "#include ") 43 | _rng_state = drv.mem_alloc(_rng_threads * _rng_blocks * ds) 44 | 45 | src = SourceModule( 46 | ''' 47 | #include 48 | 49 | extern "C" 50 | { 51 | __global__ void setup_rng(curandState* rng_state, const unsigned seed) 52 | { 53 | const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x; 54 | curand_init(seed, tid, 0, &rng_state[tid]); 55 | } 56 | 57 | __global__ void dropout_eltw(float* x, const unsigned size, 58 | float dropout_rate, 59 | curandState* rng_state) { 60 | const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x; 61 | const unsigned num_threads = gridDim.x*blockDim.x; 62 | curandState localState = rng_state[tid]; 63 | for (unsigned i = tid; i < size; i += num_threads) 64 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i]; 65 | rng_state[tid] = localState; 66 | } 67 | 68 | __global__ void saltpepper_eltw(float* x, const unsigned size, 69 | float dropout_rate, 70 | curandState* rng_state) { 71 | const unsigned tid = blockIdx.x*blockDim.x+threadIdx.x; 72 | const unsigned num_threads = gridDim.x*blockDim.x; 73 | curandState localState = rng_state[tid]; 74 | for (unsigned i = tid; i < size; i += num_threads) 75 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i]; 76 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 1.0 : x[i]; 77 | rng_state[tid] = localState; 78 | } 79 | } 80 | ''', no_extern_c=True) 81 | setup_rng = src.get_function("setup_rng") 82 | setup_rng.prepare("Pi") 83 | setup_rng.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1), 84 | _rng_state, np.uint32(seed)) 85 | _dropout_kernel = src.get_function("dropout_eltw") 86 | _dropout_kernel.prepare("PifP") 87 | _saltpepper_kernel = src.get_function("saltpepper_eltw") 88 | _saltpepper_kernel.prepare("PifP") 89 | 90 | 91 | def dropout(X, dropout_rate): 92 | _dropout_kernel.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1), 93 | X.gpudata, np.prod(X.shape), np.float32(dropout_rate), _rng_state) 94 | return X 95 | 96 | 97 | def saltpepper_noise(X, dropout_rate): 98 | _saltpepper_kernel.prepared_call((_rng_threads, 1, 1), (_rng_blocks, 1, 1), 99 | X.gpudata, np.prod(X.shape), np.float32(dropout_rate), _rng_state) 100 | return X 101 | 102 | _unitvariance_step1_kernel = ElementwiseKernel( 103 | "float* X, float* mean, float* Xsq, const unsigned height", 104 | "float tmp = X[i] - mean[i % height]; Xsq[i] = tmp*tmp;") 105 | 106 | _unitvariance_step2_kernel = ElementwiseKernel( 107 | "float* work1, const unsigned k", 108 | "work1[i] = (work1[i] > 1e-7) ? 
rsqrtf(work1[i]) : 1.0;") 109 | 110 | _unitvariance_step3_kernel = ElementwiseKernel( 111 |     "float* X, float* mean, const unsigned height", 112 |     "X[i] *= mean[i % height];") 113 | 114 | def to_unit_variance(H): 115 |     ''' Scales H so that each column has a variance of 1. ''' 116 |     from scikits.cuda.misc import _global_cublas_handle as cublas_handle 117 |     ones = gpu.empty((H.shape[0], 1), np.float32, allocator=_mempool.allocate) 118 |     ones.fill(1.0) 119 |     Hsq = gpu.empty(H.shape, np.float32, allocator=_mempool.allocate) 120 |     mean = gpu.empty((1, H.shape[1]), np.float32, allocator=_mempool.allocate) 121 |     cublasSgemv(cublas_handle, "n", H.shape[1], H.shape[0], 122 |                 1.0/H.shape[0], H.gpudata, H.shape[1], ones.gpudata, 123 |                 1, 0.0, mean.gpudata, 1) 124 |     _unitvariance_step1_kernel(H, mean, Hsq, H.shape[1]) 125 |     cublasSgemv(cublas_handle, "n", Hsq.shape[1], H.shape[0], 126 |                 1.0/H.shape[0], Hsq.gpudata, H.shape[1], ones.gpudata, 127 |                 1, 0.0, mean.gpudata, 1) 128 |     _unitvariance_step2_kernel(mean, H.shape[1]) 129 |     _unitvariance_step3_kernel(H, mean, H.shape[1]) 130 |     return H 131 | 132 | 133 | def calculate_H_gpu(X, W, P): 134 |     WPW = la.add_diag(P, la.dot(W, W, "t", "n")) 135 |     tmp = la.dot(W, la.inv(WPW, overwrite=True)) 136 |     H = la.dot(X, tmp, "n", "t") 137 |     H = gpu.maximum(H, 0) 138 |     H = to_unit_variance(H) 139 |     return H, tmp 140 | 141 | 142 | def train_rfn_gpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate, input_dropout_rate, minPsi=0.1, seed=32): 143 |     k = n_hidden 144 |     n, m = X.shape 145 |     W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32) 146 |     P = np.array([0.1] * m, dtype=np.float32) 147 |     XXdiag = np.diag(np.dot(X.T, X) / n).copy()  # explicit copy to avoid numpy 1.8 warning 148 |     W = gpu.to_gpu(W, allocator=_mempool.allocate) 149 |     P = gpu.to_gpu(P, allocator=_mempool.allocate) 150 |     X = gpu.to_gpu(X, allocator=_mempool.allocate) 151 |     XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate) 152 |     I = la.eye(k, dtype=np.float32) 153 | 154 |     init_rng(seed) 155 |     t0 = time.time() 156 |     for cur_iter in range(n_iter): 157 |         H, tmp = calculate_H_gpu(X, W, P) 158 |         if dropout_rate > 0: 159 |             dropout(H, dropout_rate) 160 |         Xtmp = X 161 |         if input_dropout_rate > 0: 162 |             Xtmp = X.copy() 163 |             saltpepper_noise(Xtmp, input_dropout_rate) 164 |         U = la.dot(Xtmp, H, "t", "n") / n 165 |         S = la.dot(H, H, "t", "n") / n 166 |         S += I 167 |         S -= la.dot(tmp, W, "n", "t") 168 |         Cii = la.dot(la.dot(W, S, "t") - 2*U, W) 169 | 170 |         Sinv = la.inv(S, overwrite=True) 171 |         dW = la.dot(Sinv, U, "n", "t") - W 172 |         dP = XXdiag + la.diag(Cii) - P 173 | 174 |         W += learnrateW * dW 175 |         P += learnratePsi * dP 176 | 177 |         P = gpu.maximum(P, minPsi) 178 |         if cur_iter % 25 == 0: 179 |             print "iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0) 180 |     return W.get(), P.get() 181 | 182 | 183 | def train_rfn_cpu(X, n_hidden, n_iter, learnrateW, learnratePsi, dropout_rate): 184 |     n, m = X.shape 185 |     k = n_hidden 186 |     W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32) 187 |     P = np.array([0.1] * m) 188 |     H = np.zeros((k, n), dtype=np.float32) 189 |     C = np.dot(X.T, X) / n 190 | 191 |     t0 = time.time() 192 |     for cur_iter in range(n_iter): 193 |         I = np.eye(k, dtype=np.float32) 194 |         tmp = I + np.dot(W * 1.0/P, W.T) 195 |         tmp = np.linalg.inv(tmp) 196 |         Wout = np.dot(tmp, W) * (1.0/P) 197 |         H = np.dot(Wout, X.T) 198 | 199 |         H = np.maximum(0, H) 200 |         H /= (H.std(1) + 1e-9)[:, None] 201 |         if dropout_rate > 0: 202 |             H *= np.random.binomial(1, 1-dropout_rate,
size=H.shape).astype(np.float32) 203 | 204 | U = np.dot(X.T, H.T) / n 205 | S = (np.dot(H, H.T) + tmp) / n 206 | 207 | dW = np.dot(np.linalg.inv(S), U.T) - W 208 | Cii = C - np.dot(-2*U + np.dot(W.T, S), W) 209 | dP = np.diag(Cii) - P 210 | 211 | W += learnrateW * dW 212 | P += learnratePsi * dP 213 | 214 | P = np.maximum(P, 0.1) 215 | 216 | if cur_iter % 25 == 0: 217 | print "iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0) 218 | return W, H, P 219 | -------------------------------------------------------------------------------- /tests/tests.cpp: -------------------------------------------------------------------------------- 1 | #include "catch.hpp" 2 | #include "../cpu_operations.h" 3 | #include "../gpu_operations.h" 4 | #include 5 | 6 | #include 7 | float time_diff(struct timeval *t2, struct timeval *t1) { 8 | long int diff = (t2->tv_usec + 1000000 * t2->tv_sec) - (t1->tv_usec + 1000000 * t1->tv_sec); 9 | return diff / 1000000.0f; 10 | } 11 | 12 | 13 | 14 | using namespace std; 15 | 16 | TEST_CASE( "to_host_and_to_device", "[gpu]" ) { 17 | GPU_Operations op(6, 6, 6, 0, -1); 18 | float X_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; 19 | float* X_d = op.to_device(X_h, sizeof(X_h)); 20 | 21 | float* X2_h = (float*) malloc(sizeof(X_h)); 22 | op.copy_to_host(X_d, X2_h, sizeof(X_h)); 23 | for (size_t i = 0; i < sizeof(X_h)/sizeof(X_h[0]); ++i) { 24 | CHECK(X_h[i] == X2_h[i]); 25 | } 26 | free(X2_h); 27 | op.free(X_d); 28 | } 29 | 30 | 31 | template 32 | float* test_variance(OP& op, float* X, unsigned nrows, unsigned ncols, float* expected) { 33 | float* var = (float*) op.malloc(ncols*sizeof(X[0])); 34 | op.calculate_column_variance(X, nrows, ncols, var); 35 | float* res = (float*) malloc(ncols*sizeof(X[0])); 36 | op.copy_to_host(var, res, ncols*sizeof(var[0])); 37 | for (size_t i = 0; i < 3; ++i) { 38 | CHECK(res[i] == expected[i]); 39 | } 40 | free(res); 41 | return var; 42 | } 43 | 44 | 45 | TEST_CASE( "Calculate Variance", "[operations]" ) { 46 | GPU_Operations gpu_op(512, 512, 512, 0, -1); 47 | CPU_Operations cpu_op(512, 512, 512, 0, -1); 48 | float X_h[] = {1.0, 2.0, 3.0, 49 | 4.0, 6.0, 10.0}; 50 | float expected[] = {2.25, 4, 12.25}; 51 | float* res_h = test_variance(cpu_op, X_h, 2, 3, expected); 52 | cpu_op.free(res_h); 53 | float* X_d = gpu_op.to_device(X_h, sizeof(X_h)); 54 | float* res_d = test_variance(gpu_op, X_d, 2, 3, expected); 55 | gpu_op.free(res_d); 56 | gpu_op.free(X_d); 57 | } 58 | 59 | 60 | // the pointer-to-memberfunction thingy is pretty ugly :( 61 | template 62 | float* test_scale(OP& op, 63 | void (OP::*scalefunc)(float*, unsigned int, unsigned int, float*) const, 64 | float* X, float* s, unsigned nrows, unsigned ncols, float* expected) { 65 | float* scale = op.to_device(s, ncols*sizeof(X[0])); 66 | (op.*scalefunc)(X, nrows, ncols, scale); 67 | float* res = (float*) malloc(ncols*nrows*sizeof(X[0])); 68 | op.copy_to_host(X, res, ncols*nrows*sizeof(X[0])); 69 | for (size_t i = 0; i < nrows*ncols; ++i) { 70 | CHECK(expected[i] == res[i]); 71 | } 72 | free(res); 73 | return 0; 74 | } 75 | 76 | 77 | TEST_CASE( "Scale columns CPU", "[operations]" ) { 78 | CPU_Operations op(6, 6, 6, 0, -1); 79 | float X_h[] = {1.0, 2.0, 3.0, 80 | 4.0, 6.0, 10.0}; 81 | float s_h[] = {1.0, 2.0, 3.0}; 82 | float Exp_h[] = {1.0, 4.0, 9.0, 83 | 4.0, 12.0, 30.0}; 84 | test_scale(op, &CPU_Operations::scale_columns, X_h, s_h, 2, 3, Exp_h); 85 | } 86 | 87 | 88 | TEST_CASE( "Scale columns GPU", "[operations]" ) { 89 | GPU_Operations op(6, 6, 6, 0, -1); 90 | float X_h[] = {1.0, 2.0, 3.0, 91 | 
4.0, 6.0, 10.0}; 92 | float s_h[] = {1.0, 2.0, 3.0}; 93 | float Exp_h[] = {1.0, 4.0, 9.0, 94 | 4.0, 12.0, 30.0}; 95 | float* X_d = op.to_device(X_h, sizeof(X_h)); 96 | test_scale(op, &GPU_Operations::scale_columns, X_d, s_h, 2, 3, Exp_h); 97 | op.free(X_d); 98 | } 99 | 100 | 101 | TEST_CASE( "Scale rows CPU", "[operations]" ) { 102 | CPU_Operations op(6, 6, 6, 0, -1); 103 | float X_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 104 | 4.0, 6.0, 10.0, 1.0, 1.5}; 105 | float s_h[] = {2.0, 4.0}; 106 | float Exp_h[] = { 2.0, 4.0, 6.0, 8.0, 10.0, 107 | 16.0, 24.0, 40.0, 4.0, 6.0}; 108 | test_scale(op, &CPU_Operations::scale_rows, X_h, s_h, 2, 5, Exp_h); 109 | } 110 | 111 | 112 | TEST_CASE( "Scale rows GPU", "[operations]" ) { 113 | GPU_Operations op(6, 6, 6, 0, -1); 114 | float X_h[] = {1.0, 2.0, 3.0, 4.0, 5.0, 115 | 4.0, 6.0, 10.0, 1.0, 1.5}; 116 | float s_h[] = {2.0, 4.0}; 117 | float Exp_h[] = { 2.0, 4.0, 6.0, 8.0, 10.0, 118 | 16.0, 24.0, 40.0, 4.0, 6.0}; 119 | float* X_d = op.to_device(X_h, sizeof(X_h)); 120 | test_scale(op, &GPU_Operations::scale_rows, X_d, s_h, 2, 5, Exp_h); 121 | op.free(X_d); 122 | } 123 | 124 | 125 | TEST_CASE( "invsqrt cpu", "[operations]" ) { 126 | CPU_Operations op(6, 6, 6, 0, -1); 127 | float x_h[] = {0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 128 | float e_h[] = {1.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 129 | int n = sizeof(x_h) / sizeof(x_h[0]); 130 | for (int i = 1; i < n; ++i) 131 | e_h[i] = 1.0f / sqrt(x_h[i]); 132 | op.invsqrt(x_h, n); 133 | for (size_t i = 0; i < 3; ++i) { 134 | CHECK(abs(x_h[i] - e_h[i]) < 1e-3); 135 | } 136 | } 137 | 138 | 139 | TEST_CASE( "invsqrt gpu", "[operations]" ) { 140 | GPU_Operations op(6, 6, 6, 0, -1); 141 | float x_h[] = {0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 142 | float e_h[] = {1.0, 1.0, 2.0, 3.0, 4.0, 6.0, 10.0}; 143 | int n = sizeof(x_h) / sizeof(x_h[0]); 144 | for (int i = 1; i < n; ++i) 145 | e_h[i] = 1.0f / sqrt(x_h[i]); 146 | float* x_d = op.to_device(x_h, sizeof(x_h)); 147 | op.invsqrt(x_d, n); 148 | float* res = (float*) malloc(n*sizeof(x_h[0])); 149 | op.copy_to_host(x_d, res, n*sizeof(x_h[0])); 150 | for (size_t i = 0; i < 3; ++i) { 151 | CHECK(abs(res[i] - e_h[i]) < 1e-3); 152 | } 153 | op.free(x_d); 154 | } 155 | 156 | 157 | TEST_CASE( "filleye cpu", "[operations]" ) { 158 | unsigned n = 10; 159 | CPU_Operations op(n, n, n, 0, -1); 160 | float* x = op.malloc(n*n*sizeof(float)); 161 | op.fill_eye(x, 10); 162 | double s = 0.0; 163 | for (unsigned i = 0; i < n; ++i) { 164 | for (unsigned j = 0; j < n; ++j) { 165 | if (i == j) { 166 | CHECK(x[i*n+j] == 1.0); 167 | } else { 168 | s += abs(x[i*n+j]); 169 | } 170 | } 171 | } 172 | CHECK(s == 0.0); 173 | op.free(x); 174 | } 175 | 176 | 177 | TEST_CASE( "filleye gpu", "[operations]" ) { 178 | unsigned n = 10; 179 | CPU_Operations cpu_op(n, n, n, 0, -1); 180 | GPU_Operations op(n, n, n, 0, -1); 181 | float* x_d = op.malloc(n*n*sizeof(float)); 182 | op.fill_eye(x_d, 10); 183 | float *x = cpu_op.malloc(n*n*sizeof(float)); 184 | op.copy_to_host(x_d, x, n*n*sizeof(float)); 185 | double s = 0.0; 186 | for (unsigned i = 0; i < n; ++i) { 187 | for (unsigned j = 0; j < n; ++j) { 188 | if (i == j) { 189 | CHECK(x[i*n+j] == 1.0); 190 | } else { 191 | s += abs(x[i*n+j]); 192 | } 193 | } 194 | } 195 | CHECK(s == 0.0); 196 | op.free(x_d); 197 | } 198 | 199 | 200 | TEST_CASE( "Variance of CPU/GPU on large matrices", "[cpu_vs_gpu]" ) { 201 | unsigned n = 428; 202 | unsigned m = 554; 203 | CPU_Operations cpu_op(m, n, m, 0, -1); 204 | GPU_Operations gpu_op(m, n, m, 0, -1); 205 | 206 | float* X_h = 
cpu_op.malloc(n*m*sizeof(float)); 207 | for (unsigned i = 0; i < n*m; ++i) { 208 | X_h[i] = 10*((rand()+1.0)/(RAND_MAX+1.0)) - 5.0; 209 | } 210 | float *X_d = gpu_op.to_device(X_h, n*m*sizeof(float)); 211 | 212 | float* var_h = cpu_op.malloc(m*sizeof(float)); 213 | float* var_d = gpu_op.malloc(m*sizeof(float)); 214 | cpu_op.calculate_column_variance(X_h, n, m, var_h); 215 | gpu_op.calculate_column_variance(X_d, n, m, var_d); 216 | float* var_gpu_h = cpu_op.malloc(m*sizeof(float)); 217 | gpu_op.to_host(var_d, var_gpu_h, m*sizeof(float)); 218 | 219 | for (unsigned i = 0; i < m; ++i) 220 | CHECK(abs(var_h[i] - var_gpu_h[i]) < 1e-3); 221 | cpu_op.free(var_h); 222 | cpu_op.free(var_gpu_h); 223 | } 224 | 225 | 226 | 227 | TEST_CASE( "dgmm CPU/GPU", "[operations]" ) { 228 | unsigned n = 10; 229 | unsigned k = 10; 230 | unsigned m = 12; 231 | CPU_Operations cpu_op(m, n, k, 0, -1); 232 | GPU_Operations gpu_op(m, n, k, 0, -1); 233 | float* xh = cpu_op.malloc(m*k*sizeof(float)); 234 | float* ah = cpu_op.malloc(m*sizeof(float)); 235 | float* ch = cpu_op.malloc(m*k*sizeof(float)); 236 | for (int i = 0; i < m*n; ++i) 237 | xh[i] = 10* (rand() / RAND_MAX); 238 | for (int i = 0; i < n; ++i) 239 | ah[i] = 50* (rand() / RAND_MAX); 240 | cpu_op.dgmm("l", m, k, xh, m, ah, 1, ch, m); 241 | 242 | float* xd = gpu_op.to_device(xh, m*k*sizeof(float)); 243 | float* ad = gpu_op.to_device(ah, m*sizeof(float)); 244 | float* cd = gpu_op.to_device(ch, m*k*sizeof(float)); 245 | gpu_op.dgmm("l", m, k, xd, m, ad, 1, cd, m); 246 | 247 | float* dh = cpu_op.malloc(m*k*sizeof(float)); 248 | gpu_op.copy_to_host(cd, dh, m*k*sizeof(float)); 249 | for (unsigned i = 0; i < m*k; ++i) { 250 | CHECK(ch[i] == dh[i]); 251 | } 252 | } 253 | -------------------------------------------------------------------------------- /cpu_operations.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include "cpu_operations.h" 8 | #include 9 | 10 | 11 | /* This is the interface for RFN's sparse matrix operations. 12 | * If you want to use the generic implementation, compile nist_spblas.cc, 13 | * If you want to use the MKL, compile mkl_sparse_impl.cpp and link to MKL. 
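 *
 * For orientation, a small CSR example (illustrative only): the 2x3 matrix
 * [[1, 0, 2], [0, 3, 0]] is stored as val = {1, 2, 3}, col = {0, 2, 1} and
 * rowptr = {0, 2, 3} (zero-based, as produced by scipy's CSR format), and is
 * wrapped via suscr_csr(2, 3, val, col, rowptr); rowptr[nrows] holds the
 * number of nonzero entries.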
*/ 14 | 15 | CPU_Operations::SparseMatrix create(int row, int col); /* empty */ 16 | CPU_Operations::SparseMatrix suscr_csr(int m, int n, float *val, int *col, int *ptr); /* from csr */ 17 | void destroy(CPU_Operations::SparseMatrix A); 18 | 19 | /* select row subset */ 20 | CPU_Operations::SparseMatrix srowsubset(CPU_Operations::SparseMatrix A, int first_row, int nrow); /* allocates new matrix */ 21 | 22 | /* column means and variances */ 23 | void scolmeans(CPU_Operations::SparseMatrix A, float *means); 24 | void scolvars(CPU_Operations::SparseMatrix A, float *vars); 25 | 26 | /* scale rows/cols */ 27 | void sscalecols(CPU_Operations::SparseMatrix A, float *s); 28 | void sscalerows(CPU_Operations::SparseMatrix A, float *s); 29 | 30 | /* set element (set to zero will delete entry) */ 31 | void ssetelement( CPU_Operations::SparseMatrix A, int row, int col, float val ); 32 | void ssetelement( CPU_Operations::SparseMatrix A, int idx, float val ); 33 | 34 | /* get element reference */ 35 | float &sgetelement( CPU_Operations::SparseMatrix A, int row, int col); 36 | float &sgetelement( CPU_Operations::SparseMatrix A, int idx ); 37 | 38 | /* get element pointer */ 39 | float *sgetelementp( CPU_Operations::SparseMatrix A, int row, int col ); 40 | float *sgetelementp( CPU_Operations::SparseMatrix A, int idx ); 41 | 42 | /* sgemm routines with sparse matrix being lhs (A) or rhs (B) of the product */ 43 | void susgemm(char sidea, char transa, char transb, int nohs, const float &alpha, CPU_Operations::SparseMatrix A, 44 | const float *B, int ldB, const float &beta, float *C, int ldC); 45 | 46 | /* checks whether A is a valid handle */ 47 | bool handle_valid(CPU_Operations::SparseMatrix A); 48 | 49 | /* debug */ 50 | namespace NIST_SPBLAS 51 | {void print(int A);} 52 | 53 | 54 | using std::max; 55 | 56 | //float* CPU_Operations::ones = 0; 57 | 58 | CPU_Operations::CPU_Operations(const int m, const int n, const int k, 59 | unsigned long seed, int gpu_id) { 60 | srand(seed); 61 | int maxsize = max(max(n, m), k); 62 | ones = malloc(maxsize*sizeof(float)); 63 | for (int i = 0; i < maxsize; ++i) 64 | ones[i] = 1.0f; 65 | 66 | var_tmp = malloc(maxsize*sizeof(float)); 67 | } 68 | 69 | 70 | CPU_Operations::~CPU_Operations() { 71 | free(ones); 72 | free(var_tmp); 73 | } 74 | 75 | CPU_Operations::SparseMatrix CPU_Operations::create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m){ 76 | return suscr_csr(n, m, (float*) Xvals, (int*) Xcols, (int*) Xrowptr); 77 | } 78 | 79 | CPU_Operations::SparseMatrix CPU_Operations::get_batch(SparseMatrix X, int ldx, int batch_num, int batch_size) { 80 | return srowsubset(X, batch_num * batch_size, batch_size); 81 | } 82 | 83 | void CPU_Operations::scale_rows(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const { 84 | sscalerows(X, s); 85 | } 86 | 87 | 88 | static void colmeans(const float* X, float* means, const int nrows, const int ncols) { 89 | memset(means, 0, ncols*sizeof(float)); 90 | for (int i = 0; i < nrows; ++i) { 91 | for (int j = 0; j < ncols; ++j) { 92 | means[j] += X[i*ncols+j]; 93 | } 94 | } 95 | for (int j = 0; j < ncols; ++j) 96 | means[j] /= nrows; 97 | } 98 | 99 | 100 | void CPU_Operations::dgmm(const char* mode, const int m, const int n, const float* A, 101 | int lda, const float* x, int incx, float* C, int ldc) const { 102 | if (mode[0] == 'l' || mode[0] == 'L') { 103 | for (int i = 0; i < n; ++i) { 104 | for (int j = 0; j < m; ++j) 105 | C[i*ldc+j] = A[i*lda+j] * x[j]; 106 | } 107 | } else { 108 | 
for (int i = 0; i < n; ++i) { 109 | for (int j = 0; j < m; ++j) 110 | C[i*ldc+j] = A[i*lda+j] * x[i]; 111 | } 112 | } 113 | } 114 | 115 | 116 | void CPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 117 | const SparseMatrix a, const int lda, const float *b, const int ldb, const float beta, float *c, 118 | const int ldc) const { 119 | /* The gemm interface is understood as a column-major routine. The sparse implementation, 120 | * however, is row-major, so we need to compute B^T * A^T = C^T instead of A * B = C. The 121 | * transposition is implicitly performed by A, B and C being column-major. */ 122 | susgemm('r', transa[0], transb[0], n, alpha, a, b, ldb, beta, c, ldc); 123 | } 124 | 125 | void CPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 126 | const float *a, const int lda, const SparseMatrix b, const int ldb, const float beta, float *c, 127 | const int ldc) const { 128 | susgemm('l', transb[0], transa[0], m, alpha, b, a, lda, beta, c, ldc); 129 | } 130 | 131 | CPU_Operations::SparseMatrix CPU_Operations::memcpy_matrix(SparseMatrix &dest, SparseMatrix src, int nrows_to_copy, int src_ncol, int first_row = 0) const { 132 | free(dest); 133 | return dest = srowsubset(src, first_row, nrows_to_copy); 134 | } 135 | 136 | void CPU_Operations::free(SparseMatrix a) const { 137 | if (handle_valid(a)) 138 | destroy(a); 139 | } 140 | 141 | CPU_Operations::SparseMatrix CPU_Operations::malloc_matrix(int rows, int cols, SparseMatrix dummy) { 142 | return create(rows, cols); 143 | } 144 | 145 | void CPU_Operations::calculate_column_variance(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* variances) { 146 | memset(variances, 0, ncols * sizeof(float)); 147 | scolvars(X, variances); 148 | } 149 | 150 | void CPU_Operations::scale_columns(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const { 151 | sscalecols(X, s); 152 | } 153 | 154 | void CPU_Operations::dropout(SparseMatrix X, const unsigned size, const float dropout_rate) const { 155 | assert(0.0f <= dropout_rate && dropout_rate <= 1.0f); 156 | for (unsigned i = 0; i < size; ++i) 157 | /* TODO: write a routine sgetlement that leaves X const */ 158 | if (rand_unif() < dropout_rate) { 159 | float *v = sgetelementp(X, i); 160 | 161 | if (v != NULL) 162 | *v = 0.f; 163 | } 164 | } 165 | 166 | void CPU_Operations::add_saltpepper_noise(SparseMatrix X, const unsigned size, const float noise_rate) const { 167 | assert(0.0f <= noise_rate && noise_rate <= 1.0f); 168 | for (unsigned i = 0; i < size; ++i) { 169 | if (rand_unif() < noise_rate) { 170 | float *v = sgetelementp(X, i); 171 | 172 | if (v != NULL) 173 | *v = (rand_unif() < 0.5 ? 
0.0f : 1.0f); 174 | } 175 | } 176 | } 177 | 178 | /* gauss noise makes no sense on sparse matrices */ 179 | void CPU_Operations::add_gauss_noise(SparseMatrix X, const unsigned size, const float noise_rate) const { 180 | assert(0.0 <= noise_rate); 181 | for (unsigned i = 0; i < size; ++i) { 182 | float *v = sgetelementp(X, i); 183 | 184 | if (v != NULL) 185 | *v += rand_normal() * noise_rate; 186 | } 187 | } 188 | 189 | 190 | void CPU_Operations::calculate_column_variance(const float* X, const unsigned nrows, 191 | const unsigned ncols, float* variances) { 192 | colmeans(X, var_tmp, nrows, ncols); 193 | memset(variances, 0, ncols*sizeof(float)); 194 | for (unsigned i = 0; i < nrows; ++i) { 195 | for (unsigned j = 0; j < ncols; ++j) { 196 | const float x = X[i*ncols+j] - var_tmp[j]; 197 | variances[j] += x*x; 198 | } 199 | } 200 | 201 | for (unsigned j = 0; j < ncols; ++j) { 202 | variances[j] /= nrows; 203 | } 204 | } 205 | 206 | 207 | void CPU_Operations::invsqrt(float* s, const unsigned n) const { 208 | for (unsigned j = 0; j < n; ++j) { 209 | if (s[j] == 0) 210 | s[j] = 1.0f; 211 | else 212 | s[j] = 1.0 / sqrtf(s[j]); 213 | } 214 | } 215 | 216 | void CPU_Operations::scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const { 217 | for (unsigned i = 0; i < nrows; ++i) { 218 | for (unsigned j = 0; j < ncols; ++j) { 219 | X[i*ncols+j] *= s[j]; 220 | } 221 | } 222 | } 223 | 224 | void CPU_Operations::scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const { 225 | for (unsigned i = 0; i < nrows; ++i) { 226 | for (unsigned j = 0; j < ncols; ++j) { 227 | X[i*ncols+j] *= s[i]; 228 | } 229 | } 230 | } 231 | 232 | 233 | /// Prints a column major matrix. 234 | void CPU_Operations::printMatrixCM(const float* a, int n, int m, const char* fmt) { 235 | const char* format = fmt == 0 ? "%1.3f " : fmt; 236 | for (int i = 0; i < n; ++i) { 237 | for (int j =0 ; j < m; ++j) 238 | printf(format, a[i + j*n]); 239 | printf("\n"); 240 | } 241 | printf("\n"); 242 | } 243 | 244 | 245 | /// Prints a row-major matrix 246 | void CPU_Operations::printMatrixRM(const float* a, int n, int m, const char* fmt) { 247 | const char* format = fmt == 0 ? "%1.3f " : fmt; 248 | for (int i = 0; i < n; ++i) { 249 | for (int j =0 ; j < m; ++j) 250 | printf(format, a[i*m + j]); 251 | printf("\n"); 252 | } 253 | } 254 | 255 | void CPU_Operations::printMatrixCM(const SparseMatrix a, int n, int m, const char *fmt) { 256 | NIST_SPBLAS::print(a); 257 | } 258 | 259 | void CPU_Operations::printMatrixRM(const SparseMatrix a, int n, int m, const char *fmt) { 260 | NIST_SPBLAS::print(a); 261 | } 262 | -------------------------------------------------------------------------------- /rfn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | ''' 3 | Python wrapper for librfn. 
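
A minimal usage sketch (hyperparameter values are illustrative only; X is an
(n_samples, n_features) array):

    import numpy as np
    from rfn import train_rfn

    W, Psi, Wout = train_rfn(X, n_hidden=128, n_iter=500, etaW=0.1, etaP=0.1,
                             minP=1e-2, dropout_rate=0.0, gpu_id="cpu")
    H = np.maximum(0, np.dot(Wout, X.T))   # hidden activations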
4 | 5 | Copyright © 2015-2017 Thomas Unterthiner 6 | Additional Contributions by Thomas Adler, Balázs Bencze 7 | 8 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 9 | ''' 10 | 11 | import os 12 | import time 13 | import ctypes as ct 14 | import numpy as np 15 | import matplotlib.pyplot as plt 16 | import warnings 17 | from scipy import sparse 18 | 19 | 20 | import sys 21 | if sys.version_info < (3,): 22 | range = xrange 23 | 24 | 25 | _curdir = os.path.dirname(os.path.realpath(__file__)) 26 | _librfn = ct.cdll.LoadLibrary(os.path.join(_curdir, 'librfn.so')) 27 | _default_gpu_id = -1 28 | _use_cpu_id = -2 29 | 30 | _librfn.calculate_W.argtypes = [ 31 | np.ctypeslib.ndpointer(np.float32), 32 | np.ctypeslib.ndpointer(np.float32), 33 | np.ctypeslib.ndpointer(np.float32), 34 | np.ctypeslib.ndpointer(np.float32), 35 | ct.c_int, ct.c_int, ct.c_int, 36 | ct.c_int, ct.c_int, ct.c_float, 37 | ct.c_int] 38 | 39 | 40 | _librfn.train_rfn.restype = ct.c_int 41 | _librfn.train_rfn.argtypes = [ 42 | np.ctypeslib.ndpointer(np.float32), 43 | np.ctypeslib.ndpointer(np.float32), 44 | np.ctypeslib.ndpointer(np.float32), 45 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 46 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, 47 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, ct.c_float, 48 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 49 | ct.c_int 50 | ] 51 | 52 | 53 | _librfn.calculate_W_sparse.argtypes = [ 54 | np.ctypeslib.ndpointer(np.float32), 55 | np.ctypeslib.ndpointer(np.int32), 56 | np.ctypeslib.ndpointer(np.int32), 57 | np.ctypeslib.ndpointer(np.float32), 58 | np.ctypeslib.ndpointer(np.float32), 59 | np.ctypeslib.ndpointer(np.float32), 60 | ct.c_int, ct.c_int, ct.c_int, 61 | ct.c_int, ct.c_int, ct.c_float, 62 | ct.c_int] 63 | 64 | 65 | _librfn.train_rfn_sparse.restype = ct.c_int 66 | _librfn.train_rfn_sparse.argtypes = [ 67 | np.ctypeslib.ndpointer(np.float32), 68 | np.ctypeslib.ndpointer(np.int32), 69 | np.ctypeslib.ndpointer(np.int32), 70 | np.ctypeslib.ndpointer(np.float32), 71 | np.ctypeslib.ndpointer(np.float32), 72 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 73 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, 74 | ct.c_float, ct.c_float, ct.c_float, ct.c_float, ct.c_float, 75 | ct.c_int, ct.c_int, ct.c_int, ct.c_int, ct.c_int, 76 | ct.c_int 77 | ] 78 | 79 | 80 | _input_noise_types = {"dropout": 1, "saltpepper": 2, "gaussian": 3} 81 | _activation_types = {"linear": 0, "relu": 1, "leaky": 2, "sigmoid": 3, "tanh": 4} 82 | 83 | def train_rfn(X, n_hidden, n_iter, etaW, etaP, minP, dropout_rate, 84 | input_noise_rate=0.0, startP=0.1, startW=None, 85 | l2_weightdecay=0.0, l1_weightdecay=0.0, 86 | input_noise_type="saltpepper", activation="relu", 87 | h_threshold=0.0, momentum=0.0, applyNewtonUpdate=True, 88 | batch_size=-1, seed=None, gpu_id="default"): 89 | '''Trains a Rectified Factor Network (RFN). 90 | 91 | Trains an RFN as explained in 92 | "Rectified Factor Networks", Clevert et al., NIPS 2015 93 | 94 | Parameters 95 | ---------- 96 | X : array-like, shape = (n_samples, n_features) 97 | Input samples 98 | 99 | n_hidden : int 100 | Number of latent variables to estimate 101 | 102 | n_iter : int 103 | Number of iterations to run the algorithm 104 | 105 | etaW : float 106 | Learning rate of the W parameter 107 | 108 | etaP : float 109 | Learning rate of the Psi parameter 110 | (It's probably save to set this to the same value as etaW) 111 | 112 | minP : float 113 | Minimal value for Psi. 
Should be in 1e-8 - 1e-1 114 | 115 | dropout_rate : float in [0, 1) 116 | Dropout rate for the latent variables 117 | 118 | input_noise_rate : float 119 | Noise/dropout rate for input variables 120 | 121 | startW : array-like, shape = (n_hidden, n_features) 122 | Optional pre-initialized weights parameters. Useful if one wants to 123 | continue training of an old result. 124 | 125 | l2_weightdecay : float 126 | L2 penalty for weight decay 127 | 128 | l2_weightdecay : float 129 | L1 penalty for weight decay 130 | 131 | input_noise_type : one of 'dropout', 'saltpepper' or 'gaussian' 132 | Type of input noise 133 | 134 | activation : one of ('linear', 'relu', 'leaky', 'sigmoid', 'tanh') 135 | Activation function for hidden/latent variables. 136 | 137 | h_threshold : float 138 | Threshhold for rectifying/leaky activations 139 | 140 | momentum : float 141 | Momentum term for learning 142 | 143 | applyNewtonUpdate : boolean 144 | Whether to use a Newton update (default) or a Gradient Descent step. 145 | 146 | batch_size : int 147 | If > 2, this will activate mini-batch learning instead of full 148 | batch learning. 149 | 150 | seed : int 151 | Seed for the random number generator 152 | 153 | gpu_id : int or "cpu" 154 | ID of the gpu device to use. If set to "cpu", the calculations will 155 | be performed on the CPU instead. 156 | 157 | 158 | Returns 159 | ------- 160 | A tuple of three elements: 161 | 162 | W : array-like, shape = (n_hidden, n_features) 163 | The weight matrix W used in the paper, used to transform the 164 | hidden/latent variables back to visibles. 165 | Psi : array-like, shape = (n_features, ) 166 | Variance of each input feature dimension (Psi in the paper's formulas) 167 | Wout : array-like, shape = (n_hidden, n_features) 168 | Weight matrix needed to transform the visible variables back into 169 | hidden variables. 
Normally this is done via 170 | `H = np.maximum(0, np.dot(Wout, X.T))` 171 | ''' 172 | 173 | if seed is None: 174 | # should be different for each call on each process 175 | seed = np.uint32(hash(os.getpid() + time.time()) % 4294967295) 176 | if gpu_id == "default": 177 | gpu_id = _default_gpu_id 178 | elif gpu_id == "cpu": 179 | gpu_id = _use_cpu_id 180 | 181 | rng = np.random.RandomState(seed) 182 | if startW is None: 183 | W = rng.normal(scale=0.01, size=(n_hidden, X.shape[1])).astype(np.float32) 184 | else: 185 | W = startW 186 | if isinstance(startP, np.ndarray): 187 | P = startP 188 | else: 189 | P = np.array([startP] * X.shape[1], dtype=np.float32) 190 | 191 | Wout = np.empty((W.shape[0], W.shape[1]), np.float32) 192 | 193 | if sparse.issparse(X): 194 | X = X.tocsr().astype(np.float32) 195 | _librfn.train_rfn_sparse(X.data, X.indices, X.indptr, W, P, X.shape[0], X.shape[1], n_hidden, n_iter, 196 | batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 197 | l2_weightdecay, l1_weightdecay, momentum, _input_noise_types[input_noise_type], 198 | _activation_types[activation], 1, applyNewtonUpdate, seed, gpu_id) 199 | _librfn.calculate_W_sparse(X.data, X.indices, X.indptr, W, P, Wout, 200 | X.shape[0], X.shape[1], W.shape[0], 201 | _activation_types[activation], 1, h_threshold, 202 | gpu_id) 203 | else: 204 | X = X.astype(np.float32, order="C") 205 | _librfn.train_rfn(X, W, P, X.shape[0], X.shape[1], n_hidden, n_iter, 206 | batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 207 | l2_weightdecay, l1_weightdecay, momentum, _input_noise_types[input_noise_type], 208 | _activation_types[activation], 1, applyNewtonUpdate, seed, gpu_id) 209 | _librfn.calculate_W(X, W, P, Wout, 210 | X.shape[0], X.shape[1], W.shape[0], 211 | _activation_types[activation], 1, h_threshold, 212 | gpu_id) 213 | 214 | return W, P, Wout 215 | 216 | 217 | from sklearn.base import BaseEstimator, TransformerMixin 218 | class RectifiedFactorNetwork(BaseEstimator, TransformerMixin): 219 | '''Implements a sklearn interface for RFN.''' 220 | def __init__(self, n_hidden=128, n_iter=50, etaW=0.1, etaP=0.1, minP=1e-2, dropout_rate=0.0, 221 | input_noise_rate=0.0, startP=0.1, startW=None, 222 | l2_weightdecay=0.0, l1_weightdecay=0.0, 223 | input_noise_type="saltpepper", activation="relu", 224 | h_threshold=0.0, momentum=0.0, applyNewtonUpdate=True, 225 | batch_size=-1, seed=None, gpu_id="default"): 226 | self.n_hidden = n_hidden 227 | self.n_iter = n_iter 228 | self.etaW = etaW 229 | self.etaP = etaP 230 | self.minP = minP 231 | self.dropout_rate = dropout_rate 232 | self.input_noise_rate = input_noise_rate 233 | self.startP = startP 234 | self.startW = startW 235 | self.l2_weightdecay = l2_weightdecay 236 | self.l1_weightdecay = l1_weightdecay 237 | self.input_noise_type = input_noise_type 238 | self.activation = activation 239 | self.h_threshold = h_threshold 240 | self.momentum = momentum 241 | self.applyNewtonUpdate = applyNewtonUpdate 242 | self.batch_size = batch_size 243 | self.seed = seed 244 | self.gpu_id = gpu_id 245 | 246 | def fit(self, x, y=None): 247 | res = train_rfn(x, self.n_hidden, self.n_iter, self.etaW, self.etaP, self.minP, self.dropout_rate, 248 | self.input_noise_rate, self.startP, self.startW, 249 | self.l2_weightdecay, self.l1_weightdecay, 250 | self.input_noise_type, self.activation, 251 | self.h_threshold, self.momentum, self.applyNewtonUpdate, 252 | self.batch_size, self.seed, self.gpu_id) 253 | self.w, self.psi, self.wout = res 254 | return self 255 | 256 | 
def transform(self, x): 257 | h = np.maximum(np.dot(x, self.wout.T), 0) 258 | s = h.std(1) 259 | s[s < 1e-6] = 1 260 | h /= s[:, None] ## TODO: should I really scale the h? 261 | return h 262 | 263 | def inverse_transform(self, h): 264 | r = np.dot(h, self.w) 265 | return r 266 | -------------------------------------------------------------------------------- /cpu_operations.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #ifndef M_PI 14 | #define M_PI 3.14159265358979323846 15 | #endif 16 | 17 | #include /* for typeid */ 18 | 19 | 20 | extern "C" { 21 | extern void sgemm_(const char *transa, const char *transb, const int *m, const int *n, const int *k, const float *alpha, 22 | const float *a, const int *lda, const float *b, const int *ldb, const float *beta, float *c, const int *ldc); 23 | 24 | extern void ssymm_(const char *side, const char *uplo, const int *m, const int *n, const float *alpha, const float *a, 25 | const int *lda, const float *b, const int *ldb, const float *beta, float *c, const int *ldc); 26 | 27 | extern void saxpy_(const int *n, const float *alpha, const float *dx, const int *incx, float *dy, const int *incy); 28 | extern int spotrf_(const char *uplo, int *n, float *a, int * lda, int *info); 29 | extern int spotrs_(const char *uplo, int *n, int *nrhs, float * a, int *lda, float *b, int *ldb, int *info); 30 | extern int sposv_(const char *uplo, int *n, int *nrhs, float * a, int *lda, float *b, int *ldb, int *info); 31 | extern int spotri_(const char *uplo, int *n, float *a, int *lda, int *info); 32 | } 33 | 34 | using std::cos; 35 | using std::log; 36 | using std::sqrt; 37 | 38 | using std::rand; 39 | using std::srand; 40 | 41 | // random in (0, 1] 42 | inline double rand_unif(void) { 43 | return (rand() + 1.0) / (RAND_MAX + 1.0); 44 | } 45 | 46 | // generates random samples from a 0/1 Gaussian via Box-Mueller 47 | inline double rand_normal(void) { 48 | return sqrt(-2.0 * log(rand_unif())) * cos(2.0 * M_PI * rand_unif()); 49 | } 50 | 51 | 52 | inline double rand_exp(double lambda) /* inversion sampling */ 53 | { 54 | return -log(1 - rand_unif()) / lambda; 55 | } 56 | 57 | class CPU_Operations { 58 | float* var_tmp; 59 | 60 | public: 61 | 62 | float* ones; 63 | 64 | typedef int SparseMatrix; 65 | 66 | static SparseMatrix create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m); 67 | 68 | 69 | template 70 | T init_invalid(void) { 71 | return (typeid(T) == typeid(SparseMatrix) ? 
(T) -1 : (T) 0); 72 | } 73 | 74 | CPU_Operations(const int m, const int n, const int k, unsigned long seed, int gpu_id); 75 | ~CPU_Operations(); 76 | 77 | float* to_device(const float* src, const int size) const { 78 | return (float*) src; 79 | } 80 | 81 | SparseMatrix to_device(SparseMatrix src, const int size) const { 82 | return src; 83 | } 84 | 85 | float* to_host(const float* src, float* dest, const int size) const { 86 | return dest; 87 | } 88 | 89 | float* copy_to_host(const float* src, float* dst, size_t size) const { 90 | memcpy(dst, src, size); 91 | return dst; 92 | } 93 | 94 | float* get_batch(const float* X, int ncol, int batch_num, int batch_size) { 95 | /* return pointer */ 96 | return (float*) &X[batch_num * batch_size * ncol]; 97 | } 98 | 99 | SparseMatrix get_batch(SparseMatrix X, int ldx, int batch_num, int batch_size); 100 | 101 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 102 | const float *a, const int lda, const float *b, const int ldb, const float beta, float *c, 103 | const int ldc) const { 104 | sgemm_(transa, transb, &m, &n, &k, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 105 | } 106 | 107 | 108 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 109 | const SparseMatrix a, const int lda, const float *b, const int ldb, const float beta, float *c, 110 | const int ldc) const; 111 | 112 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 113 | const float *a, const int lda, const SparseMatrix b, const int ldb, const float beta, float *c, 114 | const int ldc) const; 115 | 116 | void dgmm(const char* mode, const int m, const int n, const float* A, int lda, const float* x, int incx, float* C, 117 | int ldc) const; 118 | 119 | void symm(const char *side, const char *uplo, const int m, const int n, const float alpha, const float *a, 120 | const int lda, const float *b, const int ldb, const float beta, float *c, const int ldc) const { 121 | ssymm_(side, uplo, &m, &n, &alpha, a, &lda, b, &ldb, &beta, c, &ldc); 122 | } 123 | 124 | void axpy(const int n, const float alpha, const float *dx, const int incx, float *dy, const int incy) const { 125 | saxpy_(&n, &alpha, dx, &incx, dy, &incy); 126 | } 127 | 128 | int posv(const char* uplo, int n, int nrhs, float* a, int lda, float* b, int ldb) const { 129 | int info; 130 | int retval = sposv_(uplo, &n, &nrhs, a, &lda, b, &ldb, &info); 131 | 132 | if (info != 0) 133 | printf("info: %d\n", info); 134 | 135 | assert(!info); 136 | 137 | return retval; 138 | } 139 | 140 | int potrf(const char *uplo, int n, float* a, int lda) const { 141 | int info; 142 | int retval = spotrf_(uplo, &n, a, &lda, &info); 143 | assert(!info); 144 | return retval; 145 | } 146 | 147 | int potrs(const char *uplo, int n, int nrhs, float* a, int lda, float *b, int ldb, int *info) const { 148 | return spotrs_(uplo, &n, &nrhs, a, &lda, b, &ldb, info); 149 | } 150 | 151 | int potri(const char *uplo, int n, float *a, int lda) const { 152 | int info; 153 | int retval = spotri_(uplo, &n, a, &lda, &info); 154 | assert(!info); 155 | return retval; 156 | } 157 | 158 | void* memset(void* dest, int ch, size_t count) const { 159 | return std::memset(dest, ch, count); 160 | } 161 | 162 | float* memcpy(void* dest, const void *src, size_t count) const { 163 | return (float*) std::memcpy(dest, src, count); 164 | } 165 | 166 | float *memcpy_matrix(float *dest, float *src, int nrows_to_copy, int 
src_ncol, int first_row = 0) const { 167 | return memcpy(dest, &src[first_row * src_ncol], nrows_to_copy * src_ncol * sizeof(float)); 168 | } 169 | 170 | SparseMatrix memcpy_matrix(SparseMatrix &dest, SparseMatrix src, int nrows_to_copy, int src_ncol, int first_row) const; 171 | 172 | void free(void* ptr) const { 173 | if (ptr != 0) 174 | std::free(ptr); 175 | } 176 | 177 | void free(SparseMatrix a) const; 178 | 179 | void free_sparse(void *ptr) { 180 | } 181 | 182 | void free_sparse(SparseMatrix a) { 183 | free(a); 184 | } 185 | 186 | void free_devicememory(void* ptr) const { 187 | ; 188 | } 189 | 190 | void free_devicememory(SparseMatrix X) const { 191 | } 192 | 193 | template 194 | T malloc_matrix(int rows, int cols) { 195 | return malloc_matrix(rows, cols, init_invalid()); 196 | } 197 | 198 | SparseMatrix malloc_matrix(int rows, int cols, SparseMatrix dummy); 199 | 200 | float *malloc_matrix(int rows, int cols, float *dummy) { 201 | return malloc(rows * cols * sizeof(float)); 202 | } 203 | 204 | float* malloc(size_t size) const { 205 | return (float*) std::malloc(size); 206 | } 207 | 208 | void maximum(float* x, const float value, const int size) const { 209 | for (int i = 0; i < size; ++i) 210 | x[i] = fmaxf(x[i], value); 211 | } 212 | 213 | void leaky_relu(float* x, const float value, const int size) { 214 | for (int i = 0; i < size; ++i) 215 | x[i] = (x[i] < 0.0f) ? x[i] * value : x[i]; 216 | } 217 | 218 | void sigmoid(float* x, const int size) const { 219 | for (int i = 0; i < size; ++i) { 220 | x[i] = 1 / (1 + expf(-x[i])); 221 | } 222 | } 223 | 224 | void tanh(float* x, const int size) const { 225 | for (int i = 0; i < size; ++i) { 226 | x[i] = tanhf(x[i]); 227 | } 228 | } 229 | 230 | void fill_eye(float* a, int n) const { 231 | memset(a, 0, n * n * sizeof(float)); 232 | for (int i = 0; i < n; ++i) 233 | a[i * n + i] = 1.0f; 234 | } 235 | 236 | void fill(float* X, const int size, const float value) const { 237 | for (int i = 0; i < size; ++i) { 238 | X[i] = value; 239 | } 240 | } 241 | 242 | void calculate_column_variance(const float* X, const unsigned nrows, const unsigned ncols, float* variances); 243 | void calculate_column_variance(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* variances); 244 | 245 | void invsqrt(float* s, const unsigned n) const; 246 | 247 | void scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 248 | void scale_columns(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const; 249 | 250 | void scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 251 | void scale_rows(SparseMatrix X, const unsigned nrows, const unsigned ncols, float* s) const; 252 | 253 | void dropout(float* X, const unsigned size, const float dropout_rate) const { 254 | assert(0.0f <= dropout_rate && dropout_rate <= 1.0f); 255 | for (unsigned i = 0; i < size; ++i) 256 | X[i] = rand_unif() < dropout_rate ? 0.0f : X[i]; 257 | } 258 | 259 | void dropout(SparseMatrix X, const unsigned size, const float dropout_rate) const; 260 | 261 | void add_saltpepper_noise(float* X, const unsigned size, const float noise_rate) const { 262 | assert(0.0f <= noise_rate && noise_rate <= 1.0f); 263 | for (unsigned i = 0; i < size; ++i) { 264 | if (rand_unif() < noise_rate) { 265 | X[i] = rand_unif() < 0.5 ? 
0.0f : 1.0f; 266 | } 267 | } 268 | } 269 | 270 | void add_saltpepper_noise(SparseMatrix X, const unsigned size, const float noise_rate) const; 271 | 272 | void add_gauss_noise(float* X, const unsigned size, const float noise_rate) const { 273 | assert(0.0 <= noise_rate); 274 | for (unsigned i = 0; i < size; ++i) 275 | X[i] += rand_normal() * noise_rate; 276 | } 277 | 278 | /* gauss noise makes no sense on sparse matrices */ 279 | void add_gauss_noise(SparseMatrix X, const unsigned size, const float noise_rate) const; 280 | 281 | void invert(float* X, const unsigned size) const { 282 | for (unsigned i = 0; i < size; ++i) 283 | X[i] = 1.0f / X[i]; 284 | } 285 | 286 | void soft_threshold(float* x, const float alpha, const int size) const { 287 | float f; 288 | for (int i = 0; i < size; ++i) { 289 | f = x[i]; 290 | x[i] = f > 0 ? fmaxf(0., f - alpha) : fminf(0., f + alpha); 291 | } 292 | } 293 | 294 | // Useful for debugging 295 | static void printMatrixCM(const float* a, int n, int m, const char* fmt); 296 | static void printMatrixCM(const SparseMatrix a, int n, int m, const char *fmt); 297 | 298 | static void printMatrixRM(const float* a, int n, int m, const char* fmt); 299 | static void printMatrixRM(const SparseMatrix a, int n, int m, const char *fmt); 300 | 301 | void prints(const float* f, unsigned l) const {} 302 | 303 | void printsu(const int* f, unsigned l) const {} 304 | 305 | void printm(const char* name, const SparseMatrix a, int n, int m) const { 306 | printf("%s\n", name); 307 | printMatrixCM(a, n, m, 0); 308 | } 309 | 310 | void printm(const char* name, const float* a, int n, int m) const { 311 | printf("%s\n", name); 312 | printMatrixCM(a, n, m, 0); 313 | } 314 | }; 315 | -------------------------------------------------------------------------------- /blas_sparse_proto.h: -------------------------------------------------------------------------------- 1 | #ifndef BLAS_SPARSE_PROTO_H 2 | #define BLAS_SPARSE_PROTO_H 3 | 4 | typedef int blas_sparse_matrix; 5 | 6 | 7 | /* Level 1 Computational Routines */ 8 | 9 | void BLAS_susdot( enum blas_conj_type conj, int nz, const float *x, 10 | const int *indx, const float *y, int incy, float *r, 11 | enum blas_base_type index_base ); 12 | void BLAS_dusdot( enum blas_conj_type conj, int nz, const double *x, 13 | const int *indx, const double *y, int incy, double *r, 14 | enum blas_base_type index_base ); 15 | void BLAS_cusdot( enum blas_conj_type conj, int nz, const void *x, 16 | const int *indx, const void *y, int incy, void *r, 17 | enum blas_base_type index_base ); 18 | void BLAS_zusdot( enum blas_conj_type conj, int nz, const void *x, 19 | const int *indx, const void *y, int incy, void *r, 20 | enum blas_base_type index_base ); 21 | 22 | void BLAS_susaxpy( int nz, float alpha, const float *x, const int *indx, 23 | float *y, int incy, enum blas_base_type index_base ); 24 | void BLAS_dusaxpy( int nz, double alpha, const double *x, const int *indx, 25 | double *y, int incy, enum blas_base_type index_base ); 26 | void BLAS_cusaxpy( int nz, const void *alpha, const void *x, const int *indx, 27 | void *y, int incy, enum blas_base_type index_base ); 28 | void BLAS_zusaxpy( int nz, const void *alpha, const void *x, const int *indx, 29 | void *y, int incy, enum blas_base_type index_base ); 30 | 31 | void BLAS_susga( int nz, const float *y, int incy, float *x, const int *indx, 32 | enum blas_base_type index_base ); 33 | void BLAS_dusga( int nz, const double *y, int incy, double *x, const int *indx, 34 | enum blas_base_type index_base ); 35 | void 
BLAS_cusga( int nz, const void *y, int incy, void *x, const int *indx, 36 | enum blas_base_type index_base ); 37 | void BLAS_zusga( int nz, const void *y, int incy, void *x, const int *indx, 38 | enum blas_base_type index_base ); 39 | 40 | void BLAS_susgz( int nz, float *y, int incy, float *x, const int *indx, 41 | enum blas_base_type index_base ); 42 | void BLAS_dusgz( int nz, double *y, int incy, double *x, const int *indx, 43 | enum blas_base_type index_base ); 44 | void BLAS_cusgz( int nz, void *y, int incy, void *x, const int *indx, 45 | enum blas_base_type index_base ); 46 | void BLAS_zusgz( int nz, void *y, int incy, void *x, const int *indx, 47 | enum blas_base_type index_base ); 48 | 49 | void BLAS_sussc( int nz, const float *x, float *y, int incy, const int *indx, 50 | enum blas_base_type index_base ); 51 | void BLAS_dussc( int nz, const double *x, double *y, int incy, const int *indx, 52 | enum blas_base_type index_base ); 53 | void BLAS_cussc( int nz, const void *x, void *y, int incy, const int *indx, 54 | enum blas_base_type index_base ); 55 | void BLAS_zussc( int nz, const void *x, void *y, int incy, const int *indx, 56 | enum blas_base_type index_base ); 57 | 58 | /* Level 2 Computational Routines */ 59 | 60 | int BLAS_susmv( enum blas_trans_type transa, float alpha, 61 | blas_sparse_matrix A, const float *x, int incx, float *y, int incy ); 62 | int BLAS_dusmv( enum blas_trans_type transa, double alpha, 63 | blas_sparse_matrix A, const double *x, int incx, double *y, int incy ); 64 | int BLAS_cusmv( enum blas_trans_type transa, const void *alpha, 65 | blas_sparse_matrix A, const void *x, int incx, void *y, int incy ); 66 | int BLAS_zusmv( enum blas_trans_type transa, const void *alpha, 67 | blas_sparse_matrix A, const void *x, int incx, void *y, int incy ); 68 | 69 | int BLAS_sussv( enum blas_trans_type transt, float alpha, 70 | blas_sparse_matrix T, float *x, int incx ); 71 | int BLAS_dussv( enum blas_trans_type transt, double alpha, 72 | blas_sparse_matrix T, double *x, int incx ); 73 | int BLAS_cussv( enum blas_trans_type transt, const void *alpha, 74 | blas_sparse_matrix T, void *x, int incx ); 75 | int BLAS_zussv( enum blas_trans_type transt, const void *alpha, 76 | blas_sparse_matrix T, void *x, int incx ); 77 | 78 | /* Level 3 Computational Routines */ 79 | 80 | int BLAS_susmm( enum blas_order_type order, enum blas_trans_type transa, 81 | int nrhs, float alpha, blas_sparse_matrix A, const float *b, int ldb, 82 | float *c, int ldc ); 83 | int BLAS_dusmm( enum blas_order_type order, enum blas_trans_type transa, 84 | int nrhs, double alpha, blas_sparse_matrix A, const double *b, 85 | int ldb, double *c, int ldc ); 86 | int BLAS_cusmm( enum blas_order_type order, enum blas_trans_type transa, 87 | int nrhs, const void *alpha, blas_sparse_matrix A, const void *b, 88 | int ldb, void *c, int ldc ); 89 | int BLAS_zusmm( enum blas_order_type order, enum blas_trans_type transa, 90 | int nrhs, const void *alpha, blas_sparse_matrix A, const void *b, 91 | int ldb, void *c, int ldc ); 92 | 93 | int BLAS_sussm( enum blas_order_type order, enum blas_trans_type transt, 94 | int nrhs, float alpha, int t, float *b, int ldb ); 95 | int BLAS_dussm( enum blas_order_type order, enum blas_trans_type transt, 96 | int nrhs, double alpha, int t, double *b, int ldb ); 97 | int BLAS_cussm( enum blas_order_type order, enum blas_trans_type transt, 98 | int nrhs, const void *alpha, int t, void *b, int ldb ); 99 | int BLAS_zussm( enum blas_order_type order, enum blas_trans_type transt, 100 | int nrhs, 
const void *alpha, int t, void *b, int ldb ); 101 | 102 | /* Handle Management Routines */ 103 | 104 | /* Creation Routines */ 105 | 106 | blas_sparse_matrix BLAS_suscr_begin( int m, int n ); 107 | blas_sparse_matrix BLAS_duscr_begin( int m, int n ); 108 | blas_sparse_matrix BLAS_cuscr_begin( int m, int n ); 109 | blas_sparse_matrix BLAS_zuscr_begin( int m, int n ); 110 | 111 | 112 | blas_sparse_matrix BLAS_suscr_block_begin( int Mb, int Nb, int k, int l ); 113 | blas_sparse_matrix BLAS_duscr_block_begin( int Mb, int Nb, int k, int l ); 114 | blas_sparse_matrix BLAS_cuscr_block_begin( int Mb, int Nb, int k, int l ); 115 | blas_sparse_matrix BLAS_zuscr_block_begin( int Mb, int Nb, int k, int l ); 116 | 117 | blas_sparse_matrix BLAS_suscr_variable_block_begin( int Mb, int Nb, 118 | const int *k, const int *l ); 119 | blas_sparse_matrix BLAS_duscr_variable_block_begin( int Mb, int Nb, 120 | const int *k, const int *l ); 121 | blas_sparse_matrix BLAS_cuscr_variable_block_begin( int Mb, int Nb, 122 | const int *k, const int *l ); 123 | blas_sparse_matrix BLAS_zuscr_variable_block_begin( int Mb, int Nb, 124 | const int *k, const int *l ); 125 | 126 | 127 | /* Insertion Routines */ 128 | 129 | int BLAS_suscr_insert_entry( blas_sparse_matrix A, float val, int i, int j ); 130 | int BLAS_duscr_insert_entry( blas_sparse_matrix A, double val, int i, int j ); 131 | int BLAS_cuscr_insert_entry( blas_sparse_matrix A, const void *val, int i, int j ); 132 | int BLAS_zuscr_insert_entry( blas_sparse_matrix A, const void *val, int i, int j ); 133 | 134 | int BLAS_suscr_insert_entries( blas_sparse_matrix A, int nz, const float *val, 135 | const int *indx, const int *jndx ); 136 | int BLAS_duscr_insert_entries( blas_sparse_matrix A, int nz, const double *val, 137 | const int *indx, const int *jndx ); 138 | int BLAS_cuscr_insert_entries( blas_sparse_matrix A, int nz, const void *val, 139 | const int *indx, const int *jndx ); 140 | int BLAS_zuscr_insert_entries( blas_sparse_matrix A, int nz, const void *val, 141 | const int *indx, const int *jndx ); 142 | 143 | int BLAS_suscr_insert_col( blas_sparse_matrix A, int j, int nz, 144 | const float *val, const int *indx ); 145 | int BLAS_duscr_insert_col( blas_sparse_matrix A, int j, int nz, 146 | const double *val, const int *indx ); 147 | int BLAS_cuscr_insert_col( blas_sparse_matrix A, int j, int nz, 148 | const void *val, const int *indx ); 149 | int BLAS_zuscr_insert_col( blas_sparse_matrix A, int j, int nz, 150 | const void *val, const int *indx ); 151 | 152 | int BLAS_suscr_insert_row( blas_sparse_matrix A, int i, int nz, 153 | const float *val, const int *indx ); 154 | int BLAS_duscr_insert_row( blas_sparse_matrix A, int i, int nz, 155 | const double *val, const int *indx ); 156 | int BLAS_cuscr_insert_row( blas_sparse_matrix A, int i, int nz, 157 | const void *val, const int *indx ); 158 | int BLAS_zuscr_insert_row( blas_sparse_matrix A, int i, int nz, 159 | const void *val, const int *indx ); 160 | 161 | int BLAS_suscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 162 | const float *val, const int row_stride, 163 | const int col_stride, const int *indx, 164 | const int *jndx ); 165 | int BLAS_duscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 166 | const double *val, const int row_stride, 167 | const int col_stride, const int *indx, 168 | const int *jndx ); 169 | int BLAS_cuscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 170 | const void *val, const int row_stride, 171 | const int col_stride, const int 
*indx, 172 | const int *jndx ); 173 | int BLAS_zuscr_insert_clique( blas_sparse_matrix A, const int k, const int l, 174 | const void *val, const int row_stride, 175 | const int col_stride, const int *indx, 176 | const int *jndx ); 177 | 178 | int BLAS_suscr_insert_block( blas_sparse_matrix A, const float *val, 179 | int row_stride, int col_stride, int i, int j ); 180 | int BLAS_duscr_insert_block( blas_sparse_matrix A, const double *val, 181 | int row_stride, int col_stride, int i, int j ); 182 | int BLAS_cuscr_insert_block( blas_sparse_matrix A, const void *val, 183 | int row_stride, int col_stride, int i, int j ); 184 | int BLAS_zuscr_insert_block( blas_sparse_matrix A, const void *val, 185 | int row_stride, int col_stride, int i, int j ); 186 | 187 | /* Completion of Construction Routines */ 188 | 189 | int BLAS_suscr_end( blas_sparse_matrix A ); 190 | int BLAS_duscr_end( blas_sparse_matrix A ); 191 | int BLAS_cuscr_end( blas_sparse_matrix A ); 192 | int BLAS_zuscr_end( blas_sparse_matrix A ); 193 | 194 | /* Matrix Property Routines */ 195 | 196 | int BLAS_usgp( blas_sparse_matrix A, int pname ); 197 | 198 | int BLAS_ussp( blas_sparse_matrix A, int pname ); 199 | 200 | /* Destruction Routine */ 201 | 202 | int BLAS_usds( blas_sparse_matrix A ); 203 | 204 | /* custom */ 205 | blas_sparse_matrix create(int row, int col); 206 | void destroy(blas_sparse_matrix A); 207 | 208 | /* column means and variances */ 209 | void scolmeans(blas_sparse_matrix A, float *means); 210 | void scolvars(blas_sparse_matrix A, float *vars); 211 | void srowmeans(blas_sparse_matrix A, float *means); 212 | void srowvars(blas_sparse_matrix A, float *vars); 213 | 214 | /* scale rows/cols */ 215 | void sscalecols(blas_sparse_matrix A, float *s); 216 | void sscalerows(blas_sparse_matrix A, float *s); 217 | 218 | /* select row subset */ 219 | blas_sparse_matrix srowsubset(blas_sparse_matrix A, int first_row, int nrow); 220 | 221 | /* construct from CSR data */ 222 | blas_sparse_matrix suscr_csr(int m, int n, float *x, int *row, int *col); 223 | 224 | /* set element (set to zero will delete entry) */ 225 | void ssetelement( blas_sparse_matrix A, int row, int col, float val ); 226 | void ssetelement( blas_sparse_matrix A, int idx, float val ); 227 | 228 | /* get element reference */ 229 | float &sgetelement( blas_sparse_matrix A, int row, int col); 230 | float &sgetelement( blas_sparse_matrix A, int idx ); 231 | 232 | /* sgemm routines with sparse matrix being lhs (A) or rhs (B) of the product */ 233 | void susgemm(enum blas_side_type sidea, enum blas_trans_type transa, enum blas_trans_type transb, 234 | int nohs, const float &alpha, blas_sparse_matrix A, const float *B, int ldB, const float &beta, float *C, int ldC); 235 | void susgemma(enum blas_order_type order, enum blas_trans_type transa, enum blas_trans_type transb, 236 | int nrhs, const float &alpha, blas_sparse_matrix A, const float *B, int ldB, const float &beta, 237 | float *C, int ldC); 238 | void susgemmb(enum blas_order_type order, enum blas_trans_type transa, enum blas_trans_type transb, 239 | int nlhs, const float &alpha, const float *A, int ldA, blas_sparse_matrix B, const float &beta, 240 | float *C, int ldC); 241 | 242 | /* checks whether A is a valid handle */ 243 | bool handle_valid(blas_sparse_matrix A); 244 | 245 | /* debug */ 246 | namespace NIST_SPBLAS 247 | {void print(int A);} 248 | 249 | #include 250 | 251 | #define dbg_printf(...) 
do { \ 252 | printf(__VA_ARGS__); \ 253 | fflush(stdout); \ 254 | } while (0) 255 | 256 | #endif 257 | /* BLAS_SPARSE_PROTO_H */ 258 | -------------------------------------------------------------------------------- /librfn.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include "librfn.h" 8 | #include 9 | #include 10 | #include "cpu_operations.h" 11 | 12 | #ifndef NOGPU 13 | #include "gpu_operations.h" 14 | #endif 15 | 16 | float time_diff(struct timeval *t2, struct timeval *t1) { 17 | long int diff = (t2->tv_usec + 1000000 * t2->tv_sec) - (t1->tv_usec + 1000000 * t1->tv_sec); 18 | return diff / 1000000.0f; 19 | } 20 | 21 | template 22 | int calculate_W_impl_invertMxM(OP& op, const float* W, const float* P, float* Wout, 23 | const int k, const int m, 24 | float* WWPchol, float* WWPinv) { 25 | op.gemm("n","t", m, m, k, 1.0f, W, m, W, m, 0.0f, WWPchol, m); 26 | op.axpy(m, 1.0f, P, 1, WWPchol, m+1); 27 | op.fill_eye(WWPinv, m); 28 | op.posv("u", m, m, WWPchol, m, WWPinv, m); 29 | op.gemm("t", "n", m, k, m, 1.0f, WWPinv, m, W, m, 0.0f, Wout, m); 30 | return 0; 31 | } 32 | 33 | 34 | //better option if m > k ( = W is tall), involves k*k inverse 35 | template 36 | int calculate_W_impl_invertKxK(OP& op, const float* W, const float* Pinv, float* Wout, 37 | const int k, const int m, 38 | float* Wtmp, float* WPWchol, float* WPWinv) { 39 | op.dgmm("l", m, k, W, m, Pinv, 1, Wtmp, m); 40 | op.gemm("t", "n", k, k, m, 1.0f, W, m, Wtmp, m, 0.0f, WPWchol, k); 41 | op.axpy(k, 1.0f, op.ones, 1, WPWchol, k+1); 42 | op.fill_eye(WPWinv, k); 43 | op.posv("u", k, k, WPWchol, k, WPWinv, k); 44 | op.gemm("n", "t", m, k, k, 1.0f, Wtmp, m, WPWinv, k, 0.0f, Wout, m); 45 | return 0; 46 | } 47 | 48 | 49 | // if isMoreHiddensThanFeatures is true, we will calculate the m*m inverse, otherwise the k*k one 50 | template 51 | int train(XTypeConst X_host, float* W_host, float* P_host, const int n, const int m, 52 | const int k, const int n_iter, int batch_size, const float etaW, const float etaP, 53 | const float minP, const float h_threshold, 54 | const float dropout_rate, const float input_noise_rate, 55 | const float l2_weightdecay, const float l1_weightdecay, const float momentum, 56 | const int input_noise_type, const int activation_type, const int apply_scaling, 57 | const int applyNewtonUpdate, unsigned long seed, int gpu_id) { 58 | if (batch_size == 1) { 59 | printf ("batch_size == 1 not supported, switching to full batch mode"); 60 | } 61 | 62 | OP op(n, m, k, seed, gpu_id); 63 | XType X = op.to_device(X_host, m*n*sizeof(float)); 64 | float* W = op.to_device(W_host, k*m*sizeof(float)); 65 | float* P = op.to_device(P_host, m*sizeof(float)); 66 | if (batch_size < 2) // no mini-batches, one batch=full dataset 67 | batch_size = n; 68 | int n_batches = n / batch_size; 69 | float* XCov_diag = op.malloc(m*sizeof(float)); 70 | 71 | float* H = op.malloc(k*batch_size*sizeof(float)); 72 | float* Wout = op.malloc(k*m*sizeof(float)); 73 | float* variance_H = op.malloc(k*sizeof(float)); 74 | float* S = op.malloc(k*k*sizeof(float)); 75 | float* Schol = op.malloc(k*k*sizeof(float)); 76 | float* U = op.malloc(m*k*sizeof(float)); 77 | float* Sinv = op.malloc(k*k*sizeof(float)); 78 | float* dW = op.malloc(m*k*sizeof(float)); 79 | float* C = op.malloc(m*m*sizeof(float)); 80 | 81 | XType Xtmp = op.template 
init_invalid(); 82 | if (input_noise_rate > 0.0f) 83 | { 84 | Xtmp = op.template malloc_matrix(batch_size, m); 85 | } 86 | 87 | // which matrices of the following we use depends on which inverse we use 88 | float* WWPchol = 0; 89 | float* WWPinv = 0; 90 | float* WPWchol = 0; 91 | float* WPWinv = 0; 92 | float* Wtmp = 0; 93 | if (isMoreHiddensThanFeatures) { 94 | WWPchol = op.malloc(m*m*sizeof(float)); 95 | WWPinv = op.malloc(m*m*sizeof(float)); 96 | } else { 97 | WPWchol = op.malloc(k*k*sizeof(float)); 98 | WPWinv = op.malloc(k*k*sizeof(float)); 99 | Wtmp = op.malloc(m*k*sizeof(float)); 100 | } 101 | float* dP = op.malloc(m*sizeof(float)); 102 | 103 | if (!dP) { // We've run out of memory somewhere 104 | op.free(dP); 105 | op.free(C); 106 | op.free(dW); 107 | op.free(Sinv); 108 | op.free(U); 109 | op.free(Schol); 110 | op.free(S); 111 | op.free(variance_H); 112 | op.free(Wout); 113 | op.free(H); 114 | op.free(WWPinv); 115 | op.free(WWPchol); 116 | op.free(WPWchol); 117 | op.free(WPWinv); 118 | op.free(Wtmp); 119 | op.free(Xtmp); 120 | op.free(XCov_diag); 121 | return -1; 122 | } 123 | struct timeval t0, t1; 124 | gettimeofday(&t0, 0); 125 | 126 | if (n == batch_size) 127 | op.calculate_column_variance(X, batch_size, m, XCov_diag); 128 | 129 | for (int cur_iter = 0; cur_iter < n_iter; ++cur_iter) { 130 | if (cur_iter % 25 == 0) { 131 | gettimeofday(&t1, 0); 132 | printf("epoch: %4d (time: %6.2fs)\n", cur_iter, time_diff(&t1, &t0)); 133 | } 134 | for (int cur_batch = 0; cur_batch < n_batches; ++cur_batch) { 135 | if (isMoreHiddensThanFeatures) { 136 | calculate_W_impl_invertMxM(op, W, P, Wout, k, m, WWPchol, WWPinv); 137 | } else { 138 | op.invert(P, m); // TODO: something better than inverting P twice, 139 | /* how about inverting P once into distinct mem? */ 140 | calculate_W_impl_invertKxK(op, W, P, Wout, k, m, Wtmp, WPWchol, WPWinv); 141 | op.invert(P, m); 142 | } 143 | 144 | XType Xnoise; 145 | 146 | if (input_noise_type && input_noise_rate > 0.0f) { 147 | op.memcpy_matrix(Xtmp, X, batch_size, m, cur_batch); 148 | switch(input_noise_type) { 149 | case 1: // dropout noise 150 | op.dropout(Xtmp, batch_size*m, input_noise_rate); 151 | break; 152 | case 2: // salt&pepper noise 153 | op.add_saltpepper_noise(Xtmp, batch_size*m, input_noise_rate); 154 | break; 155 | case 3: // gauss noise 156 | op.add_gauss_noise(Xtmp, batch_size*m, input_noise_rate); 157 | break; 158 | default: 159 | printf("invalid noise type"); 160 | assert(false); 161 | } 162 | Xnoise = Xtmp; 163 | } else { 164 | /* in case of sparse X, this is a copy operation! 
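(for dense X, get_batch simply returns a pointer into the existing matrix, while the
sparse overloads allocate a fresh per-batch structure -- see get_batch in
gpu_operations.h -- which is why Xnoise is released again via free_sparse below
whenever no input noise was applied)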
*/ 165 | Xnoise = op.get_batch(X, m, cur_batch, batch_size); 166 | } 167 | 168 | op.gemm("t", "n", k, batch_size, m, 1.0f, Wout, m, Xnoise, m, 0.0f, H, k); 169 | 170 | if (!(input_noise_type && input_noise_rate > 0.0f)) 171 | { 172 | /* free matrix only if it's sparse */ 173 | op.free_sparse(Xnoise); 174 | } 175 | 176 | switch (activation_type) { 177 | case 1: op.maximum(H, h_threshold, batch_size*k); break; 178 | case 2: op.leaky_relu(H, h_threshold, batch_size*k); break; 179 | case 3: op.sigmoid(H, batch_size*k); break; 180 | case 4: op.tanh(H, batch_size*k); break; 181 | default: 182 | printf("invalid activation type"); 183 | assert(false); 184 | } 185 | 186 | if (apply_scaling) { 187 | op.calculate_column_variance(H, batch_size, k, variance_H); 188 | op.invsqrt(variance_H, k); 189 | op.scale_columns(H, batch_size, k, variance_H); 190 | } 191 | if (dropout_rate > 0.0f) { 192 | op.dropout(H, batch_size*k, dropout_rate); 193 | } 194 | op.gemm("n", "t", k, k, batch_size, 1.0f/batch_size, H, k, H, k, 0.0f, S, k); 195 | if (isMoreHiddensThanFeatures) { 196 | op.gemm("t", "n", k, k, m, -1.0f, Wout, m, W, m, 1.0f, S, k); 197 | op.axpy(k, 1.0f, op.ones, 0, S, k+1); 198 | } else { 199 | op.axpy(k*k, 1.0f, WPWinv, 1, S, 1); 200 | } 201 | XType XBatch = op.get_batch(X, m, cur_batch, batch_size); 202 | 203 | op.gemm("n", "t", m, k, batch_size, 1.0f/batch_size, XBatch, m, H, k, 0.0f, U, m); 204 | 205 | if (applyNewtonUpdate) { 206 | op.axpy(k, 1e-10, op.ones, 0, S, k+1); 207 | op.memcpy(Schol, S, k*k*sizeof(float)); 208 | op.fill_eye(Sinv, k); 209 | op.posv("u", k, k, Schol, k, Sinv, k); 210 | op.gemm("n", "n", m, k, k, 1.0f, U, m, Sinv, k, momentum, dW, m); 211 | op.axpy(m*k, -(1.0f+l2_weightdecay), W, 1, dW, 1); 212 | } else { 213 | op.gemm("n", "n", m, k, k, -1.0f, W, m, S, k, momentum, dW, m); 214 | op.axpy(m*k, 1.0f, U, 1, dW, 1); 215 | 216 | if (l2_weightdecay > 0.0f) { 217 | op.axpy(m*k, -l2_weightdecay, W, 1, dW, 1); 218 | } 219 | } 220 | 221 | op.gemm("n", "n", m, k, k, 1.0f, W, m, S, k, -2.0f, U, m); 222 | op.gemm("n", "t", m, m, k, 1.0f, U, m, W, m, 0.0f, C, m); 223 | 224 | if (batch_size < n) { 225 | op.calculate_column_variance(XBatch, batch_size, m, dP); 226 | } else { 227 | op.memcpy(dP, XCov_diag, m*sizeof(float)); 228 | } 229 | 230 | op.free_sparse(XBatch); 231 | 232 | op.axpy(m, 1.0f, C, m+1, dP, 1); 233 | op.axpy(m, -1.0f, P, 1, dP, 1); 234 | 235 | op.axpy(m, etaP/n_batches, dP, 1, P, 1); 236 | op.axpy(m*k, etaW/n_batches, dW, 1, W, 1); 237 | 238 | op.maximum(P, minP, m); 239 | 240 | if (l1_weightdecay > 0.0f) { 241 | op.soft_threshold(W, l1_weightdecay, m*k); 242 | } 243 | } 244 | } 245 | op.free(dP); 246 | op.free(C); 247 | op.free(dW); 248 | op.free(Sinv); 249 | op.free(U); 250 | op.free(Schol); 251 | op.free(S); 252 | op.free(H); 253 | op.free(variance_H); 254 | op.free(Wout); 255 | op.free(WWPinv); 256 | op.free(WWPchol); 257 | op.free(WPWchol); 258 | op.free(WPWinv); 259 | op.free(Wtmp); 260 | op.free(Xtmp); 261 | op.free(XCov_diag); 262 | op.free_devicememory(X); 263 | 264 | op.to_host(W, W_host, m*k*sizeof(float)); 265 | op.to_host(P, P_host, m*sizeof(float)); 266 | return 0; 267 | } 268 | 269 | 270 | template 271 | void calculate_W(XTypeConst X_host, const float* W_host, const float* P_host, 272 | float* Wout_host, const int n, const int m, const int k, 273 | const int activation_type, const int apply_scaling, 274 | const float h_threshold, int gpu_id) { 275 | OP op(n, m, k, 0, gpu_id); 276 | float* P_copy = (float*) malloc(m*sizeof(float)); 277 | memcpy(P_copy, P_host, 
m*sizeof(float)); // we might need to invert P 278 | float* Wout = op.to_device(Wout_host, k*m*sizeof(float)); 279 | float* W = op.to_device(W_host, k*m*sizeof(float)); 280 | float* P = op.to_device(P_copy, m*sizeof(float)); 281 | XType X = op.to_device(X_host, n*m*sizeof(float)); 282 | float* H = op.malloc(n*k*sizeof(float)); 283 | float* variance_H = op.malloc(k*sizeof(float)); 284 | 285 | if (k > m) { 286 | float* WWPchol = op.malloc(m*m*sizeof(float)); 287 | float* WWPinv = op.malloc(m*m*sizeof(float)); 288 | calculate_W_impl_invertMxM(op, W, P, Wout, k, m, WWPchol, WWPinv); 289 | op.free(WWPchol); 290 | op.free(WWPinv); 291 | } else { 292 | op.invert(P, m); 293 | float* WPWchol = op.malloc(k*k*sizeof(float)); 294 | float* WPWinv = op.malloc(k*k*sizeof(float)); 295 | float* Wtmp = op.malloc(m*k*sizeof(float)); 296 | calculate_W_impl_invertKxK(op, W, P, Wout, k, m, Wtmp, WPWchol, WPWinv); 297 | op.free(Wtmp); 298 | op.free(WPWinv); 299 | op.free(WPWchol); 300 | op.invert(P, m); 301 | } 302 | 303 | op.gemm("t", "n", k, n, m, 1.0f, Wout, m, X, m, 0.0f, H, k); 304 | 305 | switch (activation_type) { 306 | case 1: op.maximum(H, h_threshold, n*k); break; 307 | case 2: op.leaky_relu(H, h_threshold, n*k); break; 308 | case 3: op.sigmoid(H, n*k); break; 309 | case 4: op.tanh(H, n*k); break; 310 | default: 311 | printf("invalid noise type"); 312 | assert(false); 313 | } 314 | 315 | if (apply_scaling){ 316 | op.calculate_column_variance(H, n, k, variance_H); 317 | op.invsqrt(variance_H, k); 318 | op.scale_rows(Wout, k, m, variance_H); 319 | } 320 | 321 | op.free(variance_H); 322 | op.free(H); 323 | op.to_host(Wout, Wout_host, k*m*sizeof(float)); 324 | op.free_devicememory(W); 325 | op.free_devicememory(P); 326 | op.free_devicememory(X); 327 | free(P_copy); 328 | } 329 | 330 | 331 | extern "C" { 332 | 333 | int train_rfn(const float* X, float* W, float* P, const int n, 334 | const int m, const int k, const int n_iter, int batch_size, 335 | const float etaW, const float etaP, const float minP, const float h_threshold, 336 | const float dropout_rate, const float input_noise_rate, 337 | const float l2_weightdecay, const float l1_weightdecay, 338 | const float momentum, 339 | const int input_noise_type, const int activation_type, const int apply_scaling, 340 | const int applyNewtonUpdate, unsigned long seed, const int gpu_id) { 341 | 342 | if (gpu_id == USE_CPU) { 343 | if (k > m) { 344 | return train(X, W, P, n, m, k, 345 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 346 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 347 | } else { 348 | return train(X, W, P, n, m, k, 349 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 350 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 351 | } 352 | } else { 353 | #ifndef NOGPU 354 | if (k > m) { 355 | return train(X, W, P, n, m, k, 356 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 357 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 358 | } else { 359 | return train(X, W, P, n, m, k, 360 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 361 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 362 | } 363 | #else 364 | 
fprintf(stderr, "librfn was compiled without GPU support"); 365 | #endif 366 | } 367 | } 368 | 369 | 370 | int train_rfn_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 371 | float* W, float* P, const int n, const int m, 372 | const int k, const int n_iter, int batch_size, const float etaW, 373 | const float etaP, const float minP, const float h_threshold, 374 | const float dropout_rate, const float input_noise_rate, 375 | const float l2_weightdecay, const float l1_weightdecay, 376 | const float momentum, 377 | const int input_noise_type, const int activation_type, const int apply_scaling, 378 | const int applyNewtonUpdate, unsigned long seed, const int gpu_id) { 379 | if (gpu_id == USE_CPU) { 380 | const CPU_Operations::SparseMatrix X = CPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 381 | int retval = 0; 382 | if (k > m) { 383 | retval = train((CPU_Operations::SparseMatrix) X, W, P, n, m, k, 384 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 385 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 386 | } else { 387 | retval = train((CPU_Operations::SparseMatrix) X, W, P, n, m, k, 388 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 389 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, -1); 390 | } 391 | //destroy(X); 392 | return retval; 393 | } else { 394 | #ifndef NOGPU 395 | const GPU_Operations::SparseMatrix X = GPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 396 | if (k > m) { 397 | return train(&X, W, P, n, m, k, 398 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 399 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 400 | } else { 401 | return train(&X, W, P, n, m, k, 402 | n_iter, batch_size, etaW, etaP, minP, h_threshold, dropout_rate, input_noise_rate, 403 | l2_weightdecay, l1_weightdecay, momentum, input_noise_type, activation_type, apply_scaling, applyNewtonUpdate, seed, gpu_id); 404 | } 405 | #else 406 | fprintf(stderr, "librfn was compiled without GPU support"); 407 | #endif 408 | } 409 | } 410 | 411 | 412 | void calculate_W(const float* X, const float* W, const float* P, float* Wout, 413 | const int n, const int m, const int k, const int activation_type, 414 | const int apply_scaling, const float h_threshold, int gpu_id) { 415 | if (gpu_id == USE_CPU) { 416 | return calculate_W(X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, gpu_id); 417 | } else { 418 | #ifndef NOGPU 419 | return calculate_W(X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, -1); 420 | #else 421 | fprintf(stderr, "librfn was compiled without GPU support"); 422 | #endif 423 | } 424 | } 425 | 426 | 427 | void calculate_W_sparse(const float* Xvals, const int* Xcols, const int *Xrowptr, 428 | const float* W, const float* P, float* Wout, 429 | const int n, const int m, const int k, const int activation_type, 430 | const int apply_scaling, const float h_threshold, int gpu_id) { 431 | if (gpu_id == USE_CPU) { 432 | const CPU_Operations::SparseMatrix X = CPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 433 | calculate_W(X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, -1); 434 | //destroy(X); 435 | } else { 436 | #ifndef NOGPU 437 | const 
GPU_Operations::SparseMatrix X = GPU_Operations::create_sparse_matrix(Xvals, Xcols, Xrowptr, n, m); 438 | calculate_W(&X, W, P, Wout, n, m, k, activation_type, apply_scaling, h_threshold, gpu_id); 439 | #else 440 | fprintf(stderr, "librfn was compiled without GPU support"); 441 | #endif 442 | } 443 | } 444 | 445 | } 446 | -------------------------------------------------------------------------------- /gpu_operations.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include /* for typeid */ 20 | 21 | 22 | using std::fprintf; 23 | 24 | inline cublasFillMode_t uplo_to_cublas(const char* uplo) { 25 | return tolower(uplo[0]) == 'l' ? CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; 26 | } 27 | 28 | inline cusparseOperation_t op_to_cusparse(const char* op) { 29 | return tolower(op[0]) == 't' ? CUSPARSE_OPERATION_TRANSPOSE : CUSPARSE_OPERATION_NON_TRANSPOSE; 30 | } 31 | 32 | static const char* cusparseErrorString(cusparseStatus_t error) { 33 | switch (error) { 34 | case CUSPARSE_STATUS_SUCCESS: return "CUSPARSE_STATUS_SUCCESS"; 35 | case CUSPARSE_STATUS_NOT_INITIALIZED: return "CUSPARSE_STATUS_NOT_INITIALIZED"; 36 | case CUSPARSE_STATUS_ALLOC_FAILED: return "CUSPARSE_STATUS_ALLOC_FAILED"; 37 | case CUSPARSE_STATUS_INVALID_VALUE: return "CUSPARSE_STATUS_INVALID_VALUE"; 38 | case CUSPARSE_STATUS_ARCH_MISMATCH: return "CUSPARSE_STATUS_ARCH_MISMATCH"; 39 | case CUSPARSE_STATUS_MAPPING_ERROR: return "CUSPARSE_STATUS_MAPPING_ERROR"; 40 | case CUSPARSE_STATUS_EXECUTION_FAILED: return "CUSPARSE_STATUS_EXECUTION_FAILED"; 41 | case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; 42 | case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 43 | case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; 44 | default: return ""; 45 | } 46 | } 47 | 48 | static const char* cublasErrorString(cublasStatus_t error) { 49 | switch (error) { 50 | case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; 51 | case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; 52 | case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; 53 | case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; 54 | case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; 55 | case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; 56 | case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; 57 | case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; 58 | #if CUDA_VERSION >= 6000 59 | case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; 60 | #endif 61 | default: return ""; 62 | } 63 | } 64 | 65 | #ifndef DNDEBUG 66 | 67 | #define CUDA_CALL(ans) { gpuAssert((ans), __FILE__, __LINE__); } 68 | inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { 69 | if (code != cudaSuccess) { 70 | fprintf(stderr, "CUDA Error: %s %s:%d\n", cudaGetErrorString(code), file, line); 71 | if (abort) 72 | exit(code); 73 | } 74 | } 75 | 76 | #define CUBLAS_CALL(ans) { cublasAssert((ans), __FILE__, __LINE__); } 77 | inline void 
cublasAssert(cublasStatus_t code, const char *file, int line) { 78 | //printf("%d (%s:%d)\n", code, file, line); 79 | if (code != CUBLAS_STATUS_SUCCESS) { 80 | fprintf(stderr, "CUBLAS Error: %s %s:%d\n", cublasErrorString(code), file, line); 81 | exit(code); 82 | } 83 | } 84 | 85 | #define CUSPARSE_CALL(ans) { cusparseAssert((ans), __FILE__, __LINE__); } 86 | inline void cusparseAssert(cusparseStatus_t code, const char *file, int line) { 87 | // printf("%d (%s:%d)\n", code, file, line); 88 | if (code != CUSPARSE_STATUS_SUCCESS) { 89 | fprintf(stderr, "CUSPARSE Error: %s %s:%d\n", cusparseErrorString(code), file, line); 90 | exit(code); 91 | } 92 | } 93 | 94 | static const char* cusolverErrorString(cusolverStatus_t error) { 95 | switch (error) { 96 | case CUSOLVER_STATUS_SUCCESS: return "CUSOLVER_STATUS_SUCCESS"; 97 | case CUSOLVER_STATUS_NOT_INITIALIZED: return "CUSOLVER_STATUS_NOT_INITIALIZED"; 98 | case CUSOLVER_STATUS_ALLOC_FAILED: return "CUSOLVER_STATUS_ALLOC_FAILED"; 99 | case CUSOLVER_STATUS_INVALID_VALUE: return "CUSOLVER_STATUS_INVALID_VALUE"; 100 | case CUSOLVER_STATUS_ARCH_MISMATCH: return "CUSOLVER_STATUS_ARCH_MISMATCH"; 101 | case CUSOLVER_STATUS_EXECUTION_FAILED: return "CUSOLVER_STATUS_EXECUTION_FAILED"; 102 | case CUSOLVER_STATUS_INTERNAL_ERROR: return "CUSOLVER_STATUS_INTERNAL_ERROR"; 103 | case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; 104 | default: return ""; 105 | } 106 | } 107 | 108 | #define CUSOLVER_CALL(ans) { cusolverAssert((ans), __FILE__, __LINE__); } 109 | inline void cusolverAssert(cusolverStatus_t code, const char *file, int line) { 110 | //printf("%d (%s:%d)\n", code, file, line); 111 | if (code != CUSOLVER_STATUS_SUCCESS) { 112 | fprintf(stderr, "CUBLAS Error: %s %s:%d\n", cusolverErrorString(code), file, line); 113 | exit(code); 114 | } 115 | } 116 | 117 | #else 118 | #define CUBLAS_CALL(ans) (ans) 119 | #define CUDA_CALL(ans) (ans) 120 | #define CUSOLVER_CALL(ans) (ans) 121 | #define CUSPARSE_CALL(ans) (ans) 122 | #endif 123 | 124 | #define MAX_STREAMS 16 125 | 126 | 127 | 128 | class GPU_Operations { 129 | cublasHandle_t handle; 130 | curandState* rng_state; 131 | cusolverDnHandle_t cudense_handle; 132 | cusparseHandle_t cusparse_handle; 133 | std::map buffer_map; // keeps track of buffers allocated for potrf 134 | int* devinfo; // cuSOLVER error reporting 135 | cudaStream_t streams[MAX_STREAMS]; 136 | cusparseMatDescr_t descr; 137 | 138 | 139 | 140 | public: 141 | float* ones; 142 | 143 | struct SparseMatrix { 144 | float *values; 145 | int *columns; 146 | int *rowPointers; 147 | int m; // number of rows 148 | int nnz; // number of nonzero elements 149 | }; 150 | 151 | const SparseMatrix INVALID = { 152 | (float*)-1, (int*)-1, (int*)-1, 0, 0 153 | }; 154 | 155 | static SparseMatrix create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m); 156 | 157 | 158 | GPU_Operations(int n, int m, int k, unsigned long seed, int gpu_id); 159 | ~GPU_Operations(); 160 | 161 | float* to_device(const float* src, size_t size) const; 162 | int* to_device(const int* src, size_t size) const; 163 | SparseMatrix* to_device(const SparseMatrix* src, size_t size) const; 164 | 165 | float* to_host(float* src, float* dst, size_t size) const { 166 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 167 | free(src); 168 | return dst; 169 | } 170 | 171 | int* to_host(int* src, int* dst, size_t size) const { 172 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 173 | 
free(src); 174 | return dst; 175 | } 176 | 177 | float* copy_to_host(const float* src, float* dst, size_t size) const { 178 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 179 | return dst; 180 | } 181 | 182 | int* copy_to_host(const int* src, int* dst, size_t size) const { 183 | CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToHost)); 184 | return dst; 185 | } 186 | 187 | void set_stream(unsigned iterator) const { 188 | unsigned stream_id = iterator % MAX_STREAMS; 189 | CUBLAS_CALL(cublasSetStream_v2(handle, streams[stream_id])); 190 | } 191 | 192 | void synchronize_stream(unsigned iterator) const { 193 | unsigned stream_id = iterator % MAX_STREAMS; 194 | CUDA_CALL(cudaStreamSynchronize(streams[stream_id])); 195 | } 196 | 197 | void synchronize_all_streams() const { 198 | for (unsigned i = 0; i < MAX_STREAMS; ++i) { 199 | synchronize_stream(i); 200 | } 201 | } 202 | 203 | void default_stream() const { 204 | CUBLAS_CALL(cublasSetStream_v2(handle, NULL)); 205 | } 206 | 207 | void gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha, 208 | const float *a, const int lda, const float *b, const int ldb, const float beta, float *c, 209 | const int ldc) const { 210 | cublasOperation_t ta = tolower(transa[0]) == 'n' ? CUBLAS_OP_N : CUBLAS_OP_T; 211 | cublasOperation_t tb = tolower(transb[0]) == 'n' ? CUBLAS_OP_N : CUBLAS_OP_T; 212 | CUBLAS_CALL(cublasSgemm(handle, ta, tb, m, n, k, &alpha, a, lda, b, ldb, &beta, c, ldc)); 213 | } 214 | 215 | void gemm(const char *transa, const char *transb, const int m, 216 | const int n, const int k, const float alpha, 217 | const SparseMatrix* a, const int lda, const float *b, 218 | const int ldb, const float beta, float *c, 219 | const int ldc); 220 | 221 | void gemm(const char *transa, const char *transb, const int m, 222 | const int n, const int k, const float alpha, const float *a, 223 | const int lda, const SparseMatrix* b, const int ldb, 224 | const float beta, float *c, const int ldc); 225 | 226 | void dgmm(const char* mode, const int m, const int n, const float* A, 227 | int lda, const float* x, int incx, float* C, 228 | int ldc) const { 229 | cublasSideMode_t lr = mode[0] == 'l' ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; 230 | CUBLAS_CALL(cublasSdgmm(handle, lr, m, n, A, lda, x, incx, C, ldc)); 231 | } 232 | 233 | void symm(const char *side, const char *uplo, const int m, const int n, 234 | const float alpha, const float *a, const int lda, const float *b, 235 | const int ldb, const float beta, float *c, const int ldc) const { 236 | cublasSideMode_t s = tolower(side[0]) == 'l' ? 
CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; 237 | cublasFillMode_t ul = uplo_to_cublas(uplo); 238 | CUBLAS_CALL(cublasSsymm(handle, s, ul, m, n, &alpha,a, lda, b, ldb, &beta, c, ldc)); 239 | } 240 | 241 | void axpy(const int n, const float alpha, const float* x, const int incx, float *y, const int incy) const { 242 | CUBLAS_CALL(cublasSaxpy(handle, n, &alpha, x, incx, y, incy)); 243 | } 244 | 245 | int potrf(const char *uplo, int n, float* a, int lda) { 246 | cublasFillMode_t ul = uplo_to_cublas(uplo); 247 | int bufsize = 0; 248 | int info = 0; 249 | CUSOLVER_CALL(cusolverDnSpotrf_bufferSize(cudense_handle, ul, n, a, lda, &bufsize)); 250 | 251 | float* buffer = (float*) get_buffer(bufsize * sizeof(float)); 252 | 253 | CUSOLVER_CALL(cusolverDnSpotrf(cudense_handle, ul, n, a, lda, buffer, bufsize, devinfo)); 254 | CUDA_CALL(cudaMemcpy(&info, devinfo, sizeof(int), cudaMemcpyDeviceToHost)); 255 | return info; 256 | } 257 | 258 | void* get_buffer(size_t bufsize) { 259 | // See if we already have a buffer of correct size, otherwise allocate 260 | void* buffer = 0; 261 | auto it = buffer_map.find(bufsize); 262 | if (it != buffer_map.end()) { 263 | buffer = it->second; 264 | } else { 265 | buffer = malloc(bufsize); 266 | buffer_map[bufsize] = buffer; 267 | } 268 | return buffer; 269 | } 270 | 271 | int potrs(const char *uplo, int n, int nrhs, float * a, int lda, float *b, int ldb) const { 272 | int info; 273 | cublasFillMode_t ul = uplo_to_cublas(uplo); 274 | CUSOLVER_CALL(cusolverDnSpotrs(cudense_handle, ul, n, nrhs, a, lda, b, ldb, devinfo)); 275 | CUDA_CALL(cudaMemcpy(&info, devinfo, sizeof(info), cudaMemcpyDeviceToHost)); 276 | return info; 277 | } 278 | 279 | int posv(const char *uplo, int n, int nrhs, float * a, int lda, float *b, int ldb) { 280 | int info = potrf(uplo, n, a, lda); 281 | if (info == 0) 282 | info = potrs(uplo, n, nrhs, a, lda, b, ldb); 283 | return info; 284 | } 285 | 286 | void* memset(void* dest, int ch, size_t count) const { 287 | CUDA_CALL(cudaMemset(dest, ch, count)); 288 | return dest; 289 | } 290 | 291 | float* memcpy(void* dest, const void *src, size_t count) const { 292 | CUDA_CALL(cudaMemcpy(dest, src, count, cudaMemcpyDeviceToDevice)); 293 | return 0; 294 | } 295 | 296 | void free(void* ptr) const { 297 | if (ptr != 0 && ptr != &INVALID) { 298 | CUDA_CALL(cudaFree(ptr)); 299 | } 300 | } 301 | 302 | void free_devicememory(void* ptr) const { 303 | if (ptr != 0) { 304 | CUDA_CALL(cudaFree(ptr)); 305 | } 306 | } 307 | 308 | void free_devicememory(SparseMatrix* matrix) { 309 | if (matrix != 0 && matrix != &INVALID) { 310 | free(matrix->columns); 311 | free(matrix->values); 312 | free(matrix->rowPointers); 313 | std::free(matrix); 314 | } 315 | } 316 | 317 | float* malloc(size_t size) const { 318 | float* retval = 0; 319 | cudaError_t err = cudaMalloc(&retval, size); 320 | CUDA_CALL(err); 321 | if (err != cudaSuccess) { 322 | fprintf(stderr, "cudaMalloc failed\n"); 323 | retval = 0; 324 | } 325 | return retval; 326 | } 327 | 328 | int* malloci(size_t size) const { 329 | int* retval = 0; 330 | cudaError_t err = cudaMalloc(&retval, size); 331 | CUDA_CALL(err); 332 | if (err != cudaSuccess) { 333 | fprintf(stderr, "cudaMalloc failed\n"); 334 | retval = 0; 335 | } 336 | return retval; 337 | } 338 | 339 | void fill_eye(float* X, unsigned n) const; 340 | void fill(float* X, const unsigned size, const float value) const; 341 | void maximum(float* x, const float value, const unsigned size) const; 342 | void leaky_relu(float* x, const float value, const unsigned size) const; 343 | 
void tanh(float* x, const unsigned size) const; 344 | void sigmoid(float* x, const unsigned size) const; 345 | void soft_threshold(float* x, const float alpha, const int size) const; 346 | void invsqrt(float* s, const unsigned n) const; 347 | 348 | void invert(float* X, const unsigned size) const; 349 | 350 | void calculate_column_variance(const float* X, const unsigned nrows, const unsigned ncols, float* variances) const; 351 | void scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 352 | void scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const; 353 | void dropout(float* X, const unsigned size, const float dropout_rate) const; 354 | void add_saltpepper_noise(float* X, const unsigned size, const float noise_rate) const; 355 | void add_gauss_noise(float* X, const unsigned size, const float noise_rate) const; 356 | 357 | void calculate_column_variance(const SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* variances); 358 | void scale_columns(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const; 359 | void scale_rows(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const; 360 | void dropout(SparseMatrix* X, const unsigned size, const float dropout_rate) const; 361 | void add_saltpepper_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const; 362 | void add_gauss_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const; 363 | 364 | template 365 | T init_invalid(void) { 366 | return (typeid(T) == typeid(SparseMatrix*) ? (T) &INVALID : (T) 0); 367 | } 368 | 369 | template 370 | T malloc_matrix(int rows, int cols) { 371 | return malloc_matrix(rows, cols, init_invalid()); 372 | } 373 | 374 | SparseMatrix* malloc_matrix(int rows, int cols, SparseMatrix* dummy) { 375 | SparseMatrix* matrix = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 376 | return matrix; 377 | } 378 | 379 | float* malloc_matrix(int rows, int cols, float *dummy) { 380 | return (float*) malloc(rows * cols * sizeof(float)); 381 | } 382 | 383 | float *memcpy_matrix(float *dest, float *src, int nrows_to_copy, int src_ncol, int first_row = 0) const { 384 | return memcpy(dest, &src[first_row * src_ncol], nrows_to_copy * src_ncol * sizeof(float)); 385 | } 386 | 387 | SparseMatrix* memcpy_matrix(SparseMatrix* dest, SparseMatrix* src, int nrows_to_copy, int src_ncol, int first_row = 0) const { 388 | int fromIndex = 0; 389 | int toIndex = 0; 390 | CUDA_CALL(cudaMemcpy(&fromIndex, &src->rowPointers[first_row], sizeof(int), cudaMemcpyDeviceToHost)); 391 | CUDA_CALL(cudaMemcpy(&toIndex , &src->rowPointers[first_row + nrows_to_copy], sizeof(int), cudaMemcpyDeviceToHost)); 392 | 393 | dest->nnz = (toIndex - fromIndex); 394 | dest->m = nrows_to_copy; 395 | 396 | dest->values = malloc(dest->nnz * sizeof(float)); 397 | dest->columns = malloci(dest->nnz * sizeof(int)); 398 | dest->rowPointers = malloci((nrows_to_copy + 1) * sizeof(int)); 399 | 400 | memcpy(dest->values, &src->values[fromIndex], dest->nnz * sizeof(float)); 401 | memcpy(dest->columns, &src->columns[fromIndex], dest->nnz * sizeof(int)); 402 | memcpy(dest->rowPointers, &src->rowPointers[first_row], (nrows_to_copy + 1) * sizeof(int)); 403 | subtract_first_element(dest->rowPointers, nrows_to_copy + 1); 404 | 405 | return dest; 406 | } 407 | 408 | void subtract_first_element(int* a, unsigned len) const; 409 | 410 | void free_sparse(void *ptr) { 411 | } 412 | 413 | void free_sparse(SparseMatrix* a) { 414 | // see get batch 
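// get_batch() aliases values/columns of the parent matrix and only allocates the
// SparseMatrix struct itself plus a fresh rowPointers array, so those are the only
// allocations that need to be released here.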
415 | if (handle_valid(a)) { 416 | free(a->rowPointers); 417 | std::free(a); 418 | } 419 | } 420 | 421 | bool handle_valid(SparseMatrix* a) { 422 | return a != &INVALID; 423 | } 424 | 425 | float* get_batch(const float* X, int ncol, int batch_num, int batch_size) { 426 | /* return pointer */ 427 | return (float*) &X[batch_num * batch_size * ncol]; 428 | } 429 | 430 | SparseMatrix* get_batch(SparseMatrix* X, int ncol, int batch_num, int batch_size) { 431 | // ncol can be ignored 432 | // batch_size number of rows 433 | int from = batch_num * batch_size; 434 | int nrows = batch_size; 435 | 436 | SparseMatrix* dest = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 437 | int fromIndex = 0; 438 | int toIndex = 0; 439 | CUDA_CALL(cudaMemcpy(&fromIndex, &X->rowPointers[from], sizeof(int), cudaMemcpyDeviceToHost)); 440 | CUDA_CALL(cudaMemcpy(&toIndex , &X->rowPointers[from + nrows], sizeof(int), cudaMemcpyDeviceToHost)); 441 | 442 | dest->nnz = (toIndex - fromIndex); 443 | dest->m = nrows; 444 | dest->values = &X->values[fromIndex]; 445 | dest->columns = &X->columns[fromIndex]; 446 | dest->rowPointers = malloci((nrows + 1) * sizeof(int)); 447 | memcpy(dest->rowPointers, &X->rowPointers[from], (nrows + 1) * sizeof(int)); 448 | subtract_first_element(dest->rowPointers, nrows + 1); 449 | return dest; 450 | } 451 | 452 | SparseMatrix* transpose(const SparseMatrix* x, int ncol) { 453 | SparseMatrix* t = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 454 | t->values = //(float*) get_buffer(x->nnz * sizeof(float)); 455 | malloc(x->nnz * sizeof(float)); 456 | t->columns = //(int*) get_buffer(x->nnz * sizeof(int)); 457 | malloci(x->nnz * sizeof(int)); 458 | t->rowPointers = //(int*) get_buffer((ncol + 1) * sizeof(int)); 459 | malloci((ncol + 1) * sizeof(int)); 460 | t->nnz = x->nnz; 461 | t->m = ncol; 462 | CUSPARSE_CALL(cusparseScsr2csc(cusparse_handle, x->m, ncol, x->nnz, x->values, x->rowPointers, x->columns, t->values, 463 | t->columns, t->rowPointers, CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO)); 464 | 465 | return t; 466 | } 467 | 468 | // Useful for debugging 469 | void printm(const char* name, const SparseMatrix *a, int n, int m) const { 470 | printf("%s\n", name); 471 | printMatrixSPM(a, n, m, 0); 472 | } 473 | 474 | void printm(const char* name, const float* a, int n, int m) const { 475 | printf("%s\n", name); 476 | printMatrixRM(a, n, m, 0); 477 | } 478 | 479 | void printMatrixCM(const float* a, int n, int m, const char* fmt) const; 480 | void printMatrixRM(const float* a, int n, int m, const char* fmt) const; 481 | 482 | void printMatrixSP(const SparseMatrix* a, const char* fmt) const; 483 | void printMatrixRM(const SparseMatrix* a, int n, int m, const char* fmt) const { 484 | printMatrixSPM(a, n, m, fmt); 485 | } 486 | 487 | void printMatrixSPM(const SparseMatrix* a, int n, int m, const char* fmt) const; 488 | 489 | void prints(const float* f, unsigned l) const { 490 | float* src = (float*) std::malloc(l * sizeof(float)); 491 | copy_to_host(f, src, l * sizeof(float)); 492 | for (unsigned i = 0; i < l; ++i) { 493 | printf("%f ", src[i]); 494 | } 495 | printf("\n"); 496 | std::free(src); 497 | } 498 | 499 | void printsu(const int* f, unsigned l) const { 500 | int* src = (int*) std::malloc(l * sizeof(int)); 501 | copy_to_host(f, src, l * sizeof(int)); 502 | for (unsigned i = 0; i < l; ++i) { 503 | printf("%d ", src[i]); 504 | } 505 | printf("\n"); 506 | std::free(src); 507 | } 508 | }; 509 | -------------------------------------------------------------------------------- 
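Note on the kernels in gpu_operations.cu below: they all use the same grid-stride loop, with the launch geometry picked by get_grid_sizes. The following is only a sketch of how such a kernel is typically launched; launch_maximum is a hypothetical helper introduced here for illustration, the library's own wrapper is the GPU_Operations::maximum member defined in gpu_operations.cu.

// Sketch (assumption, not library code): launch an element-wise kernel over `size` elements.
void launch_maximum(float* d_x, float value, unsigned size) {
    int blocks, threads;
    get_grid_sizes(size, &blocks, &threads);              // heuristic taken from PyCUDA (defined below)
    maximum_eltw<<<blocks, threads>>>(d_x, value, size);  // each thread strides over the whole array
    CUDA_CALL(cudaGetLastError());                        // surface launch errors in debug builds
}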
/gpu_operations.cu: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2015-2017 Thomas Unterthiner 3 | Additional Contributions by Thomas Adler, Balázs Bencze 4 | Licensed under GPL, version 2 or a later (see LICENSE.txt) 5 | */ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "gpu_operations.h" 13 | 14 | static const int RNG_THREADS = 128; 15 | static const int RNG_BLOCKS = 128; 16 | 17 | // taken from PyCUDA 18 | void get_grid_sizes(int problemsize, int* blocks, int* threads) { 19 | int min_threads = 32; 20 | int max_threads = 256; 21 | int max_blocks = 384; 22 | 23 | if (problemsize < min_threads) { 24 | *blocks = 1; 25 | *threads = min_threads; 26 | } else if (problemsize < max_blocks * min_threads) { 27 | *blocks = (problemsize + min_threads - 1) / min_threads; 28 | *threads = min_threads; 29 | } else if (problemsize < max_blocks * max_threads) { 30 | *blocks = max_blocks; 31 | int grp = (problemsize + min_threads - 1) / min_threads; 32 | *threads = ((grp + max_blocks - 1) / max_blocks) * min_threads; 33 | } else { 34 | *blocks = max_blocks; 35 | *threads = max_threads; 36 | } 37 | } 38 | 39 | __global__ void setup_rng(curandState* rng_state, unsigned long seed) { 40 | const int tid = blockIdx.x * blockDim.x + threadIdx.x; 41 | curand_init(seed, tid, 0, &rng_state[tid]); 42 | } 43 | 44 | __global__ void dropout_eltw(float* x, const unsigned size, const float dropout_rate, curandState* rng_state) { 45 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 46 | const unsigned num_threads = gridDim.x * blockDim.x; 47 | curandState localState = rng_state[tid]; 48 | for (unsigned i = tid; i < size; i += num_threads) 49 | x[i] = (curand_uniform(&localState) < dropout_rate) ? 0.0 : x[i]; 50 | rng_state[tid] = localState; 51 | } 52 | 53 | __global__ void saltpepper_noise_eltw(float* x, const unsigned size, const float noise_rate, curandState* rng_state) { 54 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 55 | const unsigned num_threads = gridDim.x * blockDim.x; 56 | curandState localState = rng_state[tid]; 57 | for (unsigned i = tid; i < size; i += num_threads) 58 | if (curand_uniform(&localState) < noise_rate) { 59 | x[i] = (curand_uniform(&localState) < 0.5f) ? 0.0f : 1.0f; 60 | } 61 | rng_state[tid] = localState; 62 | 63 | } 64 | 65 | __global__ void gauss_noise_eltw(float* x, const unsigned size, const float noise_rate, curandState* rng_state) { 66 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 67 | const unsigned num_threads = gridDim.x * blockDim.x; 68 | curandState localState = rng_state[tid]; 69 | for (unsigned i = tid; i < size; i += num_threads) 70 | x[i] += curand_normal(&localState) * noise_rate; 71 | rng_state[tid] = localState; 72 | 73 | } 74 | 75 | __global__ void leaky_relu_eltw(float* x, const float value, const unsigned size) { 76 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 77 | const unsigned num_threads = gridDim.x * blockDim.x; 78 | for (unsigned i = tid; i < size; i += num_threads) { 79 | x[i] = (x[i] < 0.0f) ? 
x[i] * value : x[i]; 80 | } 81 | } 82 | 83 | __global__ void maximum_eltw(float* x, const float value, const unsigned size) { 84 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 85 | const unsigned num_threads = gridDim.x * blockDim.x; 86 | for (unsigned i = tid; i < size; i += num_threads) { 87 | x[i] = fmaxf(x[i], value); 88 | } 89 | } 90 | 91 | __global__ void sigmoid_eltw(float* x, const unsigned size) { 92 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 93 | const unsigned num_threads = gridDim.x * blockDim.x; 94 | for (unsigned i = tid; i < size; i += num_threads) { 95 | x[i] = 1 / (1 + __expf(-x[i])); 96 | } 97 | } 98 | 99 | __global__ void tanh_eltw(float* x, const unsigned size) { 100 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 101 | const unsigned num_threads = gridDim.x * blockDim.x; 102 | for (unsigned i = tid; i < size; i += num_threads) { 103 | x[i] = tanhf(x[i]); 104 | } 105 | } 106 | 107 | __global__ void softthreshold_eltw(float* x, float alpha, const unsigned size) { 108 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 109 | const unsigned num_threads = gridDim.x * blockDim.x; 110 | for (unsigned i = tid; i < size; i += num_threads) { 111 | const float f = x[i]; 112 | x[i] = f > 0 ? fmaxf(0., f - alpha) : fminf(0., f + alpha); 113 | } 114 | } 115 | 116 | __global__ void fill_eltw(float* x, const unsigned size, const float value) { 117 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 118 | const unsigned num_threads = gridDim.x * blockDim.x; 119 | for (unsigned i = tid; i < size; i += num_threads) { 120 | x[i] = value; 121 | } 122 | } 123 | 124 | __global__ void invert_eltw(float* x, const unsigned size) { 125 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 126 | const unsigned num_threads = gridDim.x * blockDim.x; 127 | for (unsigned i = tid; i < size; i += num_threads) { 128 | x[i] = 1.0f / x[i]; 129 | } 130 | } 131 | 132 | __global__ void col_variance_kernel(const float* X, float* var, const unsigned nrows, const unsigned ncols) { 133 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 134 | const unsigned num_threads = blockDim.x * gridDim.x; 135 | for (unsigned i = tid; i < ncols; i += num_threads) { 136 | var[i] = 0.0; 137 | for (unsigned j = 0; j < nrows; ++j) { 138 | var[i] += X[j * ncols + i]; 139 | } 140 | float m = var[i] / nrows; 141 | var[i] = 0.0; 142 | for (unsigned j = 0; j < nrows; ++j) { 143 | float tmp = X[j * ncols + i] - m; 144 | var[i] += tmp * tmp; 145 | } 146 | var[i] /= nrows; 147 | } 148 | } 149 | 150 | __global__ void invsqrt_eltw(float* x, const unsigned k) { 151 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 152 | const unsigned num_threads = blockDim.x * gridDim.x; 153 | for (unsigned i = tid; i < k; i += num_threads) { 154 | x[i] = (x[i] > 1e-7) ? 
rsqrtf(x[i]) : 1.0; 155 | } 156 | } 157 | 158 | __global__ void scale_columns_kernel(float* X, float* a, const unsigned nrows, const unsigned ncols) { 159 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 160 | const unsigned num_threads = blockDim.x * gridDim.x; 161 | for (unsigned i = tid; i < ncols * nrows; i += num_threads) { 162 | X[i] *= a[i % ncols]; 163 | } 164 | } 165 | 166 | __global__ void scale_rows_kernel(float* X, float* a, const unsigned nrows, const unsigned ncols) { 167 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 168 | const unsigned num_threads = blockDim.x * gridDim.x; 169 | for (unsigned i = tid; i < ncols * nrows; i += num_threads) { 170 | X[i] *= a[i / ncols]; 171 | } 172 | } 173 | 174 | __global__ void subtract_first_kernel(int* x, const unsigned len) { 175 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 176 | const unsigned num_threads = blockDim.x * gridDim.x; 177 | const unsigned elem = x[0]; 178 | for (unsigned i = tid; i < len; i += num_threads) { 179 | x[i] -= elem; 180 | } 181 | } 182 | 183 | __global__ void sparse_col_variance_kernel(const GPU_Operations::SparseMatrix X, float* var, const unsigned nrows, 184 | const unsigned ncols) { 185 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 186 | const unsigned num_threads = blockDim.x * gridDim.x; 187 | for (unsigned i = tid; i < ncols; i += num_threads) { 188 | var[i] = 0.0; 189 | for (unsigned j = 0; j < X.nnz; ++j) { 190 | if (X.columns[j] == i) { 191 | var[i] += X.values[j]; 192 | } 193 | } 194 | float m = var[i] / nrows; 195 | var[i] = 0.0; 196 | unsigned nonzero_per_column = 0; 197 | for (unsigned j = 0; j < X.nnz; ++j) { 198 | if (X.columns[j] == i) { 199 | float tmp = X.values[j] - m; 200 | var[i] += tmp * tmp; 201 | nonzero_per_column++; 202 | } 203 | } 204 | var[i] += (nrows - nonzero_per_column) * (m * m); 205 | var[i] /= nrows; 206 | } 207 | } 208 | 209 | __global__ void sparse_row_variance_kernel(const GPU_Operations::SparseMatrix X, float* var, const unsigned nrows, 210 | const unsigned ncols) { 211 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 212 | const unsigned num_threads = blockDim.x * gridDim.x; 213 | for (unsigned i = tid; i < nrows; i += num_threads) { 214 | var[i] = 0.0; 215 | int from = X.rowPointers[i]; 216 | int to = X.rowPointers[i + 1]; 217 | for (int j = from; j < to; ++j) { 218 | var[i] += X.values[j]; 219 | } 220 | float m = var[i] / ncols; 221 | var[i] = 0.0; 222 | for (int j = from; j < to; ++j) { 223 | float tmp = X.values[j] - m; 224 | var[i] += tmp * tmp; 225 | } 226 | var[i] += (ncols - to + from) * (m * m); 227 | var[i] /= ncols; 228 | } 229 | } 230 | 231 | __global__ void sparse_scale_columns_kernel(GPU_Operations::SparseMatrix X, float* a, const unsigned nrows, const unsigned ncols) { 232 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 233 | const unsigned num_threads = blockDim.x * gridDim.x; 234 | for (unsigned i = tid; i < X.nnz; i += num_threads) { 235 | X.values[i] *= a[X.columns[i]]; 236 | } 237 | } 238 | 239 | __global__ void sparse_scale_rows_kernel(GPU_Operations::SparseMatrix X, float* a) { 240 | const unsigned tid = blockIdx.x * blockDim.x + threadIdx.x; 241 | const unsigned num_threads = blockDim.x * gridDim.x; 242 | for (unsigned i = tid; i < X.m; i += num_threads) { 243 | for (unsigned j = X.rowPointers[i]; j < X.rowPointers[i + 1]; ++j) { 244 | X.values[j] *= a[i]; 245 | } 246 | } 247 | } 248 | 249 | GPU_Operations::GPU_Operations(const int n, const int m, const int k, 
unsigned long seed, int gpu_id) {
250 |     // if no GPU was specified, try to pick the best one automatically
251 |     if (gpu_id < 0) {
252 |         gpu_id = 0;
253 |         int num_devices, device;
254 |         cudaGetDeviceCount(&num_devices);
255 |         if (num_devices > 1) {
256 |             size_t max_freememory = 0;
257 |             for (device = 0; device < num_devices; device++) {
258 |                 size_t free, total;
259 |                 cudaSetDevice(device);
260 |                 cudaMemGetInfo(&free, &total);
261 |                 cudaDeviceProp prop;
262 |                 cudaGetDeviceProperties(&prop, device);
263 |                 //printf("Found device %d (%s) with %d MiB of free memory\n",
264 |                 //       device, prop.name, free / (1024l*1024l));
265 |                 if (free > max_freememory) {
266 |                     max_freememory = free;
267 |                     gpu_id = device;
268 |                 }
269 |                 cudaDeviceReset();
270 |             }
271 |         }
272 |     }
273 |     assert(gpu_id >= 0);
274 |     cudaSetDevice(gpu_id);
275 | 
276 |     // the following call does not work if the current process has already
277 |     // called into librfn previously. Then, this call will return
278 |     // cudaErrorSetOnActiveProcess. Resetting the device won't work either,
279 |     // because then the subsequent cublasCreate call will just fail with
280 |     // CUBLAS_STATUS_NOT_INITIALIZED. I don't know why any of this is happening
281 |     //CUDA_CALL(cudaSetDeviceFlags(cudaDeviceScheduleYield));
282 | 
283 |     cublasStatus_t status = cublasCreate(&handle);
284 |     if (status != CUBLAS_STATUS_SUCCESS) {
285 |         const char* errmsg = cublasErrorString(status);
286 |         fprintf(stderr, "CUBLAS initialization error: %s\n", errmsg);
287 |         cudaDeviceReset();
288 |         throw std::runtime_error(errmsg);
289 |     }
290 |     CUSOLVER_CALL(cusolverDnCreate(&cudense_handle));
291 |     CUDA_CALL(cudaMalloc(&rng_state, RNG_BLOCKS * RNG_THREADS * sizeof(curandState)));
292 |     setup_rng<<<RNG_BLOCKS, RNG_THREADS>>>(rng_state, seed);
293 |     int ones_size = n > k ? n : k;
294 |     ones = (float*) malloc(ones_size * sizeof(float));
295 |     fill(ones, ones_size, 1.0f);
296 |     CUDA_CALL(cudaMalloc(&devinfo, sizeof(int)));
297 | 
298 |     cusparseStatus_t sp_status = cusparseCreate(&cusparse_handle);
299 |     if (sp_status != CUSPARSE_STATUS_SUCCESS) {
300 |         fprintf(stderr, "cuSparse: %d\n", sp_status);
301 |         cudaDeviceReset();
302 |         throw std::runtime_error("cuSparse error");
303 |     }
304 | 
305 |     for (int i = 0; i < MAX_STREAMS; i++) {
306 |         CUDA_CALL(cudaStreamCreate(&streams[i]));
307 |     }
308 | 
309 |     CUSPARSE_CALL(cusparseCreateMatDescr(&descr));
310 |     CUSPARSE_CALL(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
311 |     CUSPARSE_CALL(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
312 | }
313 | 
314 | GPU_Operations::~GPU_Operations() {
315 |     free(devinfo);
316 |     free(ones);
317 |     for (auto i : buffer_map) {
318 |         free(i.second);
319 |     }
320 |     CUSOLVER_CALL(cusolverDnDestroy(cudense_handle));
321 |     CUBLAS_CALL(cublasDestroy(handle));
322 |     for (int i = 0; i < MAX_STREAMS; i++) {
323 |         CUDA_CALL(cudaStreamSynchronize(streams[i]));
324 |         CUDA_CALL(cudaStreamDestroy(streams[i]));
325 |     }
326 |     CUSPARSE_CALL(cusparseDestroyMatDescr(descr));
327 | }
328 | 
329 | GPU_Operations::SparseMatrix GPU_Operations::create_sparse_matrix(const float* Xvals, const int* Xcols, const int *Xrowptr, int n, int m){
330 |     SparseMatrix X = {(float*) Xvals, (int*) Xcols, (int*) Xrowptr, n, Xrowptr[n]};
331 |     return X;
332 | }
333 | 
334 | 
335 | float* GPU_Operations::to_device(const float* src, size_t size) const {
336 |     float* dst = 0;
337 |     CUDA_CALL(cudaMalloc(&dst, size));
338 |     CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
339 |     return dst;
340 | }
341 | 
342 | int* GPU_Operations::to_device(const int* src, size_t size) const {
343 |     int* dst = 0;
344 |     CUDA_CALL(cudaMalloc(&dst, size));
345 |     CUDA_CALL(cudaMemcpy(dst, src, size, cudaMemcpyHostToDevice));
346 |     return dst;
347 | }
348 | 
349 | GPU_Operations::SparseMatrix* GPU_Operations::to_device(const SparseMatrix* src, size_t size) const {
350 |     SparseMatrix* dst = (SparseMatrix*) std::malloc(sizeof(SparseMatrix));
351 | 
352 |     dst->values = to_device(src->values, src->nnz * sizeof(float));
353 |     dst->columns = to_device(src->columns, src->nnz * sizeof(int));
354 |     dst->rowPointers = to_device(src->rowPointers, (src->m + 1) * sizeof(int));
355 |     dst->m = src->m;
356 |     dst->nnz = src->nnz;
357 | 
358 |     return dst;
359 | }
360 | 
361 | void GPU_Operations::fill(float* X, const unsigned size, const float value) const {
362 |     int threads, blocks;
363 |     get_grid_sizes(size, &threads, &blocks);
364 |     fill_eltw<<<blocks, threads>>>(X, size, value);
365 |     assert(!cudaGetLastError());
366 | }
367 | 
368 | void GPU_Operations::dropout(float* X, const unsigned size, const float dropout_rate) const {
369 |     dropout_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X, size, dropout_rate, rng_state);
370 |     assert(!cudaGetLastError());
371 | }
372 | 
373 | void GPU_Operations::add_gauss_noise(float* X, const unsigned size, const float noise_rate) const {
374 |     gauss_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X, size, noise_rate, rng_state);
375 |     assert(!cudaGetLastError());
376 | }
377 | 
378 | void GPU_Operations::add_saltpepper_noise(float* X, const unsigned size, const float noise_rate) const {
379 |     saltpepper_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X, size, noise_rate, rng_state);
380 |     assert(!cudaGetLastError());
381 | }
382 | 
383 | void GPU_Operations::invert(float* X, const unsigned size) const {
384 |     int threads, blocks;
385 |     get_grid_sizes(size, &threads, &blocks);
386 |     invert_eltw<<<blocks, threads>>>(X, size);
387 |     assert(!cudaGetLastError());
388 | }
389 | 
390 | void GPU_Operations::maximum(float* x, const float value, const unsigned size) const {
391 |     int threads, blocks;
392 |     get_grid_sizes(size, &threads, &blocks);
393 |     maximum_eltw<<<blocks, threads>>>(x, value, size);
394 |     assert(!cudaGetLastError());
395 | }
396 | 
397 | void GPU_Operations::leaky_relu(float* x, const float value, const unsigned size) const {
398 |     int threads, blocks;
399 |     get_grid_sizes(size, &threads, &blocks);
400 |     leaky_relu_eltw<<<blocks, threads>>>(x, value, size);
401 |     assert(!cudaGetLastError());
402 | }
403 | 
404 | void GPU_Operations::sigmoid(float* x, const unsigned size) const {
405 |     int threads, blocks;
406 |     get_grid_sizes(size, &threads, &blocks);
407 |     sigmoid_eltw<<<blocks, threads>>>(x, size);
408 |     assert(!cudaGetLastError());
409 | }
410 | 
411 | void GPU_Operations::tanh(float* x, const unsigned size) const {
412 |     int threads, blocks;
413 |     get_grid_sizes(size, &threads, &blocks);
414 |     tanh_eltw<<<blocks, threads>>>(x, size);
415 |     assert(!cudaGetLastError());
416 | }
417 | 
418 | void GPU_Operations::soft_threshold(float* x, const float alpha, const int size) const {
419 |     int threads, blocks;
420 |     get_grid_sizes(size, &threads, &blocks);
421 |     softthreshold_eltw<<<blocks, threads>>>(x, alpha, size);
422 |     assert(!cudaGetLastError());
423 | }
424 | 
425 | void GPU_Operations::fill_eye(float* X, unsigned n) const {
426 |     memset(X, 0, n * n * sizeof(float));
427 |     axpy(n, 1.0f, ones, 0, X, n + 1);
428 | }
429 | 
430 | void GPU_Operations::calculate_column_variance(const float* X, const unsigned nrows, const unsigned ncols,
431 |         float* variance) const {
432 |     int threads, blocks;
433 |     get_grid_sizes(ncols, &threads, &blocks);
434 |     col_variance_kernel<<<blocks, threads>>>(X, variance, nrows, ncols);
435 | }
436 | 
437 | void GPU_Operations::invsqrt(float* s, const unsigned n) const {
438 |     int t, b;
439 |     get_grid_sizes(n, &t, &b);
440 |     invsqrt_eltw<<<b, t>>>(s, n);
441 | }
442 | 
443 | void GPU_Operations::scale_columns(float* X, const unsigned nrows, const unsigned ncols, float* s) const {
444 | 
445 |     int threads, blocks;
446 |     get_grid_sizes(ncols * nrows, &threads, &blocks);
447 |     scale_columns_kernel<<<blocks, threads>>>(X, s, nrows, ncols);
448 | }
449 | 
450 | void GPU_Operations::scale_rows(float* X, const unsigned nrows, const unsigned ncols, float* s) const {
451 |     int threads, blocks;
452 |     get_grid_sizes(ncols * nrows, &threads, &blocks);
453 |     scale_rows_kernel<<<blocks, threads>>>(X, s, nrows, ncols);
454 | }
455 | 
456 | void GPU_Operations::subtract_first_element(int* a, unsigned len) const {
457 |     int threads, blocks;
458 |     get_grid_sizes(len, &threads, &blocks);
459 |     subtract_first_kernel<<<blocks, threads>>>(a, len);
460 | }
461 | 
462 | void GPU_Operations::calculate_column_variance(const SparseMatrix* X, const unsigned nrows, const unsigned ncols,
463 |         float* variance) {
464 |     int threads, blocks;
465 |     SparseMatrix* x_transpose = transpose(X, ncols);
466 |     get_grid_sizes(nrows, &threads, &blocks);
467 |     sparse_row_variance_kernel<<<blocks, threads>>>(*x_transpose, variance, ncols, nrows);
468 |     free(x_transpose->columns);
469 |     free(x_transpose->values);
470 |     free(x_transpose->rowPointers);
471 |     std::free(x_transpose);
472 | 
473 | }
474 | 
475 | void GPU_Operations::scale_columns(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const {
476 | 
477 |     int threads, blocks;
478 |     get_grid_sizes(X->nnz, &threads, &blocks);
479 |     sparse_scale_columns_kernel<<<blocks, threads>>>(*X, s, nrows, ncols);
480 | }
481 | 
482 | void GPU_Operations::scale_rows(SparseMatrix* X, const unsigned nrows, const unsigned ncols, float* s) const {
483 |     int threads, blocks;
484 |     get_grid_sizes(X->m, &threads, &blocks);
485 |     sparse_scale_rows_kernel<<<blocks, threads>>>(*X, s);
486 | }
487 | 
488 | void GPU_Operations::dropout(SparseMatrix* X, const unsigned size, const float dropout_rate) const {
489 |     dropout_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X->values, size, dropout_rate, rng_state);
490 |     assert(!cudaGetLastError());
491 | }
492 | 
493 | void GPU_Operations::add_gauss_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const {
494 |     gauss_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X->values, size, noise_rate, rng_state);
495 |     assert(!cudaGetLastError());
496 | }
497 | 
498 | void GPU_Operations::add_saltpepper_noise(SparseMatrix* X, const unsigned size, const float noise_rate) const {
499 |     saltpepper_noise_eltw<<<RNG_BLOCKS, RNG_THREADS>>>(X->values, size, noise_rate, rng_state);
500 |     assert(!cudaGetLastError());
501 | }
502 | 
503 | void GPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, const float alpha,
504 |         const SparseMatrix* a, const int lda, const float *b, const int ldb, const float beta, float *c,
505 |         const int ldc) {
506 |     cusparseOperation_t opA = op_to_cusparse(transa);
507 |     cusparseOperation_t opB = op_to_cusparse(transb);
508 | 
509 |     SparseMatrix* row_major_a = transpose(a, opA != CUSPARSE_OPERATION_NON_TRANSPOSE ? k : m);
510 | 
511 |     int ncol_a = k;
512 |     if (opA != CUSPARSE_OPERATION_NON_TRANSPOSE) {
513 |         ncol_a = a->m;
514 |     }
515 | 
516 |     CUSPARSE_CALL(cusparseScsrmm2(cusparse_handle, opA, opB, row_major_a->m, n, ncol_a,
517 |         row_major_a->nnz, &alpha, descr, row_major_a->values, row_major_a->rowPointers, row_major_a->columns, b, ldb, &beta, c, ldc));
518 | 
519 | 
520 |     free(row_major_a->columns);
521 |     free(row_major_a->values);
522 |     free(row_major_a->rowPointers);
523 |     std::free(row_major_a);
524 | }
525 | 
526 | /*void GPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k,
527 |         const float alpha, const float *a, const int lda, const SparseMatrix* b, const int ldb,
528 |         const float beta, float *c, const int ldc) {
529 |     cusparseOperation_t opA = op_to_cusparse(transa);
530 |     cusparseOperation_t opB = op_to_cusparse(transb);
531 |     SparseMatrix* b_trans;
532 | 
533 |     if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) {
534 |         b_trans = transpose(b, n);
535 |     } else {
536 |         b_trans = (SparseMatrix*) std::malloc(sizeof(SparseMatrix));
537 |         b_trans->values = b->values;
538 |         b_trans->columns = b->columns;
539 |         b_trans->rowPointers = b->rowPointers;
540 |         b_trans->m = b->m;
541 |         b_trans->nnz = b->nnz;
542 |     }
543 | 
544 |     int m_a = m; // number of rows of A
545 |     int n_a = k; // number of columns of A
546 |     if (opA != CUSPARSE_OPERATION_NON_TRANSPOSE) {
547 |         m_a = k;
548 |         n_a = m;
549 |     }
550 | 
551 |     int bufsize;
552 |     CUSPARSE_CALL(cusparseSgemvi_bufferSize(cusparse_handle, opA, m_a, n_a, b_trans->nnz, &bufsize));
553 |     void* buffer = get_buffer(bufsize);
554 | 
555 |     int* row_pointers = (int*) std::malloc((b_trans->m + 1) * sizeof(int));
556 |     copy_to_host(b_trans->rowPointers, row_pointers, (b_trans->m + 1) * sizeof(int));
557 | 
558 |     for(unsigned r = 0; r < b_trans->m; ++r) {
559 |         int row_pointer = row_pointers[r];
560 |         int nnz = row_pointers[r + 1] - row_pointer;
561 | 
562 |         set_stream(r);
563 | 
564 |         if (nnz == 0) {
565 |             CUBLAS_CALL(cublasSscal_v2(handle, n, &beta, &c[r * ldc], 1));
566 |         } else if (nnz > 0) {
567 |             CUSPARSE_CALL(cusparseSgemvi(cusparse_handle, opA, m_a, n_a, &alpha, a, lda, nnz,
568 |                 &b_trans->values[row_pointer], &b_trans->columns[row_pointer], &beta, &c[r * ldc], CUSPARSE_INDEX_BASE_ZERO,
buffer)); 569 | } else { 570 | printf("Internal error"); 571 | exit(1); 572 | } 573 | } 574 | 575 | synchronize_all_streams(); 576 | default_stream(); 577 | 578 | if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) { 579 | free(b_trans->values); 580 | free(b_trans->columns); 581 | free(b_trans->rowPointers); 582 | } 583 | std::free(b_trans); 584 | std::free(row_pointers); 585 | }*/ 586 | 587 | void GPU_Operations::gemm(const char *transa, const char *transb, const int m, const int n, const int k, 588 | const float alpha, const float *a, const int lda, const SparseMatrix* b, const int ldb, 589 | const float beta, float *c, const int ldc) { 590 | cusparseOperation_t opA = op_to_cusparse(transa); 591 | cusparseOperation_t opB = op_to_cusparse(transb); 592 | SparseMatrix* b2; 593 | float alpha_t = 1.0f; 594 | float beta_t = 0.0f; 595 | 596 | //3) 597 | int b2_ncol = 0; 598 | if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) { 599 | b2 = transpose(b, n); 600 | b2_ncol = b->m; 601 | } else { 602 | b2 = (SparseMatrix*) std::malloc(sizeof(SparseMatrix)); 603 | b2->values = b->values; 604 | b2->columns = b->columns; 605 | b2->rowPointers = b->rowPointers; 606 | b2->m = b->m; 607 | b2->nnz = b->nnz; 608 | b2_ncol = k; 609 | } 610 | //4) 611 | float* c2 = (float*) get_buffer(m*n * sizeof(float)); 612 | memcpy(c2, c, m*n * sizeof(float)); 613 | if (beta != 0.0f) { 614 | CUBLAS_CALL(cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, n, m, &alpha_t, c, ldc, &beta_t, NULL, 0, c2, ldc)); 615 | } 616 | 617 | // 4.5 618 | cusparseOperation_t opA2; 619 | if (opA == CUSPARSE_OPERATION_NON_TRANSPOSE) { 620 | opA2 = CUSPARSE_OPERATION_TRANSPOSE; 621 | } else { 622 | opA2 = CUSPARSE_OPERATION_NON_TRANSPOSE; 623 | } 624 | 625 | //5) 626 | CUSPARSE_CALL(cusparseScsrmm2(cusparse_handle, CUSPARSE_OPERATION_NON_TRANSPOSE, opA2, b2->m, m, b2_ncol, b2->nnz, &alpha, descr, 627 | b2->values, b2->rowPointers, b2->columns, a, lda, &beta, c2, b2->m)); 628 | 629 | //6 630 | CUBLAS_CALL(cublasSgeam(handle, CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha_t, c2, b2->m, &beta_t, (float*)0, b2->m, c, ldc)); 631 | if (opB != CUSPARSE_OPERATION_NON_TRANSPOSE) { 632 | free(b2->columns); 633 | free(b2->values); 634 | free(b2->rowPointers); 635 | } 636 | std::free(b2); 637 | 638 | } 639 | 640 | // Debugging 641 | void GPU_Operations::printMatrixRM(const float* a, int n, int m, const char* fmt) const { 642 | const char* format = fmt == 0 ? "%1.3f " : fmt; 643 | size_t size = n * m * sizeof(float); 644 | float* tmp = (float*) std::malloc(size); 645 | CUDA_CALL(cudaMemcpy(tmp, a, size, cudaMemcpyDeviceToHost)); 646 | for (int i = 0; i < n; ++i) { 647 | for (int j = 0; j < m; ++j) 648 | printf(format, tmp[i * m + j]); 649 | printf("\n"); 650 | } 651 | printf("\n"); 652 | std::free(tmp); 653 | } 654 | 655 | void GPU_Operations::printMatrixCM(const float* a, int n, int m, const char* fmt) const { 656 | const char* format = fmt == 0 ? "%1.3f " : fmt; 657 | size_t size = n * m * sizeof(float); 658 | float* tmp = (float*) std::malloc(size); 659 | CUDA_CALL(cudaMemcpy(tmp, a, size, cudaMemcpyDeviceToHost)); 660 | for (int i = 0; i < n; ++i) { 661 | for (int j = 0; j < m; ++j) 662 | printf(format, tmp[i + j * n]); 663 | printf("\n"); 664 | } 665 | printf("\n"); 666 | std::free(tmp); 667 | } 668 | 669 | void GPU_Operations::printMatrixSP(const SparseMatrix *a, const char* fmt) const { 670 | const char* format = fmt == 0 ? 
"%1.3f " : fmt; 671 | size_t size_values = a->nnz * sizeof(float); 672 | size_t size_columns = a->nnz * sizeof(int); 673 | size_t size_pointers = (a->m + 1)* sizeof(int); 674 | 675 | float* tmp_vals = (float*) std::malloc(size_values); 676 | int* tmp_cols = (int*) std::malloc(size_columns); 677 | int* tmp_pointers = (int*) std::malloc(size_pointers); 678 | 679 | CUDA_CALL(cudaMemcpy(tmp_vals, a->values, size_values, cudaMemcpyDeviceToHost)); 680 | CUDA_CALL(cudaMemcpy(tmp_cols, a->columns, size_columns, cudaMemcpyDeviceToHost)); 681 | CUDA_CALL(cudaMemcpy(tmp_pointers, a->rowPointers, size_pointers, cudaMemcpyDeviceToHost)); 682 | 683 | printf("values: "); 684 | for (int i = 0; i < a->nnz; i++) { 685 | printf(format, tmp_vals[i]); 686 | } 687 | printf("\npointers: "); 688 | for (int i = 0; i < a->m + 1; i++) { 689 | printf("%d ", tmp_pointers[i]); 690 | } 691 | printf("\ncolumns: "); 692 | for (int i = 0; i < a->nnz; i++) { 693 | printf("%d ", tmp_cols[i]); 694 | } 695 | printf("\n"); 696 | std::free(tmp_vals); 697 | std::free(tmp_cols); 698 | std::free(tmp_pointers); 699 | } 700 | 701 | void GPU_Operations::printMatrixSPM(const SparseMatrix *a, int n, int m, const char* fmt) const { 702 | const char* format = fmt == 0 ? "%1.3f " : fmt; 703 | size_t size_values = a->nnz * sizeof(float); 704 | size_t size_columns = a->nnz * sizeof(int); 705 | size_t size_pointers = (a->m + 1)* sizeof(int); 706 | 707 | float* tmp_vals = (float*) std::malloc(size_values); 708 | int* tmp_cols = (int*) std::malloc(size_columns); 709 | int* tmp_pointers = (int*) std::malloc(size_pointers); 710 | 711 | CUDA_CALL(cudaMemcpy(tmp_vals, a->values, size_values, cudaMemcpyDeviceToHost)); 712 | CUDA_CALL(cudaMemcpy(tmp_cols, a->columns, size_columns, cudaMemcpyDeviceToHost)); 713 | CUDA_CALL(cudaMemcpy(tmp_pointers, a->rowPointers, size_pointers, cudaMemcpyDeviceToHost)); 714 | 715 | for (int i = 0; i < n; i++) { 716 | int rowPointer = tmp_pointers[i]; 717 | int nnz = tmp_pointers[i + 1] - rowPointer; 718 | int found = 0; 719 | for (int j = 0; j < m; j++) { 720 | if (found < nnz) { 721 | if (j == tmp_cols[rowPointer + found]) { 722 | printf(format, tmp_vals[rowPointer + found]); 723 | found++; 724 | } else { 725 | printf(format, 0.0f); 726 | } 727 | } else { 728 | printf(format, 0.0f); 729 | } 730 | } 731 | printf("\n"); 732 | } 733 | printf("\n"); 734 | std::free(tmp_vals); 735 | std::free(tmp_cols); 736 | std::free(tmp_pointers); 737 | } 738 | --------------------------------------------------------------------------------