├── README.md ├── code ├── README.md ├── test.json ├── cuda_svd_ops │ ├── svd_inv_prod_gpu.cu.cc │ ├── svd_prod_gpu.cu.cc │ ├── Makefile │ ├── svd_prod_gpu.cc │ ├── svd_inv_prod_gpu.cc │ ├── gpu_unit_inv_test.py │ ├── gpu_unit_test.py │ ├── grad_svd_prod_gpu.cc │ ├── grad_svd_inv_prod_gpu.cc │ ├── grad_svd_prod_gpu.cu.cc │ └── grad_svd_inv_prod_gpu.cu.cc ├── magma_svd_ops │ ├── Makefile │ ├── svd_block_prod_gpu.cc │ ├── gpu_unit_test.py │ ├── grad_svd_block_prod_gpu.cc │ ├── svd_block_prod_gpu.cu.cc │ └── grad_svd_block_prod_gpu.cu.cc ├── main.py ├── load.py ├── Params.py ├── svd_ops.py ├── spectral_rnn.py └── rnn.py └── data └── Adding_task └── generate_data.py /README.md: -------------------------------------------------------------------------------- 1 | # Spectral-RNN 2 | Implementation of Spectral-RNN (Stabilizing Gradients for Deep Neural Networks via Efficient SVD Parameterization) 3 | 4 | by Jiong Zhang, Qi Lei, Inderjit S. Dhillon 5 | 6 | 7 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | #BLAS2 operator: 2 | 3 | Requires: 4 | CUDA, cuBLAS, cudnn, tensorflow-gpu 5 | 6 | Compile: 7 | cd ./cuda_svd_ops 8 | make 9 | 10 | #BLAS3 operator: 11 | 12 | Requires: 13 | CUDA, cuBLAS, cudnn, MAGMA, tensorflow-gpu 14 | 15 | Compile: 16 | cd ./magma_svd_ops 17 | make 18 | 19 | #Running: 20 | 21 | python main.py test.json 22 | 23 | 24 | -------------------------------------------------------------------------------- /code/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell" : "LSTM", 3 | "initial_learning_rate" : 0.001, 4 | "lr_decay" : 0.99, 5 | "num_epochs" : 100, 6 | "dropout_keep_rate" : 1.0, 7 | "num_units" : 128, 8 | "r_size" : 16, 9 | "r_margin": 0.01, 10 | "gpu_flag" : 1, 11 | "batch_size" : 128, 12 | "random_seed" : 1000, 13 | "dataset" : "add", 14 | "time_steps" : 100, 15 | "model_dir" : "results/model", 16 | "pred_dir" : "results/pred", 17 | "load_model" : 0, 18 | "train_flag" : 1, 19 | "batch_norm" : 0, 20 | "display_epoch_num" : 10 21 | } 22 | -------------------------------------------------------------------------------- /data/Adding_task/generate_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os 3 | 4 | cwd = os.getcwd() 5 | 6 | 7 | N = 101000 8 | L = 100 9 | 10 | data_file = cwd + '/data'+str(L) 11 | 12 | sigma = 1; mu = 0 13 | 14 | np.random.seed(0) 15 | 16 | dataF = np.random.rand(N,L) 17 | dataI = np.zeros((N,L)) 18 | dataY = np.zeros((N,)) 19 | print dataY.shape 20 | 21 | IdcLow = np.random.randint(0,L/2, size=N) 22 | IdcHigh = np.random.randint(L/2,L, size=N) 23 | for i in range(N): 24 | dataI[i,IdcLow[i]]=1.0 25 | dataI[i,IdcHigh[i]]=1.0 26 | dataY[i] = dataF[i,IdcLow[i]] + dataF[i,IdcHigh[i]] 27 | 28 | data = np.zeros((N,2*L+1)) 29 | data[:,0] = dataY 30 | data[:, 1::2] = dataF 31 | data[:, 2::2] = dataI 32 | 33 | try: 34 | os.remove(data_file) 35 | except OSError: 36 | pass 37 | 38 | np.savetxt(data_file, data, fmt='%.5f', delimiter = ',') 39 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_inv_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | #define EIGEN_USE_GPU 3 | #include 4 | #include 5 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 6 | 7 | 8 | int inline 
Hprod(cublasHandle_t handle, float* H, const float* u, float* alpha, const int k, const int n_h, const int batch) { 9 | 10 | cublasStatus_t stat; 11 | float aa = 0; 12 | float bb = 0; 13 | float cc = -1.0; 14 | // aa = 2.0 / u^T * u 15 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 16 | aa = 2.0 / aa; 17 | // make sure that leading (n_h-k) entrees of u are 0 18 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 19 | // compute alpha = aa * H^T * u 20 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 21 | &aa, H, n_h, 22 | u, 1, 23 | &bb, alpha, 1); 24 | // update H 25 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H, n_h); 26 | 27 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 28 | return EXIT_SUCCESS; 29 | } 30 | 31 | // host function for CUDA kernels 32 | int SvdInvProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r) { 33 | cublasStatus_t stat; 34 | cudaError_t cudaStat; 35 | cublasHandle_t handle; 36 | // creat handle 37 | stat = cublasCreate_v2(&handle); 38 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on SvdInvProd\n"); return EXIT_FAILURE; } 39 | // allocate alpha 40 | float* alpha; 41 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 42 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 43 | // begin computation 44 | stat = cublasScopy(handle, n_h * batch , H, 1, H_out, 1); // fill H_out with H 45 | 46 | for(int r=n_r-1; r >= 0; r--) { 47 | Hprod(handle, H_out, U + n_h*r, alpha, n_h - r, n_h, batch); 48 | } 49 | cudaFree(alpha); 50 | cublasDestroy(handle); 51 | return EXIT_SUCCESS; 52 | } 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | #define EIGEN_USE_GPU 3 | #include 4 | #include 5 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 6 | 7 | 8 | // CUDA kernel TODO 9 | __global__ void SvdProdGpuKernel(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r) { 10 | //for (int i = 1; i < N; i++) t_out(i) = 0; 11 | //T_out(0) = T_in(0); 12 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n_h*batch; i += blockDim.x * gridDim.x) { 13 | H_out[i] = 2.0 ; 14 | } 15 | } 16 | 17 | 18 | int inline Hprod(cublasHandle_t handle, float* H, const float* u, float* alpha, const int k, const int n_h, const int batch) { 19 | 20 | cublasStatus_t stat; 21 | float aa = 0; 22 | float bb = 0; 23 | float cc = -1.0; 24 | // aa = 2.0 / u^T * u 25 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 26 | aa = 2.0 / aa; 27 | // make sure that leading (n_h-k) entrees of u are 0 28 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 29 | // compute alpha = aa * H^T * u 30 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 31 | &aa, H, n_h, 32 | u, 1, 33 | &bb, alpha, 1); 34 | // update H 35 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H, n_h); 36 | 37 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 38 | return EXIT_SUCCESS; 39 | } 40 | 41 | // host function for CUDA kernels 42 | int SvdProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r) { 43 | cublasStatus_t stat; 44 | cudaError_t cudaStat; 45 | cublasHandle_t handle; 46 | // creat handle 47 | 
stat = cublasCreate_v2(&handle); 48 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on SvdProd\n"); return EXIT_FAILURE; } 49 | // allocate alpha 50 | float* alpha; 51 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 52 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 53 | // begin computation 54 | stat = cublasScopy(handle, n_h * batch , H, 1, H_out, 1); // fill H_out with H 55 | 56 | for(int r=0; r < n_r; r++) { 57 | Hprod(handle, H_out, U + n_h*r, alpha, n_h - r, n_h, batch); 58 | } 59 | cudaFree(alpha); 60 | cublasDestroy(handle); 61 | return EXIT_SUCCESS; 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /code/magma_svd_ops/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | OS := $(shell uname) 3 | 4 | ifeq ($(OS),Darwin) # Mac OS X 5 | OSFLAGS = -undefined dynamic_lookup 6 | CXX = clang++ 7 | else 8 | CXX = g++ 9 | OSFLAGS = 10 | endif 11 | 12 | TF_INC = `python -c "import tensorflow; print(tensorflow.sysconfig.get_include())"` 13 | TF_LIB = `python -c "import tensorflow; print(tensorflow.sysconfig.get_lib())"` 14 | 15 | # specify your magma dir and MKL dir here 16 | MAGMA =/work/03941/jiongdys/maverick/magma 17 | CUDADIR =/opt/apps/cuda/8.0/ 18 | MKL = /opt/apps/intel/15/composer_xe_2015.3.187/mkl/include 19 | MKLLIB = /opt/apps/intel/15/composer_xe_2015.3.187/mkl/lib/intel64 20 | 21 | CC = gcc -O2 -pthread 22 | GPUCC = nvcc 23 | CFLAGS = -O3 -std=c++11 -L$(TF_LIB) -I$(TF_INC) -I$(TF_INC)/external/nsync/public -DADD_ -ltensorflow_framework 24 | 25 | 26 | MAGMA_I = -I${MAGMA}/include -I${MAGMA}/testing -I${MAGMA}/control -I${MKL} 27 | MAG_FLAGS = -m64 -DNDEBUG -O3 -Wall -Wshadow -DMAGMA_NOAFFINITY -pedantic -Wno-long-long -DHAVE_CUBLAS -DMIN_CUDA_ARCH=300 -c ${MAGMA_I} -I${CUDADIR}/include 28 | CMAG_FLAGS= -m64 -fPIC ${MAGMA_I} -Wl,-rpath,${MAGMA}/lib -L${MAGMA}/lib -L${MKLLIB} -lstdc++ -lm #-framework Accelerate -lblas_fix 29 | 30 | GPUCFLAGS = -c -arch=sm_30 --expt-relaxed-constexpr ${MAGMA_I} 31 | 32 | LFLAGS = -pthread -shared -fPIC 33 | GPULFLAGS = -x cu -Xcompiler -fPIC 34 | GPUDEF = -DGOOGLE_CUDA=1 35 | CGPUFLAGS = -lcuda -lcublas -lmagma # -lmagmablas 36 | 37 | SRC = svd_block_prod_gpu.cc 38 | GPUSRC = svd_block_prod_gpu.cu.cc 39 | PROD = svd_block_prod_gpu.so 40 | GPUPROD = svd_block_prod_cu_gpu.o 41 | 42 | GRAD_SRC = grad_svd_block_prod_gpu.cc 43 | GRAD_GPUSRC = grad_svd_block_prod_gpu.cu.cc 44 | GRAD_PROD = grad_svd_block_prod_gpu.so 45 | GRAD_GPUPROD = grad_svd_block_prod_cu_gpu.o 46 | 47 | default: gpu gpu-grad 48 | 49 | gpu: 50 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GPUPROD) 51 | $(CXX) $(CFLAGS) ${CMAG_FLAGS} $(SRC) $(GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(PROD) 52 | 53 | gpu-grad: 54 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GRAD_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GRAD_GPUPROD) 55 | $(CXX) $(CFLAGS) ${CMAG_FLAGS} $(GRAD_SRC) $(GRAD_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(GRAD_PROD) 56 | 57 | clean: 58 | rm -f $(TEST_PROD) $(TEST_FINAL) $(PROD) $(GPUPROD) $(GRAD_PROD) $(GRAD_GPUPROD) 59 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | OS := $(shell uname) 3 | 4 | ifeq ($(OS),Darwin) # Mac OS X 5 | OSFLAGS = -undefined 
dynamic_lookup 6 | CXX = clang++ 7 | else 8 | CXX = g++ 9 | OSFLAGS = 10 | endif 11 | 12 | TF_INC = `python -c "import tensorflow; print(tensorflow.sysconfig.get_include())"` 13 | TF_LIB= `python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())'` 14 | 15 | CC = gcc -O2 -pthread 16 | GPUCC = nvcc 17 | CFLAGS = -std=c++11 -I$(TF_INC) -L$(TF_LIB) -I$(TF_INC)/external/nsync/public -ltensorflow_framework 18 | GPUCFLAGS = -c --expt-relaxed-constexpr 19 | LFLAGS = -pthread -shared -fPIC 20 | GPULFLAGS = -x cu -Xcompiler -fPIC 21 | GPUDEF = -DGOOGLE_CUDA=1 22 | CGPUFLAGS = -lcuda -lcublas 23 | 24 | SRC = svd_prod_gpu.cc 25 | GPUSRC = svd_prod_gpu.cu.cc 26 | PROD = svd_prod_gpu.so 27 | GPUPROD = svd_prod_cu_gpu.o 28 | 29 | GRAD_SRC = grad_svd_prod_gpu.cc 30 | GRAD_GPUSRC = grad_svd_prod_gpu.cu.cc 31 | GRAD_PROD = grad_svd_prod_gpu.so 32 | GRAD_GPUPROD = grad_svd_prod_cu_gpu.o 33 | 34 | INV_SRC = svd_inv_prod_gpu.cc 35 | INV_GPUSRC = svd_inv_prod_gpu.cu.cc 36 | INV_PROD = svd_inv_prod_gpu.so 37 | INV_GPUPROD = svd_inv_prod_cu_gpu.o 38 | 39 | INV_GRAD_SRC = grad_svd_inv_prod_gpu.cc 40 | INV_GRAD_GPUSRC = grad_svd_inv_prod_gpu.cu.cc 41 | INV_GRAD_PROD = grad_svd_inv_prod_gpu.so 42 | INV_GRAD_GPUPROD = grad_svd_inv_prod_cu_gpu.o 43 | 44 | default: gpu gpu-grad inv-gpu inv-gpu-grad 45 | 46 | cpu: 47 | $(CXX) $(CFLAGS) $(SRC) $(LFLAGS) $(OSFLAGS) -o $(PROD) 48 | 49 | gpu: 50 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GPUPROD) 51 | $(CXX) $(CFLAGS) $(SRC) $(GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(PROD) 52 | 53 | gpu-grad: 54 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GRAD_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GRAD_GPUPROD) 55 | $(CXX) $(CFLAGS) $(GRAD_SRC) $(GRAD_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(GRAD_PROD) 56 | 57 | inv-gpu: 58 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(INV_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(INV_GPUPROD) 59 | $(CXX) $(CFLAGS) $(INV_SRC) $(INV_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(INV_PROD) 60 | 61 | inv-gpu-grad: 62 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(INV_GRAD_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(INV_GRAD_GPUPROD) 63 | $(CXX) $(CFLAGS) $(INV_GRAD_SRC) $(INV_GRAD_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(INV_GRAD_PROD) 64 | 65 | clean: 66 | rm -f $(PROD) $(GPUPROD) $(GRAD_PROD) $(GRAD_GPUPROD) $(INV_PROD) $(INV_GPUPROD) $(INV_GRAD_PROD) $(INV_GRAD_GPUPROD) 67 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("SvdProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Output("output_state: float") 13 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 14 | c->set_output(0, c->input(0)); 15 | return Status::OK(); 16 | }); 17 | 18 | 19 | int SvdProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r); 20 | 21 | class SvdProdGpuOp : public OpKernel { 22 | public: 23 | explicit SvdProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 24 | 25 | void Compute(OpKernelContext* context) override { 26 | // 
Check number of inputs 27 | OP_REQUIRES(context, context->num_inputs() == 2, 28 | errors::InvalidArgument("SvdProd expects 2 inputes.")); 29 | 30 | // Grab the input tensor 31 | const Tensor& H = context->input(0); 32 | const Tensor& U = context->input(1); 33 | 34 | // Shapes of input 35 | const TensorShape& H_shape = H.shape(); 36 | const TensorShape& U_shape = U.shape(); 37 | 38 | const int n_h = H_shape.dim_size(1); 39 | const int n_r = U_shape.dim_size(0); 40 | const int batch = H_shape.dim_size(0); 41 | // Perform dimension check 42 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 43 | errors::InvalidArgument("SvdProd expects H to be a 2-D matrix.")); 44 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 45 | errors::InvalidArgument("SvdProd expects U to be a 2-D matrix.")); 46 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 47 | errors::InvalidArgument("The second dimension of H and U does not match!")); 48 | 49 | // Create an output tensor 50 | Tensor* H_out = NULL; 51 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&H_out)); 52 | 53 | // obtain data 54 | const float* H_data = H.flat().data(); 55 | const float* U_data = U.flat().data(); 56 | float* H_out_data = H_out->flat().data(); 57 | /* 58 | // test 59 | int idx =0; 60 | std::printf( "Before:\n"); 61 | for(int i=0; i < H_shape.dim_size(0); i++){ 62 | for(int j=0; j < H_shape.dim_size(1); j++){ 63 | idx = i * H_shape.dim_size(1) + j; 64 | std::printf ("H(%d,%d)=%4.4f, %4.4f\n", i, j, H.flat()(idx), H_out_data[idx]); 65 | } 66 | } 67 | */ 68 | #if GOOGLE_CUDA 69 | int op_status; 70 | op_status = SvdProdGpuKernelLauncher(H_data, U_data, H_out_data, n_h, batch, n_r ); 71 | #endif 72 | } 73 | }; 74 | 75 | REGISTER_KERNEL_BUILDER(Name("SvdProdGpu").Device(DEVICE_GPU), SvdProdGpuOp); 76 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_inv_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("SvdInvProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Output("output_state: float") 13 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 14 | c->set_output(0, c->input(0)); 15 | return Status::OK(); 16 | }); 17 | 18 | 19 | int SvdInvProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r); 20 | 21 | class SvdInvProdGpuOp : public OpKernel { 22 | public: 23 | explicit SvdInvProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 24 | 25 | void Compute(OpKernelContext* context) override { 26 | // Check number of inputs 27 | OP_REQUIRES(context, context->num_inputs() == 2, 28 | errors::InvalidArgument("SvdInvProd expects 2 inputes.")); 29 | 30 | // Grab the input tensor 31 | const Tensor& H = context->input(0); 32 | const Tensor& U = context->input(1); 33 | 34 | // Shapes of input 35 | const TensorShape& H_shape = H.shape(); 36 | const TensorShape& U_shape = U.shape(); 37 | 38 | const int n_h = H_shape.dim_size(1); 39 | const int n_r = U_shape.dim_size(0); 40 | const int batch = H_shape.dim_size(0); 41 | // Perform dimension check 42 | 
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 43 | errors::InvalidArgument("SvdInvProd expects H to be a 2-D matrix.")); 44 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 45 | errors::InvalidArgument("SvdInvProd expects U to be a 2-D matrix.")); 46 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 47 | errors::InvalidArgument("The second dimension of H and U does not match!")); 48 | 49 | // Create an output tensor 50 | Tensor* H_out = NULL; 51 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&H_out)); 52 | 53 | // obtain data 54 | const float* H_data = H.flat().data(); 55 | const float* U_data = U.flat().data(); 56 | float* H_out_data = H_out->flat().data(); 57 | /* 58 | // test 59 | int idx =0; 60 | std::printf( "Before:\n"); 61 | for(int i=0; i < H_shape.dim_size(0); i++){ 62 | for(int j=0; j < H_shape.dim_size(1); j++){ 63 | idx = i * H_shape.dim_size(1) + j; 64 | std::printf ("H(%d,%d)=%4.4f, %4.4f\n", i, j, H.flat()(idx), H_out_data[idx]); 65 | } 66 | } 67 | */ 68 | #if GOOGLE_CUDA 69 | int op_status; 70 | op_status = SvdInvProdGpuKernelLauncher(H_data, U_data, H_out_data, n_h, batch, n_r ); 71 | #endif 72 | } 73 | }; 74 | 75 | REGISTER_KERNEL_BUILDER(Name("SvdInvProdGpu").Device(DEVICE_GPU), SvdInvProdGpuOp); 76 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/gpu_unit_inv_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import sparse_ops 6 | 7 | def Hgrad(H, u, G, k): 8 | # H.shape = (batch, n_h) 9 | # u.shape = (n_h,) 10 | # G.shape = (batch, n_h) 11 | alpha = 2* np.dot(H[:, -k:], u[-k:]) # alpha.shape = (batch,) 12 | beta = 2* np.dot(G[:, -k:], u[-k:]) # beta.shape = (batch,) 13 | u_bar = np.zeros_like(u) 14 | u_bar[-k:] += -np.dot(alpha,G[:,-k:]) - np.dot(beta,H[:,-k:]) + np.dot(alpha,beta)*u[-k:] # sum of gradient within the batch: averaging needed??? 15 | G_out = G.copy() 16 | G_out[:,-k:] -= np.outer(beta,u[-k:]) 17 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 18 | 19 | n_h = 3 20 | n_b = 2 21 | n_r = 2 22 | 23 | 24 | 25 | rng = np.random.RandomState(13) 26 | H_ = rng.uniform(-np.sqrt(6. / (n_b + n_h)), np.sqrt(6. / (n_b + n_h)), (n_b, n_h)).astype(np.float32) 27 | 28 | U_full = rng.normal(0, 0.01, (n_h, n_r)).astype(np.float32) 29 | U_ = np.tril(U_full) 30 | norms_U_ = np.linalg.norm(U_, axis=0) 31 | U_ = np.transpose(1. / norms_U_ * U_) 32 | 33 | print H_ 34 | print U_ 35 | 36 | 37 | 38 | H1 = [H_]*(n_r+1) 39 | 40 | for i in range(n_r-1,-1,-1): 41 | alpha = np.dot(H1[i+1], U_[i]) 42 | H1[i] = H1[i+1] - 2 * np.outer(alpha, U_[i]) 43 | 44 | H2 = H1[0] 45 | print H2 46 | 47 | for i in range(n_b): 48 | print np.dot(H2[i],H2[i]) - np.dot(H_[i], H_[i]) 49 | 50 | G = np.ones_like(H_) 51 | Grad_U = np.ones_like(U_) 52 | 53 | for i in range(0,n_r): 54 | G, Grad_U[i] = Hgrad(H1[i+1], U_[i], G, n_h-i) 55 | 56 | print G 57 | print Grad_U 58 | ############################################################ 59 | ############################################################ 60 | 61 | grad_svd_inv_prod_module = tf.load_op_library('./grad_svd_inv_prod_gpu.so') 62 | 63 | @ops.RegisterGradient("SvdInvProdGpu") 64 | def _svd_inv_prod_gpu_grad(op, grad): 65 | """The gradients for `svd_inv_prod_gpu`. 
66 | 67 | Args: 68 | op: The `svd_prod_gpu` `Operation` that we are differentiating, which we can use 69 | to find the inputs and outputs of the original op. 70 | grad: Gradient with respect to the output of the op. 71 | 72 | Returns: 73 | Gradients with respect to the inputs. 74 | """ 75 | H = op.inputs[0] 76 | U = op.inputs[1] 77 | 78 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 79 | ############################################################ 80 | 81 | 82 | svd_inv_prod_module = tf.load_op_library('./svd_inv_prod_gpu.so') 83 | with tf.Session() as sess: 84 | 85 | H = tf.constant(H_, dtype=tf.float32) 86 | U = tf.constant(U_, dtype = tf.float32) 87 | 88 | U = tf.matrix_band_part(U, 0, -1) # upper triangular 89 | 90 | z = svd_inv_prod_module.svd_inv_prod_gpu(H,U) 91 | gr = tf.gradients(z, [H,U]) 92 | tf.global_variables_initializer().run() 93 | 94 | print('H,U and product: ',H.eval(), U.eval(),z.eval()) 95 | print('grad_H, grad_U: ' ,gr[0].eval(), gr[1].eval()) 96 | -------------------------------------------------------------------------------- /code/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Script for running RNNs with fixed parameters. """ 3 | 4 | import os 5 | import sys 6 | import time 7 | import math 8 | import numpy as np 9 | import csv 10 | import Params 11 | import load 12 | import rnn 13 | 14 | 15 | def train(params): 16 | 17 | print('%s starting......' % params.cell) 18 | sys.stdout.flush() 19 | 20 | if params.dataset.startswith('mnist'): 21 | train_X, test_X, train_y, test_y = load.load_mnist(params) 22 | elif params.dataset.startswith('add'): 23 | train_X, test_X, train_y, test_y = load.adding_task(params) 24 | else: 25 | assert 0, "unknown dataset %s" % (params.dataset) 26 | 27 | print "parameters = ", params 28 | 29 | model = rnn.RNNModel(params) 30 | 31 | # load model 32 | if params.load_model: 33 | model.load("%s" % (params.load_model_dir)) 34 | 35 | # train model 36 | train_error, test_error = model.train(params, train_X, train_y, test_X, test_y) 37 | 38 | # save model 39 | if params.model_dir: 40 | if os.path.isdir(os.path.dirname(params.model_dir)) == False: 41 | os.makedirs(params.model_dir) 42 | model.save("%s.%s" % (params.model_dir, params.cell)) 43 | 44 | # predict 45 | train_pred = model.predict(train_X, params.batch_size) 46 | test_pred = model.predict(test_X, params.batch_size) 47 | 48 | # must close model when finish 49 | model.close() 50 | 51 | # write prediction to file 52 | if params.pred_dir: 53 | if os.path.isdir(os.path.dirname(params.pred_dir)) == False: 54 | os.makedirs(params.pred_dir) 55 | with open("%s.%s.%s.y" % (params.pred_dir, params.dataset, params.cell), "w") as f: 56 | content = "" 57 | for pred in [train_pred, test_pred]: 58 | for entry in pred: 59 | for index, value in enumerate(entry): 60 | if index: 61 | content += "," 62 | content += "%f" % (value) 63 | content += "\n" 64 | f.write(content) 65 | with open("%s.%s.%s.X" % (params.pred_dir, params.dataset, params.cell), "w") as f: 66 | content = "" 67 | for X in [train_X, test_X]: 68 | for entry in X: 69 | for index, value in enumerate(entry.ravel()): 70 | if index: 71 | content += "," 72 | content += "%f" % (value) 73 | content += "\n" 74 | f.write(content) 75 | 76 | return train_error, test_error 77 | 78 | if __name__=='__main__': 79 | if len(sys.argv) < 2: 80 | print("input parameters in json format in required") 81 | exit() 82 | paramsArray = [] 83 | for i in range(1, len(sys.argv)): 
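# each command-line argument is a JSON parameter file (see test.json); it is
# parsed into a Params object below, and every resulting configuration is trained in turn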
84 | params = Params.Params() 85 | params.load(sys.argv[i]) 86 | paramsArray.append(params) 87 | print("parameters[%d] = %s" % (len(paramsArray), paramsArray)) 88 | 89 | tt = time.time() 90 | for params in paramsArray: 91 | train(params) 92 | print("program takes %.3f seconds" % (time.time()-tt)) 93 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/gpu_unit_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import sparse_ops 6 | 7 | def Hgrad(H, u, G, k): 8 | # H.shape = (batch, n_h) 9 | # u.shape = (n_h,) 10 | # G.shape = (batch, n_h) 11 | alpha = 2* np.dot(H[:, -k:], u[-k:]) # alpha.shape = (batch,) 12 | beta = 2* np.dot(G[:, -k:], u[-k:]) # beta.shape = (batch,) 13 | u_bar = np.zeros_like(u) 14 | u_bar[-k:] += -np.dot(alpha,G[:,-k:]) - np.dot(beta,H[:,-k:]) + np.dot(alpha,beta)*u[-k:] # sum of gradient within the batch: averaging needed??? 15 | G_out = G.copy() 16 | G_out[:,-k:] -= np.outer(beta,u[-k:]) 17 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 18 | 19 | n_h = 3; n_b = 2; n_r = 2 20 | rng = np.random.RandomState(13) 21 | H_ = rng.uniform(-np.sqrt(6. / (n_b + n_h)), np.sqrt(6. / (n_b + n_h)), (n_b, n_h)).astype(np.float32) 22 | 23 | U_full = rng.normal(0, 0.01, (n_h, n_r)).astype(np.float32) 24 | U_ = np.tril(U_full) 25 | norms_U_ = np.linalg.norm(U_, axis=0) 26 | U_ = np.transpose(1. / norms_U_ * U_) 27 | 28 | print H_ 29 | print U_ 30 | 31 | H1 = [H_]*(n_r+1) 32 | 33 | for i in range(0,n_r): 34 | alpha = np.dot(H1[i], U_[i]) 35 | print 'alpha: ', 2*alpha 36 | H1[i+1] = H1[i] - 2 * np.outer(alpha, U_[i]) 37 | 38 | H2 = H1[-1] 39 | print H2 40 | 41 | for i in range(n_b): 42 | print np.dot(H2[i],H2[i]) - np.dot(H_[i], H_[i]) 43 | 44 | G = np.ones_like(H_) 45 | Grad_U = np.ones_like(U_) 46 | 47 | for i in range(n_r-1, -1, -1): 48 | G, Grad_U[i] = Hgrad(H1[i], U_[i], G, n_h-i) 49 | 50 | print G 51 | print Grad_U 52 | ############################################################ 53 | ############################################################ 54 | 55 | grad_svd_prod_module = tf.load_op_library('./grad_svd_prod_gpu.so') 56 | 57 | @ops.RegisterGradient("SvdProdGpu") 58 | def _svd_prod_gpu_grad(op, grad): 59 | H = op.inputs[0] 60 | U = op.inputs[1] 61 | return grad_svd_prod_module.grad_svd_prod_gpu(H,U,grad) 62 | ############################################################ 63 | svd_prod_module = tf.load_op_library('./svd_prod_gpu.so') 64 | ############################################################ 65 | 66 | grad_svd_inv_prod_module = tf.load_op_library('./grad_svd_inv_prod_gpu.so') 67 | 68 | @ops.RegisterGradient("SvdInvProdGpu") 69 | def _svd_inv_prod_gpu_grad(op, grad): 70 | H = op.inputs[0] 71 | U = op.inputs[1] 72 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 73 | ############################################################ 74 | svd_inv_prod_module = tf.load_op_library('./svd_inv_prod_gpu.so') 75 | ############################################################ 76 | 77 | with tf.Session() as sess: 78 | 79 | H = tf.constant(H_, dtype=tf.float32) 80 | U = tf.constant(U_, dtype = tf.float32) 81 | 82 | U = tf.matrix_band_part(U, 0, -1) # upper triangular 83 | 84 | z = svd_prod_module.svd_prod_gpu(H,U) 85 | z2 = svd_inv_prod_module.svd_inv_prod_gpu(z,U) 86 | gr = tf.gradients(z, 
[H,U]) 87 | gr2 = tf.gradients(z2, [z,U]) 88 | tf.global_variables_initializer().run() 89 | 90 | print('H,U and product: ',H.eval(), U.eval(),z.eval()) 91 | print('grad_H, grad_U: ' ,gr[0].eval(), gr[1].eval()) 92 | 93 | print('H,U and product: ',H.eval(), U.eval(),z2.eval()) 94 | print('grad_H, grad_U: ' ,gr2[0].eval(), gr2[1].eval()) 95 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("GradSvdProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Input("gradient_backprop: float") 13 | .Output("grad_hidden_state: float") 14 | .Output("grad_householder_matrix: float") 15 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 16 | c->set_output(0, c->input(0)); 17 | c->set_output(1, c->input(1)); 18 | return Status::OK(); 19 | }); 20 | 21 | #include "tensorflow/core/framework/op_kernel.h" 22 | 23 | int GradSvdProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* G_grad, const int n_h, const int batch, const int n_r); 24 | 25 | class GradSvdProdGpuOp : public OpKernel { 26 | public: 27 | explicit GradSvdProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 28 | 29 | void Compute(OpKernelContext* context) override { 30 | // Check number of inputs 31 | OP_REQUIRES(context, context->num_inputs() == 3, 32 | errors::InvalidArgument("GradSvdProd expects 3 inputes.")); 33 | 34 | // Grab the input tensor 35 | const Tensor& H = context->input(0); 36 | const Tensor& U = context->input(1); 37 | const Tensor& G = context->input(2); 38 | auto input = H.flat(); 39 | 40 | // Shapes of input 41 | const TensorShape& H_shape = H.shape(); 42 | const TensorShape& U_shape = U.shape(); 43 | const TensorShape& G_shape = G.shape(); 44 | 45 | const int n_h = H_shape.dim_size(1); 46 | const int n_r = U_shape.dim_size(0); 47 | const int batch = H_shape.dim_size(0); 48 | // Perform dimension check 49 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 50 | errors::InvalidArgument("SvdProd expects H to be a 2-D matrix.")); 51 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 52 | errors::InvalidArgument("SvdProd expects U to be a 2-D matrix.")); 53 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(G_shape), 54 | errors::InvalidArgument("SvdProd expects G to be a 2-D matrix.")); 55 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 56 | errors::InvalidArgument("The second dimension of H and U does not match!")); 57 | OP_REQUIRES(context, G_shape.dim_size(0) == H_shape.dim_size(0), 58 | errors::InvalidArgument("The first dimension of G and H does not match!")); 59 | OP_REQUIRES(context, G_shape.dim_size(1) == H_shape.dim_size(1), 60 | errors::InvalidArgument("The second dimension of G and H does not match!")); 61 | 62 | // Create an output tensor 63 | Tensor* Grad_H = NULL; 64 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&Grad_H)); 65 | Tensor* Grad_U = NULL; 66 | OP_REQUIRES_OK(context, context->allocate_output(1, U_shape,&Grad_U)); 67 | 68 | // obtain data 69 | const float* H_data = 
H.flat().data(); 70 | const float* U_data = U.flat().data(); 71 | const float* G_data = G.flat().data(); 72 | float* Grad_H_data = Grad_H->flat().data(); 73 | float* Grad_U_data = Grad_U->flat().data(); 74 | #if GOOGLE_CUDA 75 | GradSvdProdGpuKernelLauncher(H_data, U_data, G_data, Grad_H_data, Grad_U_data, n_h, batch, n_r); 76 | #endif 77 | } 78 | }; 79 | 80 | REGISTER_KERNEL_BUILDER(Name("GradSvdProdGpu").Device(DEVICE_GPU), GradSvdProdGpuOp); 81 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_inv_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("GradSvdInvProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Input("gradient_backprop: float") 13 | .Output("grad_hidden_state: float") 14 | .Output("grad_householder_matrix: float") 15 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 16 | c->set_output(0, c->input(0)); 17 | c->set_output(1, c->input(1)); 18 | return Status::OK(); 19 | }); 20 | 21 | #include "tensorflow/core/framework/op_kernel.h" 22 | 23 | int GradSvdInvProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* G_grad, const int n_h, const int batch, const int n_r); 24 | 25 | class GradSvdInvProdGpuOp : public OpKernel { 26 | public: 27 | explicit GradSvdInvProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 28 | 29 | void Compute(OpKernelContext* context) override { 30 | // Check number of inputs 31 | OP_REQUIRES(context, context->num_inputs() == 3, 32 | errors::InvalidArgument("GradSvdInvProd expects 3 inputes.")); 33 | 34 | // Grab the input tensor 35 | const Tensor& H = context->input(0); 36 | const Tensor& U = context->input(1); 37 | const Tensor& G = context->input(2); 38 | auto input = H.flat(); 39 | 40 | // Shapes of input 41 | const TensorShape& H_shape = H.shape(); 42 | const TensorShape& U_shape = U.shape(); 43 | const TensorShape& G_shape = G.shape(); 44 | 45 | const int n_h = H_shape.dim_size(1); 46 | const int n_r = U_shape.dim_size(0); 47 | const int batch = H_shape.dim_size(0); 48 | // Perform dimension check 49 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 50 | errors::InvalidArgument("SvdInvProd expects H to be a 2-D matrix.")); 51 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 52 | errors::InvalidArgument("SvdInvProd expects U to be a 2-D matrix.")); 53 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(G_shape), 54 | errors::InvalidArgument("SvdInvProd expects G to be a 2-D matrix.")); 55 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 56 | errors::InvalidArgument("The second dimension of H and U does not match!")); 57 | OP_REQUIRES(context, G_shape.dim_size(0) == H_shape.dim_size(0), 58 | errors::InvalidArgument("The first dimension of G and H does not match!")); 59 | OP_REQUIRES(context, G_shape.dim_size(1) == H_shape.dim_size(1), 60 | errors::InvalidArgument("The second dimension of G and H does not match!")); 61 | 62 | // Create an output tensor 63 | Tensor* Grad_H = NULL; 64 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&Grad_H)); 65 | Tensor* Grad_U = NULL; 66 
| OP_REQUIRES_OK(context, context->allocate_output(1, U_shape,&Grad_U)); 67 | 68 | // obtain data 69 | const float* H_data = H.flat().data(); 70 | const float* U_data = U.flat().data(); 71 | const float* G_data = G.flat().data(); 72 | float* Grad_H_data = Grad_H->flat().data(); 73 | float* Grad_U_data = Grad_U->flat().data(); 74 | #if GOOGLE_CUDA 75 | GradSvdInvProdGpuKernelLauncher(H_data, U_data, G_data, Grad_H_data, Grad_U_data, n_h, batch, n_r); 76 | #endif 77 | } 78 | }; 79 | 80 | REGISTER_KERNEL_BUILDER(Name("GradSvdInvProdGpu").Device(DEVICE_GPU), GradSvdInvProdGpuOp); 81 | -------------------------------------------------------------------------------- /code/load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import time 5 | import math 6 | import numpy as np 7 | import csv 8 | import pickle 9 | import sklearn 10 | from sklearn.utils import shuffle 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.datasets import fetch_mldata 13 | import os 14 | import cPickle as pickle 15 | import urllib2 16 | 17 | datasets_dir = os.getcwd() + '/../data/' 18 | 19 | def load_mnist_local(): 20 | data_dir = os.path.join(datasets_dir,'mnist/') 21 | fd = open(os.path.join(data_dir,'train-images-idx3-ubyte')) 22 | loaded = np.fromfile(file=fd,dtype=np.uint8) 23 | trX = loaded[16:].reshape((60000,28*28)).astype(float) 24 | 25 | fd = open(os.path.join(data_dir,'train-labels-idx1-ubyte')) 26 | loaded = np.fromfile(file=fd,dtype=np.uint8) 27 | trY = loaded[8:].reshape((60000)) 28 | 29 | fd = open(os.path.join(data_dir,'t10k-images-idx3-ubyte')) 30 | loaded = np.fromfile(file=fd,dtype=np.uint8) 31 | teX = loaded[16:].reshape((10000,28*28)).astype(float) 32 | 33 | fd = open(os.path.join(data_dir,'t10k-labels-idx1-ubyte')) 34 | loaded = np.fromfile(file=fd,dtype=np.uint8) 35 | teY = loaded[8:].reshape((10000)) 36 | 37 | 38 | return np.concatenate((trX,teX)), np.concatenate((trY,teY)) 39 | 40 | 41 | 42 | 43 | ''' prepare dataset ''' 44 | def load_mnist(params, permute=False): 45 | mnist = fetch_mldata('MNIST original') 46 | mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=params.random_seed) 47 | #mnist_X, mnist_y = load_mnist_local() 48 | mnist_X = mnist_X / 255.0 49 | print mnist_X.shape, mnist_y.shape 50 | print("MNIST data prepared") 51 | 52 | mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64') 53 | if permute: 54 | np.random.seed(0); permute = np.random.permutation(784) 55 | mnist_X = mnist_X[:, permute] 56 | def flatten_img(images): 57 | ''' 58 | images: shape => (n, rows, columns) 59 | output: shape => (n, rows*columns) 60 | ''' 61 | n_rows = images.shape[1] 62 | n_columns = images.shape[2] 63 | for num in range(n_rows): 64 | if num % 2 != 0: 65 | images[:, num, :] = images[:, num, :][:, ::-1] 66 | output = images.reshape(-1, n_rows*n_columns) 67 | return output 68 | 69 | time_steps = 28*28 70 | if len(params.dataset) > 6: # mnist.xx 71 | time_steps = int(params.dataset.split('.')[1]) 72 | mnist_X = mnist_X.reshape((-1, time_steps, 28*28/time_steps)) 73 | #mnist_X = flatten_img(mnist_X) # X.shape => (n_samples, seq_len) 74 | print "mnist_X.shape = ", mnist_X.shape 75 | #mnist_X = mnist_X[:, :, np.newaxis] # X.shape => (n_samples, seq_len, n_features) 76 | mnist_y_one_hot = np.zeros((mnist_y.shape[0], 10)) 77 | for i in xrange(len(mnist_y)): 78 | mnist_y_one_hot[i][mnist_y[i]] = 1 79 | print "mnist_y.shape = ", mnist_y_one_hot.shape 80 | 81 | 
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y_one_hot, 82 | test_size=0.2, 83 | random_state=params.random_seed) 84 | # need to set parameters according to dataset 85 | params.time_steps = train_X.shape[1] 86 | params.input_size = train_X.shape[2] 87 | params.output_size = 10 88 | params.regression_flag = False 89 | return train_X, test_X, train_y, test_y 90 | 91 | 92 | def adding_task(params, fname=datasets_dir+'Adding_task/data', ntrain=50000, ntest=1000): 93 | filename = fname + str(params.time_steps) 94 | data = np.loadtxt(filename, delimiter=',').astype(np.float32) 95 | x = data[:,1:]; y = data[:,0] 96 | assert(ntrain+ntest <= x.shape[0]) 97 | train_X = x.reshape((x.shape[0], x.shape[1]//2, 2)) 98 | train_Y = y.reshape((y.shape[0], 1)) 99 | params.time_steps = train_X.shape[1] 100 | params.input_size = train_X.shape[2] 101 | params.output_size = 1 102 | params.regression_flag = True 103 | print("Adding task with %i time step prepared!"%params.time_steps) 104 | print "Adding X shape: ", train_X.shape 105 | print "Adding Y shape: ", train_Y.shape 106 | 107 | return train_X[0 : ntrain], train_X[ntrain : ntrain + ntest], train_Y[0 : ntrain], train_Y[ntrain : ntrain + ntest] 108 | 109 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | 3 | #define EIGEN_USE_GPU 4 | #include 5 | #include 6 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 7 | 8 | 9 | int inline Hprod(cublasHandle_t handle,const float* H_in, float* H_out, const float* u, float* alpha, const int k, const int n_h, const int batch) { 10 | 11 | cublasStatus_t stat; 12 | float aa = 0; 13 | float bb = 0; 14 | float cc = -1.0; 15 | // aa = 2.0 / u^T * u 16 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 17 | aa = 2.0 / aa; 18 | // make sure that leading (n_h-k) entrees of u are 0 19 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 20 | // compute alpha = aa * H^T * u 21 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 22 | &aa, H_in, n_h, 23 | u, 1, 24 | &bb, alpha, 1); 25 | // update H 26 | stat = cublasScopy(handle, n_h * batch , H_in, 1, H_out, 1); // fill H_out with H_in 27 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H_out, n_h); 28 | 29 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 30 | return EXIT_SUCCESS; 31 | } 32 | 33 | int inline Hgrad(cublasHandle_t handle,const float* H, const float* u, float* G, float* u_grad, float* alpha, float* beta, const int k, const int n_h, const int batch) { 34 | 35 | cublasStatus_t stat; 36 | float aa = 0; 37 | float zero = 0; 38 | float neg_one = -1.0; 39 | float pos_one = 1.0; 40 | float alpha_dot_beta = 0; 41 | // aa = 2.0 / u^T * u 42 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 43 | aa = 2.0 / aa; 44 | // make sure that leading (n_h-k) entrees of u are 0 45 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 46 | // compute alpha = aa * H^T * u 47 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 48 | &aa, H, n_h, 49 | u, 1, 50 | &zero, alpha, 1); 51 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha failed\n"); return EXIT_FAILURE; } 52 | // compute beta = aa * G^T * u 53 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 54 | &aa, G, n_h, 55 | u, 1, 56 | &zero, beta, 1); 57 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("beta failed\n"); return EXIT_FAILURE; } 58 | // compute dot(alpha, 
beta) 59 | stat = cublasSdot (handle, batch, alpha, 1, beta, 1, &alpha_dot_beta); 60 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha dot beta failed\n"); return EXIT_FAILURE; } 61 | // u_grad = - G * alpha + 0 * u_grad 62 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 63 | &neg_one, G, n_h, 64 | alpha, 1, 65 | &zero, u_grad, 1); 66 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad alpha failed\n"); return EXIT_FAILURE; } 67 | // u_grad = - G * alpha + 1 * u_grad 68 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 69 | &neg_one, H, n_h, 70 | beta, 1, 71 | &pos_one, u_grad, 1); 72 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad beta failed\n"); return EXIT_FAILURE; } 73 | // u_grad = alpha_dot_beta * u + 1 * u_grad 74 | stat = cublasSaxpy(handle, n_h, &alpha_dot_beta, u, 1, u_grad, 1); 75 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad u failed\n"); return EXIT_FAILURE; } 76 | // zero out first n_h - k entrees --- there is better way! 77 | stat = cublasSscal(handle, n_h - k, &zero, u_grad, 1); 78 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad zero out failed\n"); return EXIT_FAILURE; } 79 | // update G 80 | stat = cublasSger(handle, n_h, batch, &neg_one, u, 1, beta, 1, G, n_h); 81 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("G update failed\n"); return EXIT_FAILURE; } 82 | return EXIT_SUCCESS; 83 | } 84 | // host function for CUDA kernels 85 | int GradSvdProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* U_grad, const int n_h, const int batch, const int n_r) { 86 | cublasStatus_t stat; 87 | cudaError_t cudaStat; 88 | cublasHandle_t handle; 89 | // creat handle 90 | stat = cublasCreate_v2(&handle); 91 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on GradSvdProd\n"); return EXIT_FAILURE; } 92 | // allocate alpha 93 | float* alpha; 94 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 95 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 96 | float* beta; 97 | cudaStat = cudaMalloc ((void**)&beta, batch*sizeof(float)); 98 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of beta failed"); return EXIT_FAILURE; } 99 | // allocate H_hist 100 | float* H_hist; 101 | cudaStat = cudaMalloc ((void**)&H_hist, (n_r-1)*batch*n_h*sizeof(float)); 102 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of H_hist failed"); return EXIT_FAILURE; } 103 | // begin computation 104 | Hprod(handle, H, H_hist, U, alpha, n_h, n_h, batch); 105 | for(int r=1; r < n_r-1; r++) { 106 | Hprod(handle, H_hist + (r-1)*batch*n_h, H_hist + r*batch*n_h, U + n_h*r, alpha, n_h - r, n_h, batch); 107 | } 108 | 109 | stat = cublasScopy(handle, n_h * batch , G, 1, H_grad, 1); // fill H_out with H_in 110 | 111 | for(int r=n_r-1; r >0; r--) { 112 | Hgrad(handle, H_hist + (r-1)*batch*n_h, U + n_h*r, H_grad, U_grad + n_h*r, alpha, beta, n_h - r, n_h, batch); 113 | } 114 | Hgrad(handle, H, U, H_grad, U_grad, alpha, beta, n_h, n_h, batch); 115 | 116 | 117 | cudaFree(alpha); 118 | cudaFree(beta); 119 | cudaFree(H_hist); 120 | cublasDestroy(handle); 121 | return EXIT_SUCCESS; 122 | } 123 | #endif 124 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_inv_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | 3 | #define EIGEN_USE_GPU 4 | #include 5 | #include 6 | #include 
"third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 7 | 8 | 9 | int inline Hprod(cublasHandle_t handle,const float* H_in, float* H_out, const float* u, float* alpha, const int k, const int n_h, const int batch) { 10 | 11 | cublasStatus_t stat; 12 | float aa = 0; 13 | float bb = 0; 14 | float cc = -1.0; 15 | // aa = 2.0 / u^T * u 16 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 17 | aa = 2.0 / aa; 18 | // make sure that leading (n_h-k) entrees of u are 0 19 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 20 | // compute alpha = aa * H^T * u 21 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 22 | &aa, H_in, n_h, 23 | u, 1, 24 | &bb, alpha, 1); 25 | // update H 26 | stat = cublasScopy(handle, n_h * batch , H_in, 1, H_out, 1); // fill H_out with H_in 27 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H_out, n_h); 28 | 29 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 30 | return EXIT_SUCCESS; 31 | } 32 | 33 | int inline Hgrad(cublasHandle_t handle,const float* H, const float* u, float* G, float* u_grad, float* alpha, float* beta, const int k, const int n_h, const int batch) { 34 | 35 | cublasStatus_t stat; 36 | float aa = 0; 37 | float zero = 0; 38 | float neg_one = -1.0; 39 | float pos_one = 1.0; 40 | float alpha_dot_beta = 0; 41 | // aa = 2.0 / u^T * u 42 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 43 | aa = 2.0 / aa; 44 | // make sure that leading (n_h-k) entrees of u are 0 45 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 46 | // compute alpha = aa * H^T * u 47 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 48 | &aa, H, n_h, 49 | u, 1, 50 | &zero, alpha, 1); 51 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha failed\n"); return EXIT_FAILURE; } 52 | // compute beta = aa * G^T * u 53 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 54 | &aa, G, n_h, 55 | u, 1, 56 | &zero, beta, 1); 57 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("beta failed\n"); return EXIT_FAILURE; } 58 | // compute dot(alpha, beta) 59 | stat = cublasSdot (handle, batch, alpha, 1, beta, 1, &alpha_dot_beta); 60 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha dot beta failed\n"); return EXIT_FAILURE; } 61 | // u_grad = - G * alpha + 0 * u_grad 62 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 63 | &neg_one, G, n_h, 64 | alpha, 1, 65 | &zero, u_grad, 1); 66 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad alpha failed\n"); return EXIT_FAILURE; } 67 | // u_grad = - G * alpha + 1 * u_grad 68 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 69 | &neg_one, H, n_h, 70 | beta, 1, 71 | &pos_one, u_grad, 1); 72 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad beta failed\n"); return EXIT_FAILURE; } 73 | // u_grad = alpha_dot_beta * u + 1 * u_grad 74 | stat = cublasSaxpy(handle, n_h, &alpha_dot_beta, u, 1, u_grad, 1); 75 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad u failed\n"); return EXIT_FAILURE; } 76 | // zero out first n_h - k entrees --- there is better way! 
77 | stat = cublasSscal(handle, n_h - k, &zero, u_grad, 1); 78 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad zero out failed\n"); return EXIT_FAILURE; } 79 | // update G 80 | stat = cublasSger(handle, n_h, batch, &neg_one, u, 1, beta, 1, G, n_h); 81 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("G update failed\n"); return EXIT_FAILURE; } 82 | return EXIT_SUCCESS; 83 | } 84 | // host function for CUDA kernels 85 | int GradSvdInvProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* U_grad, const int n_h, const int batch, const int n_r) { 86 | cublasStatus_t stat; 87 | cudaError_t cudaStat; 88 | cublasHandle_t handle; 89 | // creat handle 90 | stat = cublasCreate_v2(&handle); 91 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on GradSvdInvProd\n"); return EXIT_FAILURE; } 92 | // allocate alpha 93 | float* alpha; 94 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 95 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 96 | float* beta; 97 | cudaStat = cudaMalloc ((void**)&beta, batch*sizeof(float)); 98 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of beta failed"); return EXIT_FAILURE; } 99 | // allocate H_hist 100 | float* H_hist; 101 | cudaStat = cudaMalloc ((void**)&H_hist, (n_r-1)*batch*n_h*sizeof(float)); 102 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of H_hist failed"); return EXIT_FAILURE; } 103 | // begin computation 104 | Hprod(handle, H, H_hist + (n_r-2)*batch*n_h, U + n_h*(n_r-1), alpha, n_h - n_r + 1, n_h, batch); 105 | for(int r=n_r-2; r > 0; r--) { 106 | Hprod(handle, H_hist + r*batch*n_h, H_hist + (r-1)*batch*n_h, U + n_h*r, alpha, n_h - r, n_h, batch); 107 | } 108 | 109 | stat = cublasScopy(handle, n_h * batch , G, 1, H_grad, 1); // fill H_out with H_in 110 | 111 | for(int r=0; r < n_r-1; r++) { 112 | Hgrad(handle, H_hist + r*batch*n_h, U + n_h*r, H_grad, U_grad + n_h*r, alpha, beta, n_h - r, n_h, batch); 113 | } 114 | Hgrad(handle, H, U + n_h*(n_r-1), H_grad, U_grad + n_h*(n_r-1), alpha, beta, n_h - n_r + 1, n_h, batch); 115 | 116 | 117 | cudaFree(alpha); 118 | cudaFree(beta); 119 | cudaFree(H_hist); 120 | cublasDestroy(handle); 121 | return EXIT_SUCCESS; 122 | } 123 | #endif 124 | -------------------------------------------------------------------------------- /code/Params.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | 4 | """ 5 | Parameter class 6 | """ 7 | class Params (object): 8 | def __init__(self): 9 | self.cell = None # RNN cell 10 | self.initial_learning_rate = math.exp(-10) # learning rate for SGD, [exp(-10), 1] 11 | self.lr_decay = 0.8 # the multiplier to multiply the learning rate every epoch 12 | self.num_epochs = 100 # number of epochs 13 | self.dropout_keep_rate = 0.5 # percent of output units that are kept during dropout, in range (0, 1] 14 | self.num_units = 200 # number of units 15 | self.num_layers = 1 # number of layers 16 | self.r_size = 60 # the number of Householder reflectors used in Spectral-RNN 17 | self.r_margin = 0.01 # the singular value margin in Spectral-RNN 18 | self.time_steps = None # time steps, time_steps*input_size = sequence length 19 | self.input_size = None # dimensionality of input features at each time step 20 | self.output_size = None # dimensionality of label 21 | self.gpu_flag = True # use GPU or not, Spectral-RNN only available in GPU mode 22 | self.random_seed = 1000 
# random seed 23 | self.dataset = 'mnist.28' # dataset name, mnist.[length] where length will overwrite self.time_steps 24 | self.batch_size = 128 # batch size 25 | self.regression_flag = True # regression or classification 26 | self.model_dir = '' # directory to save model, will append .cell_name 27 | self.load_model_dir = '' # directory to save model, will append .cell_name 28 | self.pred_dir = '' # directory for prediction results, will append .dataset.cell_name.[Xy] 29 | self.load_model = False # load model or not 30 | self.train_flag = True # train model or not 31 | self.batch_norm = False # batch normalization or not 32 | self.display_epoch_num = 1 # display how many evaluations per epoch 33 | """ 34 | convert to json 35 | """ 36 | def toJson(self): 37 | data = dict() 38 | data['cell'] = self.cell 39 | data['initial_learning_rate'] = self.initial_learning_rate 40 | data['lr_decay'] = self.lr_decay 41 | data['num_epochs'] = self.num_epochs 42 | data['dropout_keep_rate'] = self.dropout_keep_rate 43 | data['num_units'] = self.num_units 44 | data['num_layers'] = self.num_layers 45 | data['r_size'] = self.r_size 46 | data['r_margin'] = self.r_margin 47 | data['time_steps'] = self.time_steps 48 | data['input_size'] = self.input_size 49 | data['output_size'] = self.output_size 50 | data['gpu_flag'] = self.gpu_flag 51 | data['batch_size'] = self.batch_size 52 | data['random_seed'] = self.random_seed 53 | data['dataset'] = self.dataset 54 | data['regression_flag'] = self.regression_flag 55 | data['model_dir'] = self.model_dir 56 | data['load_model_dir'] = self.load_model_dir 57 | data['pred_dir'] = self.pred_dir 58 | data['load_model'] = self.load_model 59 | data['train_flag'] = self.train_flag 60 | data['batch_norm'] = self.batch_norm 61 | data['display_epoch_num'] = self.display_epoch_num 62 | return data 63 | """ 64 | load form json 65 | """ 66 | def fromJson(self, data): 67 | if 'cell' in data: self.cell = data['cell'] 68 | if 'initial_learning_rate' in data: self.initial_learning_rate = data['initial_learning_rate'] 69 | if 'lr_decay' in data: self.lr_decay = data['lr_decay'] 70 | if 'num_epochs' in data: self.num_epochs = data['num_epochs'] 71 | if 'dropout_keep_rate' in data: self.dropout_keep_rate = data['dropout_keep_rate'] 72 | if 'num_units' in data: self.num_units = data['num_units'] 73 | if 'num_layers' in data: self.num_layers = data['num_layers'] 74 | if 'r_size' in data: self.r_size = data['r_size'] 75 | if 'r_margin' in data: self.r_margin = data['r_margin'] 76 | if 'time_steps' in data: self.time_steps = data['time_steps'] 77 | if 'input_size' in data: self.input_size = data['input_size'] 78 | if 'output_size' in data: self.output_size = data['output_size'] 79 | if 'gpu_flag' in data: self.gpu_flag = data['gpu_flag'] 80 | if 'batch_size' in data: self.batch_size = data['batch_size'] 81 | if 'random_seed' in data: self.random_seed = data['random_seed'] 82 | if 'dataset' in data: self.dataset = data['dataset'] 83 | if 'regression_flag' in data: self.regression_flag = data['regression_flag'] 84 | if 'model_dir' in data: self.model_dir = data['model_dir'] 85 | if 'load_model_dir' in data: self.load_model_dir = data['load_model_dir'] 86 | if 'pred_dir' in data: self.pred_dir = data['pred_dir'] 87 | if 'load_model' in data: self.load_model = data['load_model'] 88 | if 'train_flag' in data: self.train_flag = data['train_flag'] 89 | if 'batch_norm' in data: self.batch_norm = data['batch_norm'] 90 | if 'display_epoch_num' in data: self.display_epoch_num = data['display_epoch_num'] 91 
| 92 | """ 93 | dump to json file 94 | """ 95 | def dump(self, filename): 96 | with open(filename, 'w') as f: 97 | meta = self.toJson() 98 | json.dump(dict((key, value) for key, value in meta.iteritems() if value != None), f) 99 | """ 100 | load from json file 101 | """ 102 | def load(self, filename): 103 | with open(filename, 'r') as f: 104 | self.fromJson(json.load(f)) 105 | """ 106 | string 107 | """ 108 | def __str__(self): 109 | return str(self.toJson()) 110 | """ 111 | print 112 | """ 113 | def __repr__(self): 114 | return self.__str__() 115 | -------------------------------------------------------------------------------- /code/svd_ops.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python.framework import ops 3 | import tensorflow as tf 4 | 5 | def Hprod(H, u, k): 6 | # H.shape = (batch, n_h) 7 | # u.shape = (n_h,) 8 | alpha = 2* np.dot(H[:, -k:], u[-k:]) / np.dot(u[-k:],u[-k:]) # alpha.shape = (batch,) 9 | H_out = H.copy() 10 | H_out[:, -k:] -= np.outer(alpha, u[-k:]) 11 | return H_out 12 | 13 | def tf_Hprod(H, u, k): 14 | # H.shape = (batch, n_h) 15 | # u.shape = (n_h,) 16 | u_square = tf.tensordot(u[-k:],u[-k:],1) 17 | alpha = 2* tf.tensordot(H[:, -k:], u[-k:],1) / u_square # alpha.shape = (batch,) 18 | H_update = tf.identity(H[:,-k:]) 19 | #H_update = tf.subtract(H_update, tf.einsum('i,j->ij',alpha, u[-k:])) 20 | H_update = tf.subtract(H_update, tf.expand_dims(alpha,1) * tf.expand_dims(u[-k:],0)) 21 | 22 | H_out = tf.concat([H[:,0 :-k], H_update], axis=1) 23 | return H_out 24 | 25 | def Hgrad(H, u, G, k): # unused 26 | # H.shape = (batch, n_h) 27 | # u.shape = (n_h,) 28 | # G.shape = (batch, n_h) 29 | alpha = 2* np.dot(H[:, -k:], u[-k:]) # alpha.shape = (batch,) 30 | beta = 2* np.dot(G[:, -k:], u[-k:]) # beta.shape = (batch,) 31 | u_bar = -np.dot(alpha,G) - np.dot(beta,H) + np.dot(alpha,beta)*u # sum of gradient within the batch: averaging needed??? 32 | G_out = G.copy() 33 | G_out -= np.outer(beta,u) 34 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 35 | 36 | def tf_Hgrad(H, u, G, k): 37 | # H.shape = (batch, n_h) 38 | # u.shape = (n_h,) 39 | # G.shape = (batch, n_h) 40 | u_square = tf.tensordot(u[-k:],u[-k:],1) 41 | alpha = 2* tf.tensordot(H[:, -k:], u[-k:],1) / u_square # alpha.shape = (batch,) 42 | beta = 2* tf.tensordot(G[:, -k:], u[-k:],1) / u_square # beta.shape = (batch,) 43 | 44 | u_bar = -tf.tensordot(alpha,G[:,-k:],1) - tf.tensordot(beta,H[:,-k:],1) + tf.tensordot(alpha,beta,1)*u[-k:] # sum of gradient within the batch: averaging needed??? 
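# With H_out = H (I - 2 u u^T / u^T u), the backward pass gives
#   dL/du = -G^T alpha - H^T beta + (alpha . beta) u   and   dL/dH = G - beta u^T,
# where alpha = 2 H u / u^T u and beta = 2 G u / u^T u (as computed above); only
# the trailing k entries of u are active, so its leading n_h - k entries
# (zero by construction) are concatenated back in below.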
45 | u_bar = tf.concat([u[0 :-k],u_bar], axis=0) 46 | 47 | 48 | G_update = tf.identity(G[:,-k:]) 49 | delta_G = tf.expand_dims(beta, 1) * tf.reshape(u[-k: ], shape=(1 , k)) 50 | G_update = tf.subtract(G_update, delta_G) 51 | G_out = tf.concat([G[:,0 :-k], G_update], axis=1) 52 | 53 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 54 | ###### FP definition ######## 55 | 56 | def np_svdProd(H,U): 57 | #U_shape = U.get_shape().as_list() 58 | U_shape = U.shape 59 | n_r = U_shape[0]; n_h = U_shape[1] 60 | assert( H.shape[1] == n_h) 61 | H_copy = H.copy() 62 | for i in range(0, n_r): 63 | H_copy = Hprod(H_copy, U[i], n_h-i) 64 | return H_copy 65 | 66 | def np_svdProd_inv(H,U): 67 | #U_shape = U.get_shape().as_list() 68 | U_shape = U.shape 69 | n_r = U_shape[0]; n_h = U_shape[1] 70 | assert( H.shape[1] == n_h) 71 | H_copy = H.copy() 72 | for i in range(n_r-1,-1,-1): 73 | H_copy = Hprod(H_copy, U[i], n_h-i) 74 | return H_copy 75 | ###### BP definition ######### 76 | 77 | def svdProdGrad(op, grad): 78 | H = op.inputs[0] 79 | U = op.inputs[1] 80 | 81 | #return H, grad 82 | 83 | U_shape = U.get_shape().as_list() 84 | n_r = U_shape[0]; n_h = U_shape[1] 85 | #batch = H.get_shape().as_list()[0] 86 | #assert( H.get_shape().as_list()[1] == n_h) 87 | 88 | H_hist = [tf.zeros_like(H, dtype=tf.float32)]*n_r 89 | 90 | H_hist[0] = tf.add(H_hist[0], H) 91 | for i in range(0, n_r-1): 92 | H_hist[i+1] = tf_Hprod( H_hist[i], U[i,:], n_h-i) 93 | 94 | U_bar = [tf.zeros_like(U[0,:], dtype=tf.float32)] * n_r 95 | G = grad 96 | 97 | for i in range(n_r-1, -1, -1): 98 | G, U_bar[i] = tf_Hgrad(H_hist[i], U[i], G, n_h-i) 99 | U_grad = tf.stack(U_bar) 100 | 101 | return G, U_grad #the propagated gradient with respect to the first and second argument respectively 102 | 103 | def svdProdGrad_inv(op, grad): 104 | H = op.inputs[0] 105 | U = op.inputs[1] 106 | 107 | U_shape = U.get_shape().as_list() 108 | n_r = U_shape[0]; n_h = U_shape[1] 109 | 110 | H_hist = [tf.zeros_like(H, dtype=tf.float32)]*n_r 111 | 112 | H_hist[n_r-1] = tf.add(H_hist[n_r-1], H) 113 | for i in range(n_r-1, 0, -1): 114 | H_hist[i-1] = tf_Hprod( H_hist[i], U[i,:], n_h-i) 115 | 116 | U_bar = [tf.zeros_like(U[0,:], dtype=tf.float32)] * n_r 117 | G = grad 118 | 119 | for i in range(0, n_r): 120 | G, U_bar[i] = tf_Hgrad(H_hist[i], U[i], G, n_h-i) 121 | U_grad = tf.stack(U_bar) 122 | 123 | return G, U_grad #the propagated gradient with respect to the first and second argument respectively 124 | 125 | ###### TF operator definition ####### 126 | 127 | def py_func(func, inp, Tout, stateful=True, name=None, grad=None): 128 | 129 | # Need to generate a unique name to avoid duplicates: 130 | rnd_name = 'PyFuncGrad' + str(np.random.randint(0, 1E+8)) 131 | 132 | tf.RegisterGradient(rnd_name)(grad) # see _MySquareGrad for grad example 133 | g = tf.get_default_graph() 134 | with g.gradient_override_map({"PyFunc": rnd_name}): 135 | return tf.py_func(func, inp, Tout, stateful=stateful, name=name) 136 | 137 | 138 | def tf_svdProd(H,U, name=None): 139 | 140 | with ops.name_scope(name, "svdProd",[H,U] )as name: 141 | z = py_func(np_svdProd, 142 | [H,U], 143 | [tf.float32], 144 | name=name, 145 | grad=svdProdGrad) # <-- here's the call to the gradient 146 | return z[0] 147 | 148 | def tf_svdProd_inv(H,U, name=None): 149 | 150 | with ops.name_scope(name, "svdProd_inv",[H,U] )as name: 151 | z = py_func(np_svdProd_inv, 152 | [H,U], 153 | [tf.float32], 154 | name=name, 155 | grad=svdProdGrad_inv) # <-- here's the call to the gradient 156 | return z[0] 157 | 
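###### sanity check (added commentary) ######
# A small self-contained NumPy check, not part of the original file: because each
# Householder reflection House(u) is symmetric and orthogonal, applying the product of
# reflectors with np_svdProd and then the same product in reverse order with
# np_svdProd_inv must return H unchanged. This is the orthogonality that the SVD-style
# parameterization in spectral_rnn._svdlinear relies on. The sizes and seed below are
# arbitrary and only for illustration.
if __name__ == "__main__":
    n_h, n_r, batch = 8, 3, 4
    rng = np.random.RandomState(0)
    H_demo = rng.randn(batch, n_h)
    U_demo = np.triu(rng.randn(n_r, n_h))   # rows are reflectors, upper triangular as in _svdlinear
    H_back = np_svdProd_inv(np_svdProd(H_demo, U_demo), U_demo)
    assert np.allclose(H_demo, H_back)      # round trip through Q and Q^T recovers H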
-------------------------------------------------------------------------------- /code/magma_svd_ops/svd_block_prod_gpu.cc: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include "magma_v2.h"
4 | #include "tensorflow/core/framework/op.h"
5 | #include "tensorflow/core/framework/shape_inference.h"
6 | #include "tensorflow/core/framework/op_kernel.h"
7 | #include "tensorflow/core/framework/tensor_shape.h"
8 | #include "tensorflow/core/platform/default/logging.h"
9 | //#define PRINT_DEBUG
10 |
11 | using namespace tensorflow;
12 |
13 | REGISTER_OP("SvdBlockProdGpu")
14 | .Input("hidden_state: float")
15 | .Input("householder_matrix: float")
16 | .Attr("is_forward: bool = true")
17 | .Output("output_state: float")
18 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) {
19 | c->set_output(0, c->input(0));
20 | return Status::OK();
21 | });
22 |
23 |
24 | // TODO: move the two declarations to a .hpp
25 | struct workspace {
26 | magmaFloat_ptr *T_array;
27 | magmaFloat_ptr *Tau_array;
28 | magmaFloat_ptr *Twork_array;
29 | magmaFloat_ptr *V_array;
30 | magmaFloat_ptr T;
31 | magmaFloat_ptr tau;
32 | magmaFloat_ptr twork;
33 | magmaFloat_ptr dwork;
34 | magmaFloat_ptr dworkvt;
35 | };
36 |
37 | int SvdBlockProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r, magma_queue_t queue, workspace ws, const bool isForward=true);
38 |
39 | class SvdBlockProdGpuOp : public OpKernel {
40 | private:
41 | bool _isForward;
42 | magma_queue_t _queue;
43 | bool _queue_created;
44 | bool _persistent_tensor_created;
45 | PersistentTensor _T_array;
46 | PersistentTensor _Tau_array;
47 | PersistentTensor _Twork_array;
48 | PersistentTensor _V_array;
49 | PersistentTensor _T;
50 | PersistentTensor _tau;
51 | PersistentTensor _twork;
52 | int _batchCount;
53 | public:
54 | explicit SvdBlockProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {
55 | // Get the index of the value to preserve
56 | OP_REQUIRES_OK(context,
57 | context->GetAttr("is_forward", &_isForward));
58 | // printf("Calling magma init...\n");
59 | magma_int_t stat = magma_init();
60 | if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); }
61 | _queue_created = false;
62 | _persistent_tensor_created = false;
63 | // create array space
64 | _batchCount = 1;
65 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_T_array, nullptr));
66 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Tau_array, nullptr));
67 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Twork_array, nullptr));
68 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_V_array, nullptr));
69 | }
70 |
71 | ~SvdBlockProdGpuOp() override {
72 | // printf("Calling magma finalize...\n");
73 | if (_queue_created) { // only destroy a queue that was actually created in Compute()
74 | // printf("destroying magma queue!\n");
75 | magma_queue_destroy(_queue);
76 | _queue_created = false;
77 | }
78 | magma_finalize();
79 | }
80 |
81 | void Compute(OpKernelContext* context) override {
82 | // Check number of inputs
83 | OP_REQUIRES(context, context->num_inputs() == 2,
84 | errors::InvalidArgument("SvdBlockProd expects 2 inputs."));
85 |
86 | // Grab the input tensor
87 | const Tensor& H = context->input(0);
88 | const Tensor& U = context->input(1);
89 | // Shapes of input
90 | const TensorShape& H_shape = H.shape();
91 | const TensorShape&
U_shape = U.shape(); 92 | 93 | const int n_h = H_shape.dim_size(1); 94 | const int n_r = U_shape.dim_size(0); 95 | const int batch = H_shape.dim_size(0); 96 | // Perform dimension check 97 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 98 | errors::InvalidArgument("SvdBlockProd expects H to be a 2-D matrix.")); 99 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 100 | errors::InvalidArgument("SvdBlockProd expects U to be a 2-D matrix.")); 101 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 102 | errors::InvalidArgument("The second dimension of H and U does not match!")); 103 | 104 | // Create an output tensor 105 | Tensor* H_out = NULL; 106 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&H_out)); 107 | 108 | // obtain data 109 | const float* H_data = H.flat().data(); 110 | const float* U_data = U.flat().data(); 111 | float* H_out_data = H_out->flat().data(); 112 | /* 113 | // test 114 | int idx =0; 115 | std::printf( "Before:\n"); 116 | for(int i=0; i < H_shape.dim_size(0); i++){ 117 | for(int j=0; j < H_shape.dim_size(1); j++){ 118 | idx = i * H_shape.dim_size(1) + j; 119 | std::printf ("H(%d,%d)=%4.4f, %4.4f\n", i, j, H.flat()(idx), H_out_data[idx]); 120 | } 121 | } 122 | */ 123 | // Allocate temp tensors 124 | Tensor *T_array = _T_array.AccessTensor(context); 125 | Tensor *Tau_array = _Tau_array.AccessTensor(context); 126 | Tensor *Twork_array = _Twork_array.AccessTensor(context); 127 | Tensor *V_array = _V_array.AccessTensor(context); 128 | Tensor dwork; 129 | Tensor dworkvt; 130 | int ldwork = n_h, ldworkvt = std::max(n_h,batch); 131 | if (!_persistent_tensor_created) { 132 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_T, nullptr)); 133 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r}), &_tau, nullptr)); 134 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_twork, nullptr)); 135 | _persistent_tensor_created = true; 136 | } 137 | Tensor *T = _T.AccessTensor(context); 138 | Tensor *tau = _tau.AccessTensor(context); 139 | Tensor *twork = _twork.AccessTensor(context); 140 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dwork)); 141 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dworkvt)); 142 | workspace ws; 143 | ws.T_array = reinterpret_cast(T_array->flat().data()); 144 | ws.Tau_array = reinterpret_cast(Tau_array->flat().data()); 145 | ws.Twork_array = reinterpret_cast(Twork_array->flat().data()); 146 | ws.V_array = reinterpret_cast(V_array->flat().data()); 147 | ws.T = T->flat().data(); 148 | ws.tau = tau->flat().data(); 149 | ws.twork = twork->flat().data(); 150 | ws.dwork = dwork.flat().data(); 151 | ws.dworkvt = dworkvt.flat().data(); 152 | #if GOOGLE_CUDA 153 | int op_status; 154 | if (!_queue_created) { 155 | _queue_created = true; 156 | magma_queue_create(0, &_queue); 157 | // printf("created magma queue at %p!\n", reinterpret_cast(_queue)); 158 | } 159 | op_status = SvdBlockProdGpuKernelLauncher(H_data, U_data, H_out_data, n_h, batch, n_r, _queue, ws, _isForward); 160 | #endif 161 | } 162 | }; 163 | 164 | REGISTER_KERNEL_BUILDER(Name("SvdBlockProdGpu").Device(DEVICE_GPU), SvdBlockProdGpuOp); 165 | -------------------------------------------------------------------------------- /code/magma_svd_ops/gpu_unit_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 
| import numpy as np 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import sparse_ops 6 | np.set_printoptions(threshold=np.nan) 7 | n_h = 128; n_b = 512; n_r = 16 8 | print_res = (n_h * n_r < 100) 9 | 10 | 11 | def Hgrad(H, u, G, k): 12 | # H.shape = (batch, n_h) 13 | # u.shape = (n_h,) 14 | # G.shape = (batch, n_h) 15 | alpha = 2* np.dot(H[:, -k:], u[-k:]) / np.dot(u[-k:],u[-k:]) # alpha.shape = (batch,) 16 | beta = 2* np.dot(G[:, -k:], u[-k:]) / np.dot(u[-k:],u[-k:]) # beta.shape = (batch,) 17 | u_bar = np.zeros_like(u) 18 | u_bar[-k:] += -np.dot(alpha,G[:,-k:]) - np.dot(beta,H[:,-k:]) + np.dot(alpha,beta)*u[-k:] # sum of gradient within the batch: averaging needed??? 19 | G_out = G.copy() 20 | G_out[:,-k:] -= np.outer(beta,u[-k:]) 21 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 22 | 23 | rng = np.random.RandomState(13) 24 | H_ = rng.uniform(-np.sqrt(6. / (n_b + n_h)), np.sqrt(6. / (n_b + n_h)), (n_b, n_h)).astype(np.float32) 25 | 26 | U_full = rng.normal(0, 0.01, (n_h, n_r)).astype(np.float32) 27 | U_ = np.tril(U_full) 28 | norms_U_ = np.linalg.norm(U_, axis=0) 29 | #U_ = np.transpose(1. / norms_U_ * U_) 30 | U_ = np.transpose(U_) 31 | 32 | T = np.triu( np.dot(U_, U_.T)) 33 | for ii in range(n_r): 34 | T[ii,ii] /=2 35 | 36 | T_inverse = np.linalg.inv(T) 37 | 38 | #for ii in range(n_r): 39 | # for jj in range(n_r): 40 | # print ' %6.3f'%(T_inverse[ii,jj]), 41 | # print '' 42 | 43 | 44 | 45 | if print_res: 46 | print "T: ", T 47 | print "T_inverse: ", np.linalg.inv(T) 48 | 49 | print "H: ",H_ 50 | print "U: ",U_ 51 | 52 | 53 | ############################################################ 54 | #Forward 55 | ############################################################ 56 | H1 = [H_]*(n_r+1) 57 | 58 | for i in range(0,n_r): 59 | alpha = np.dot(H1[i], U_[i]) 60 | #print 'alpha: ', 2*alpha 61 | H1[i+1] = H1[i] - 2 * np.outer(alpha, U_[i]) / np.dot(U_[i],U_[i]) 62 | 63 | H2 = H1[-1] 64 | 65 | 66 | G = np.ones_like(H_) 67 | Grad_U = np.ones_like(U_) 68 | 69 | for i in range(n_r-1, -1, -1): 70 | G, Grad_U[i] = Hgrad(H1[i], U_[i], G, n_h-i) 71 | 72 | if print_res: 73 | print "Hprod: ",H2 74 | print "Grad_H: ",G 75 | print "Grad_U: ",Grad_U 76 | 77 | ################ BLAS3 VER ######################## 78 | #Blas_G = np.ones_like(H_.T) 79 | #Blas_U = U_.T 80 | #Blas_H = H_.T 81 | #Grad_Q = np.dot( Blas_G , Blas_H.T) 82 | #print "Grad_Q: ", Grad_Q 83 | #print "U * T: ", np.dot(Blas_U, np.linalg.inv(T.T)) 84 | 85 | #R =np.dot( np.dot( Grad_Q.T , Blas_U ), np.linalg.inv(T.T)) 86 | #print "R: ", R 87 | #print "U * T^T: ", np.dot(Blas_U, np.linalg.inv(T.T).T) 88 | #S =np.dot( np.dot( Grad_Q , Blas_U ), np.linalg.inv(T.T).T) 89 | 90 | #M =np.dot(np.dot( np.linalg.inv(T.T) , Blas_U.T) , R) 91 | #print "M: ", M 92 | #i_lower = np.tril_indices(2, -1) 93 | #M[i_lower] = M.T[i_lower] 94 | 95 | #print "P: ", M 96 | 97 | #Hprod_BLAS3 = np.eye(3) - np.dot(np.dot( Blas_U, np.linalg.inv(T.T)), Blas_U.T) 98 | #Hprod_BLAS3 = np.dot( Hprod_BLAS3, Blas_H) 99 | 100 | #Grad_U_BLAS3 = np.dot( Blas_U, M )- S - R 101 | 102 | #print "Hprod_BLAS3: ", Hprod_BLAS3.T 103 | #print "Grad_U_BLAS3: ", Grad_U_BLAS3.T 104 | 105 | 106 | ############################################################ 107 | #Backward 108 | ############################################################ 109 | H1 = [H_]*(n_r+1) 110 | for i in range(n_r-1,-1,-1): 111 | alpha = np.dot(H1[i+1], U_[i]) 112 | H1[i] = H1[i+1] - 2 * np.outer(alpha, U_[i]) / 
np.dot(U_[i],U_[i]) 113 | 114 | 115 | H2_back = H1[0] 116 | 117 | 118 | G_back = np.ones_like(H_) 119 | Grad_U_back = np.ones_like(U_) 120 | 121 | for i in range(0,n_r): 122 | G_back, Grad_U_back[i] = Hgrad(H1[i+1], U_[i], G_back, n_h-i) 123 | 124 | if print_res: 125 | print "H_inv_prod: ",H2_back 126 | print "Grad_inv_H: ",G_back 127 | print "Grad_inv_U: ",Grad_U_back 128 | ############################################################ 129 | ############################################################ 130 | svd_block_prod_module = tf.load_op_library('./svd_block_prod_gpu.so') 131 | ############################################################ 132 | 133 | grad_svd_block_prod_module = tf.load_op_library('./grad_svd_block_prod_gpu.so') 134 | 135 | @ops.RegisterGradient("SvdBlockProdGpu") 136 | def _svd_block_prod_gpu_grad(op, grad): 137 | H = op.inputs[0] 138 | U = op.inputs[1] 139 | isForward = op.get_attr("is_forward") 140 | return grad_svd_block_prod_module.grad_svd_block_prod_gpu(H,U,grad, isForward) 141 | ############################################################ 142 | svd_prod_module = tf.load_op_library('../cuda_svd_ops/svd_prod_gpu.so') 143 | ############################################################ 144 | grad_svd_prod_module = tf.load_op_library('../cuda_svd_ops/grad_svd_prod_gpu.so') 145 | 146 | @ops.RegisterGradient("SvdProdGpu") 147 | def _svd_prod_gpu_grad(op, grad): 148 | H = op.inputs[0] 149 | U = op.inputs[1] 150 | return grad_svd_prod_module.grad_svd_prod_gpu(H,U,grad) 151 | ############################################################ 152 | svd_inv_prod_module = tf.load_op_library('../cuda_svd_ops/svd_inv_prod_gpu.so') 153 | grad_svd_inv_prod_module = tf.load_op_library('../cuda_svd_ops/grad_svd_inv_prod_gpu.so') 154 | 155 | @ops.RegisterGradient("SvdInvProdGpu") 156 | def _svd_inv_prod_gpu_grad(op, grad): 157 | H = op.inputs[0] 158 | U = op.inputs[1] 159 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 160 | ############################################################ 161 | with tf.Session() as sess: 162 | 163 | H = tf.constant(H_, dtype=tf.float32) 164 | U = tf.constant(U_, dtype = tf.float32) 165 | V = tf.constant(U_, dtype = tf.float32) 166 | 167 | U = tf.matrix_band_part(U, 0, -1) # upper triangular 168 | V = tf.matrix_band_part(V, 0, -1) # upper triangular 169 | 170 | z = svd_block_prod_module.svd_block_prod_gpu(H,U, True) 171 | blas2_z = svd_prod_module.svd_prod_gpu(H,U) 172 | 173 | z2 = svd_block_prod_module.svd_block_prod_gpu(H,V, False) 174 | blas2_z2 = svd_inv_prod_module.svd_inv_prod_gpu(H,V) 175 | 176 | gr = tf.gradients(z, [H,U]) 177 | blas2_gr = tf.gradients(blas2_z, [H,U]) 178 | 179 | gr2 = tf.gradients(z2, [H,V]) 180 | blas2_gr2 = tf.gradients(blas2_z2, [H,V]) 181 | 182 | tf.global_variables_initializer().run() 183 | 184 | if print_res: 185 | print('H,U and product: ',H.eval(), U.eval(),z.eval()) 186 | print('BLAS2 H,U and product: ',H.eval(), U.eval(),blas2_z.eval()) 187 | print('grad_H, grad_U: ' ,gr[0].eval(), gr[1].eval()) 188 | 189 | print('H,U and product: ',z.eval(), V.eval(),z2.eval()) 190 | print('grad_H, grad_U: ' ,gr2[0].eval(), gr2[1].eval()) 191 | 192 | 193 | print "Forward Hprod error:", np.amax( abs(H2 - z.eval())) 194 | print "Forward Hgrad G error:", np.amax( abs(G - gr[0].eval())) 195 | print "Forward Hgrad U error:", np.amax( abs(Grad_U - gr[1].eval())) 196 | print "BLAS2 Forward Hprod error:", np.amax( abs(H2 - blas2_z.eval())) 197 | print "BLAS2 Forward Hgrad G error:", np.amax( abs(G - blas2_gr[0].eval())) 198 | print 
"BLAS2 Forward Hgrad U error:", np.amax( abs(Grad_U - blas2_gr[1].eval())) 199 | print "Backward Hprod error:", np.amax( abs(H2_back - z2.eval())) 200 | print "Backward Hgrad G error:", np.amax( abs(G_back - gr2[0].eval())) 201 | print "Backward Hgrad U error:", np.amax( abs(Grad_U_back - gr2[1].eval())) 202 | print "BLAS2 Backward Hprod error:", np.amax( abs(H2_back - blas2_z2.eval())) 203 | print "BLAS2 Backward Hgrad G error:", np.amax( abs(G_back - blas2_gr2[0].eval())) 204 | print "BLAS2 Backward Hgrad U error:", np.amax( abs(Grad_U_back - blas2_gr2[1].eval())) 205 | -------------------------------------------------------------------------------- /code/magma_svd_ops/grad_svd_block_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "magma_v2.h" 4 | #include "tensorflow/core/framework/op.h" 5 | #include "tensorflow/core/framework/shape_inference.h" 6 | #include "tensorflow/core/framework/op_kernel.h" 7 | #include "tensorflow/core/framework/tensor_shape.h" 8 | #include "tensorflow/core/platform/default/logging.h" 9 | 10 | using namespace tensorflow; 11 | 12 | REGISTER_OP("GradSvdBlockProdGpu") 13 | .Input("hidden_state: float") 14 | .Input("householder_matrix: float") 15 | .Input("gradient_backprop: float") 16 | .Attr("is_forward: bool = true") 17 | .Output("grad_hidden_state: float") 18 | .Output("grad_householder_matrix: float") 19 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 20 | c->set_output(0, c->input(0)); 21 | c->set_output(1, c->input(1)); 22 | return Status::OK(); 23 | }); 24 | 25 | #include "tensorflow/core/framework/op_kernel.h" 26 | 27 | struct grad_workspace { 28 | magmaFloat_ptr *T_array; 29 | magmaFloat_ptr *Tau_array; 30 | magmaFloat_ptr *Twork_array; 31 | magmaFloat_ptr *V_array; 32 | magmaFloat_ptr T; 33 | magmaFloat_ptr tau; 34 | magmaFloat_ptr twork; 35 | magmaFloat_ptr dwork; 36 | magmaFloat_ptr dworkvt; 37 | magmaFloat_ptr Q_grad; 38 | magmaFloat_ptr UT; 39 | }; 40 | 41 | int GradSvdBlockProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* G_grad, const int n_h, const int batch, const int n_r, magma_queue_t queue, grad_workspace ws, const bool isForward); 42 | 43 | class GradSvdBlockProdGpuOp : public OpKernel { 44 | private: 45 | bool _isForward; 46 | magma_queue_t _queue; 47 | bool _queue_created; 48 | bool _persistent_tensor_created; 49 | PersistentTensor _T_array; 50 | PersistentTensor _Tau_array; 51 | PersistentTensor _Twork_array; 52 | PersistentTensor _V_array; 53 | PersistentTensor _T; 54 | PersistentTensor _tau; 55 | PersistentTensor _twork; 56 | PersistentTensor _Q_grad; 57 | PersistentTensor _UT; 58 | int _batchCount; 59 | public: 60 | explicit GradSvdBlockProdGpuOp(OpKernelConstruction* context) : OpKernel(context) { 61 | // Get the index of the value to preserve 62 | OP_REQUIRES_OK(context, 63 | context->GetAttr("is_forward", &_isForward)); 64 | // printf("Calling magma init in grad...\n"); 65 | magma_int_t stat = magma_init(); 66 | if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); } 67 | _queue_created = false; 68 | _persistent_tensor_created = false; 69 | // create array space 70 | _batchCount = 1; 71 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_T_array, nullptr)); 72 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Tau_array, nullptr)); 73 | OP_REQUIRES_OK(context, 
context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Twork_array, nullptr));
74 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_V_array, nullptr));
75 | }
76 |
77 | ~GradSvdBlockProdGpuOp() override {
78 | // printf("Calling magma finalize in grad...\n");
79 | if (_queue_created) { // only destroy a queue that was actually created in Compute()
80 | // printf("destroying magma queue!\n");
81 | magma_queue_destroy(_queue);
82 | _queue_created = false;
83 | }
84 | magma_finalize();
85 | }
86 |
87 | void Compute(OpKernelContext* context) override {
88 | // Check number of inputs
89 | OP_REQUIRES(context, context->num_inputs() == 3,
90 | errors::InvalidArgument("GradSvdBlockProd expects 3 inputs."));
91 |
92 | // Grab the input tensor
93 | const Tensor& H = context->input(0);
94 | const Tensor& U = context->input(1);
95 | const Tensor& G = context->input(2);
96 | auto input = H.flat();
97 |
98 | // Shapes of input
99 | const TensorShape& H_shape = H.shape();
100 | const TensorShape& U_shape = U.shape();
101 | const TensorShape& G_shape = G.shape();
102 |
103 | const int n_h = H_shape.dim_size(1);
104 | const int n_r = U_shape.dim_size(0);
105 | const int batch = H_shape.dim_size(0);
106 | // Perform dimension check
107 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape),
108 | errors::InvalidArgument("SvdBlockProd expects H to be a 2-D matrix."));
109 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape),
110 | errors::InvalidArgument("SvdBlockProd expects U to be a 2-D matrix."));
111 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(G_shape),
112 | errors::InvalidArgument("SvdBlockProd expects G to be a 2-D matrix."));
113 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1),
114 | errors::InvalidArgument("The second dimension of H and U does not match!"));
115 | OP_REQUIRES(context, G_shape.dim_size(0) == H_shape.dim_size(0),
116 | errors::InvalidArgument("The first dimension of G and H does not match!"));
117 | OP_REQUIRES(context, G_shape.dim_size(1) == H_shape.dim_size(1),
118 | errors::InvalidArgument("The second dimension of G and H does not match!"));
119 |
120 | // Create an output tensor
121 | Tensor* Grad_H = NULL;
122 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&Grad_H));
123 | Tensor* Grad_U = NULL;
124 | OP_REQUIRES_OK(context, context->allocate_output(1, U_shape,&Grad_U));
125 |
126 | // obtain data
127 | const float* H_data = H.flat().data();
128 | const float* U_data = U.flat().data();
129 | const float* G_data = G.flat().data();
130 | float* Grad_H_data = Grad_H->flat().data();
131 | float* Grad_U_data = Grad_U->flat().data();
132 | // Allocate temp tensors
133 | Tensor *T_array = _T_array.AccessTensor(context);
134 | Tensor *Tau_array = _Tau_array.AccessTensor(context);
135 | Tensor *Twork_array = _Twork_array.AccessTensor(context);
136 | Tensor *V_array = _V_array.AccessTensor(context);
137 | Tensor dwork;
138 | Tensor dworkvt;
139 | int ldwork = n_h, ldworkvt = std::max(n_h,batch);
140 | if (!_persistent_tensor_created) {
141 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_T, nullptr));
142 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r}), &_tau, nullptr));
143 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_twork, nullptr));
144 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_h*n_h}), &_Q_grad, nullptr));
145 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, 
TensorShape({n_h*n_r}), &_UT, nullptr)); 146 | _persistent_tensor_created = true; 147 | } 148 | Tensor *T = _T.AccessTensor(context); 149 | Tensor *tau = _tau.AccessTensor(context); 150 | Tensor *twork = _twork.AccessTensor(context); 151 | Tensor *Q_grad = _Q_grad.AccessTensor(context); 152 | Tensor *UT = _UT.AccessTensor(context); 153 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dwork)); 154 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dworkvt)); 155 | grad_workspace ws; 156 | ws.T_array = reinterpret_cast(T_array->flat().data()); 157 | ws.Tau_array = reinterpret_cast(Tau_array->flat().data()); 158 | ws.Twork_array = reinterpret_cast(Twork_array->flat().data()); 159 | ws.V_array = reinterpret_cast(V_array->flat().data()); 160 | ws.T = T->flat().data(); 161 | ws.tau = tau->flat().data(); 162 | ws.twork = twork->flat().data(); 163 | ws.dwork = dwork.flat().data(); 164 | ws.dworkvt = dworkvt.flat().data(); 165 | ws.Q_grad = Q_grad->flat().data(); 166 | ws.UT = UT->flat().data(); 167 | #if GOOGLE_CUDA 168 | if (!_queue_created) { 169 | _queue_created = true; 170 | magma_queue_create(0, &_queue); 171 | // printf("created magma queue at %p!\n", reinterpret_cast(_queue)); 172 | } 173 | GradSvdBlockProdGpuKernelLauncher(H_data, U_data, G_data, Grad_H_data, Grad_U_data, n_h, batch, n_r, _queue, ws, _isForward); 174 | #endif 175 | } 176 | }; 177 | 178 | REGISTER_KERNEL_BUILDER(Name("GradSvdBlockProdGpu").Device(DEVICE_GPU), GradSvdBlockProdGpuOp); 179 | -------------------------------------------------------------------------------- /code/spectral_rnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.util import nest 3 | import numpy as np 4 | import os 5 | #from svd_ops import tf_svdProd, tf_svdProd_inv 6 | from tensorflow.python.framework import ops 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops import sparse_ops 9 | 10 | ############################################################ 11 | ############ BLAS3 version of SVD ops ###################### 12 | ############################################################ 13 | svd_block_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/magma_svd_ops/svd_block_prod_gpu.so') 14 | ############################################################ 15 | grad_svd_block_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/magma_svd_ops/grad_svd_block_prod_gpu.so') 16 | 17 | @ops.RegisterGradient("SvdBlockProdGpu") 18 | def _svd_block_prod_gpu_grad(op, grad): 19 | H = op.inputs[0] 20 | U = op.inputs[1] 21 | isForward = op.get_attr("is_forward") 22 | return grad_svd_block_prod_module.grad_svd_block_prod_gpu(H,U,grad, isForward) 23 | ############################################################ 24 | ############ BLAS2 version of SVD ops ###################### 25 | ############################################################ 26 | svd_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/svd_prod_gpu.so') 27 | grad_svd_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/grad_svd_prod_gpu.so') 28 | @ops.RegisterGradient("SvdProdGpu") 29 | def _svd_prod_gpu_grad(op, grad): 30 | H = op.inputs[0] 31 | U = op.inputs[1] 32 | return grad_svd_prod_module.grad_svd_prod_gpu(H,U,grad) 33 | 34 | 
############################################################ 35 | svd_inv_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/svd_inv_prod_gpu.so') 36 | grad_svd_inv_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/grad_svd_inv_prod_gpu.so') 37 | @ops.RegisterGradient("SvdInvProdGpu") 38 | def _svd_inv_prod_gpu_grad(op, grad): 39 | H = op.inputs[0] 40 | U = op.inputs[1] 41 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 42 | 43 | ############################################################ 44 | 45 | class SpectralRNNCell(tf.contrib.rnn.RNNCell): 46 | """Implements a simple distribution based recurrent unit that keeps moving 47 | averages of the mean map embeddings of features of inputs. 48 | """ 49 | """ 50 | n_h: hidden state size 51 | n_o: output size 52 | n_r: reflector size 53 | variables: pass a dictionary of Variables, and we will not create new ones 54 | backend: blas3, blas2 or python 55 | """ 56 | 57 | def __init__(self, n_h, n_r = None, r_margin = 0.01, 58 | linear_out=False, activation=tf.nn.relu, variables=None, backend="blas3"): 59 | self._n_h = n_h 60 | self._n_r = n_r or n_h//4 61 | self._r_margin = r_margin 62 | 63 | self._linear_out = linear_out 64 | self._activation = activation 65 | self._variables = variables 66 | self._backend = backend 67 | 68 | @property 69 | def state_size(self): 70 | return self._n_h 71 | 72 | @property 73 | def reflector_size(self): 74 | return self._n_r 75 | 76 | @property 77 | def output_size(self): 78 | return self._n_h 79 | 80 | def __call__(self, inputs, state, scope=None): 81 | """ 82 | recur*: r 83 | state*: mu 84 | stats*: phi 85 | _mavg_alphas: alpha vector 86 | """ 87 | with tf.variable_scope(scope or type(self).__name__): 88 | # Compute the output. 89 | """ 90 | o_t = W^o mu_t + b^o 91 | """ 92 | output = _svdlinear([inputs, state], self._n_h, self._n_r, True, r=self._r_margin, scope='output', variables=self._variables, backend=self._backend) 93 | #output = _linear([inputs, state], self._n_h, True, scope='output') 94 | 95 | 96 | if not self._linear_out: 97 | output = self._activation(output, name='output_act') 98 | """ 99 | o_t and mu_t 100 | """ 101 | return (output, output) 102 | 103 | 104 | # No longer publicly expose function in tensorflow. 105 | def _svdlinear(args, output_size, reflector_size, bias, bias_start=0.0, sig_mean = 1.0, r = 0.01, scope=None, variables=None, backend="blas3"): 106 | """Linear map with svd operator 107 | 108 | Args: 109 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 110 | output_size: int, second dimension of W[i]. 111 | bias: boolean, whether to add a bias term or not. 112 | bias_start: starting value to initialize the bias; 0 by default. 113 | sig_mean: initial and "mean" value of singular values, usually set to 1.0, 114 | for ResNet should be set to 0.0 115 | r: singular margin, the allowed margin for singular values 116 | scope: VariableScope for the created subgraph; defaults to "Linear". 
117 | variables: pass a dictionary of Variables, and we will not create new ones 118 | backend: blas3, blas2 or python 119 | 120 | Returns: 121 | A 2D Tensor with shape [batch x output_size] 122 | 123 | Raises: 124 | ValueError: if some of the arguments has unspecified or wrong shape or unknown backend is passed 125 | """ 126 | if args is None or (nest.is_sequence(args) and not args): 127 | raise ValueError("`args` must be specified") 128 | if not nest.is_sequence(args): 129 | args = [args] 130 | 131 | dtype = [a.dtype for a in args][0] 132 | # computation for svd:Hprod 133 | with tf.variable_scope(scope or "svdHprod"): 134 | if variables: 135 | U_full = variables["Householder_U_full"] 136 | else: 137 | U_full = tf.get_variable( 138 | "Householder_U_full", [reflector_size, output_size], dtype=dtype) 139 | U = tf.matrix_band_part(U_full, 0, -1) # upper triangular 140 | if variables: 141 | p = variables["p"] 142 | else: 143 | p = tf.get_variable( 144 | "p", [ output_size], dtype=dtype, 145 | initializer=tf.constant_initializer(np.zeros(output_size))) 146 | Sig = 2*r*(tf.sigmoid(p) - 0.5) + sig_mean 147 | if variables: 148 | V_full = variables["Householder_V_full"] 149 | else: 150 | V_full = tf.get_variable( 151 | "Householder_V_full", [reflector_size, output_size], dtype=dtype) 152 | V = tf.matrix_band_part(V_full, 0, -1) # upper triangular 153 | 154 | 155 | if backend == "python": 156 | svd_term = tf_svdProd( args[1], V) # python operator 157 | svd_term = tf.multiply(svd_term, Sig) 158 | svd_term = tf_svdProd_inv( svd_term, U) # python operator 159 | elif backend == "blas2": 160 | svd_term = svd_prod_module.svd_prod_gpu( args[1], V) # BLAS2 operator 161 | svd_term = tf.multiply(svd_term, Sig) 162 | svd_term = svd_inv_prod_module.svd_inv_prod_gpu( svd_term, U) # BLAS2 operator 163 | elif backend == "blas3": 164 | svd_term = svd_block_prod_module.svd_block_prod_gpu( args[1], V, True) # BLAS3 operator 165 | svd_term = tf.multiply(svd_term, Sig) 166 | svd_term = svd_block_prod_module.svd_block_prod_gpu( svd_term, U, False) # BLAS3 operator 167 | else: 168 | raise ValueError("Unknown backend " + backend) 169 | 170 | 171 | 172 | # Now the computation for the rest 173 | with tf.variable_scope(scope or "svdLinear"): 174 | if variables: 175 | matrix = variables["Matrix"] 176 | else: 177 | matrix = tf.get_variable( 178 | "Matrix", [args[0].shape[1].value, output_size], dtype=dtype) 179 | res = tf.matmul(args[0], matrix) 180 | if not bias: 181 | return res + svd_term 182 | if variables: 183 | bias_term = variables["Bias"] 184 | else: 185 | bias_term = tf.get_variable( 186 | "Bias", [output_size], 187 | dtype=dtype, 188 | initializer=tf.constant_initializer(bias_start, dtype=dtype) 189 | ) 190 | return res + bias_term + svd_term 191 | 192 | def _linear(args, output_size, bias, bias_start=0.0, scope=None): 193 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 194 | 195 | Args: 196 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 197 | output_size: int, second dimension of W[i]. 198 | bias: boolean, whether to add a bias term or not. 199 | bias_start: starting value to initialize the bias; 0 by default. 200 | scope: VariableScope for the created subgraph; defaults to "Linear". 201 | 202 | Returns: 203 | A 2D Tensor with shape [batch x output_size] equal to 204 | sum_i(args[i] * W[i]), where W[i]s are newly created matrices. 205 | 206 | Raises: 207 | ValueError: if some of the arguments has unspecified or wrong shape. 
208 | """ 209 | if args is None or (nest.is_sequence(args) and not args): 210 | raise ValueError("`args` must be specified") 211 | if not nest.is_sequence(args): 212 | args = [args] 213 | 214 | # Calculate the total size of arguments on dimension 1. 215 | total_arg_size = 0 216 | shapes = [a.get_shape().as_list() for a in args] 217 | for shape in shapes: 218 | if len(shape) != 2: 219 | raise ValueError( 220 | "Linear is expecting 2D arguments: %s" % 221 | str(shapes)) 222 | if not shape[1]: 223 | raise ValueError( 224 | "Linear expects shape[1] of arguments: %s" % 225 | str(shapes)) 226 | else: 227 | total_arg_size += shape[1] 228 | 229 | dtype = [a.dtype for a in args][0] 230 | 231 | # Now the computation. 232 | with tf.variable_scope(scope or "Linear"): 233 | matrix = tf.get_variable( 234 | "Matrix", [total_arg_size, output_size], dtype=dtype) 235 | if len(args) == 1: 236 | res = tf.matmul(args[0], matrix) 237 | else: 238 | res = tf.matmul(tf.concat(args, 1), matrix) 239 | if not bias: 240 | return res 241 | bias_term = tf.get_variable( 242 | "Bias", [output_size], 243 | dtype=dtype, 244 | initializer=tf.constant_initializer(bias_start, dtype=dtype) 245 | ) 246 | return res + bias_term 247 | -------------------------------------------------------------------------------- /code/rnn.py: -------------------------------------------------------------------------------- 1 | import math, time 2 | import tensorflow as tf 3 | import numpy as np 4 | import spectral_rnn 5 | import Params 6 | import sys,os 7 | from tensorflow.python.framework import ops 8 | from tensorflow.python.ops import array_ops 9 | from tensorflow.python.ops import sparse_ops 10 | 11 | 12 | class RNNModel (object): 13 | def __init__(self, params): 14 | self.rnn_cell = None 15 | # feature 16 | self.x = tf.placeholder("float", [None, params.time_steps, params.input_size]) 17 | # label 18 | self.y = tf.placeholder("float", [None, params.output_size]) 19 | # train_flag placeholder 20 | self.train_flag = tf.placeholder(tf.bool, [], name="train_flag") 21 | # learning rate placeholder 22 | self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate') 23 | 24 | self.init_epoch = 0 25 | print 'Var names: ', self.x.name, self.y.name, self.train_flag.name, self.learning_rate.name 26 | 27 | sys.stdout.flush() 28 | # set random seed before build the graph 29 | tf.set_random_seed(params.random_seed) 30 | 31 | # build graph 32 | logits = self.build(params) 33 | 34 | # prediction 35 | # Define loss and optimizer 36 | # evaluation 37 | if params.regression_flag: 38 | self.pred = logits 39 | self.loss_op = tf.reduce_mean(tf.pow(self.pred-self.y, 2)) 40 | self.accuracy = self.loss_op 41 | else: 42 | self.pred = tf.nn.softmax(logits) 43 | self.loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( 44 | logits=logits, labels=self.y)) 45 | correct_pred = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.y, 1)) 46 | self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 47 | 48 | config = tf.ConfigProto(device_count={'GPU' : int(params.gpu_flag)}) 49 | config.gpu_options.allow_growth = True 50 | config.gpu_options.per_process_gpu_memory_fraction = 0.9 51 | # running session 52 | self.session = tf.Session(config=config) 53 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) 54 | self.train_op = self.optimizer.minimize(self.loss_op) 55 | 56 | def __enter__(self): 57 | return self 58 | def __exit__(self, exc_type, exc_value, traceback): 59 | self.close() 60 | 61 | """ 62 | call this function to 
destroy globally defined variables in tensorflow 63 | """ 64 | def close(self): 65 | self.session.close() 66 | tf.reset_default_graph() 67 | 68 | 69 | def set_cell(self, params): 70 | if params.cell == "LSTM": 71 | self.rnn_cell = tf.contrib.rnn.BasicLSTMCell( 72 | num_units=params.num_units 73 | ) 74 | elif params.cell == "RNN": 75 | self.rnn_cell = tf.contrib.rnn.BasicRNNCell( 76 | num_units=params.num_units 77 | ) 78 | elif params.cell == "SpectralRNN": 79 | self.rnn_cell = spectral_rnn.SpectralRNNCell( 80 | n_h=params.num_units, 81 | n_r=params.r_size, 82 | r_margin = params.r_margin 83 | ) 84 | else: 85 | assert 0, "unsupported cell %s" % (params.cell) 86 | 87 | def build(self, params): 88 | 89 | self.set_cell(params) 90 | # last linear layer 91 | last_w = tf.get_variable("last_w", initializer=tf.truncated_normal([self.rnn_cell.output_size, params.output_size], stddev=0.1)) 92 | last_b = tf.get_variable("last_b", initializer=tf.truncated_normal([params.output_size], stddev=0.1)) 93 | 94 | # Unstack to get a list of 'time_steps' tensors of shape (batch_size, n_input) 95 | # assume time_steps is on axis 1 96 | x = tf.unstack(self.x, params.time_steps, 1) 97 | # get RNN cell output 98 | output, states = tf.contrib.rnn.static_rnn(self.rnn_cell, x, dtype=np.float32) 99 | # Apply Dropout 100 | output = tf.cond(self.train_flag, lambda: tf.nn.dropout(output, params.dropout_keep_rate), lambda: tf.identity(output)) 101 | # linear activation, using rnn inner loop last output 102 | logits = tf.matmul(output[-1], last_w) + last_b 103 | print "output[-1].shape = ", output[-1].get_shape() 104 | print "last_w.shape = ", last_w.get_shape() 105 | 106 | self.vars = tf.trainable_variables() 107 | self.normalize_vars = [v for v in tf.trainable_variables() if 'Householder' in v.name] 108 | 109 | self.validate_batch_size = params.batch_size * 4 110 | print "trainable_variables = ", [v.name for v in self.vars] 111 | print "normalize_variables = ", [v.name for v in self.normalize_vars] 112 | 113 | sys.stdout.flush() 114 | return logits 115 | 116 | """ 117 | @brief model training 118 | @param params parameters 119 | """ 120 | def train(self, params, train_x, train_y, test_x, test_y): 121 | if params.regression_flag: 122 | metric = "RMS" 123 | else: 124 | metric = "accuracy" 125 | #optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) 126 | 127 | normalize_op = [tf.assign(v,tf.nn.l2_normalize(tf.matrix_band_part(v,0,-1),1)) for v in self.normalize_vars] 128 | # Initialize the variables (i.e. assign their default value) 129 | init = tf.global_variables_initializer() 130 | 131 | # Start training 132 | if not params.load_model: 133 | self.session.run(init) 134 | else: 135 | uninitialized_vars = self.get_un_init_vars() 136 | if len(uninitialized_vars) > 0: 137 | print "Sth not right, these vars are not loaded: ", [x.name for x in uninitialized_vars] 138 | # only initialize if not train 139 | if not params.train_flag: 140 | print("model not trained") 141 | return None, None 142 | 143 | print "start trainging! 
" 144 | sys.stdout.flush() 145 | train_error = [] 146 | test_error = [] 147 | iterations = 0 148 | time_used = 0 149 | num_batches = math.ceil(len(train_x)/float(params.batch_size)) 150 | for epoch in range(self.init_epoch, params.num_epochs): 151 | # reduce learning rate by epoch 152 | learning_rate = params.initial_learning_rate*math.pow(params.lr_decay, int(epoch)) 153 | if epoch == self.init_epoch: 154 | train_error.append(self.validate(train_x, train_y, batch_size=self.validate_batch_size)) 155 | test_error.append(self.validate(test_x, test_y, batch_size=self.validate_batch_size)) 156 | print("Epoch %d, iterations = %d, time = %.6f, training %s = %.6f, testing %s = %.6f" % (self.init_epoch-1, iterations, time_used, metric, train_error[-1], metric, test_error[-1])) 157 | sys.stdout.flush() 158 | t0 = time.time() 159 | # permuate batches 160 | perm = np.random.permutation(len(train_x)) 161 | 162 | # run on batches 163 | batch_index = 0 164 | for batch_begin in range(0, len(train_x), params.batch_size): 165 | # get batch x and y 166 | batch_x = train_x[perm[batch_begin:min(batch_begin+params.batch_size, len(train_x))]] 167 | batch_y = train_y[perm[batch_begin:min(batch_begin+params.batch_size, len(train_x))]] 168 | feed_dict = {self.x: batch_x, 169 | self.y: batch_y, 170 | self.train_flag: True, 171 | self.learning_rate: learning_rate} 172 | # Run optimization op (backprop) 173 | self.session.run(self.train_op, feed_dict=feed_dict) 174 | if params.cell=='SpectralRNN': 175 | self.session.run(normalize_op) 176 | 177 | batch_index += 1 178 | iterations += 1 179 | 180 | # decay the display intervals for speedup 181 | if batch_index % (num_batches//params.display_epoch_num) == 0: 182 | time_used += time.time() - t0 183 | train_error.append(self.validate(train_x, train_y, batch_size=self.validate_batch_size)) 184 | test_error.append(self.validate(test_x, test_y, batch_size=self.validate_batch_size)) 185 | print("Epoch %.6f, iterations = %s, time = %.6f, training %s = %.6f, testing %s = %.6f, learning rate = %f" % 186 | ( self.init_epoch+float(iterations)/num_batches, '{:05}'.format(iterations), time_used, metric, train_error[-1], metric, test_error[-1], learning_rate)) 187 | sys.stdout.flush() 188 | t0 = time.time() 189 | # save model 190 | if params.model_dir and iterations%(5*num_batches)==0: 191 | if os.path.isdir(os.path.dirname(params.model_dir+'/'+params.dataset)) == False: 192 | os.makedirs(params.model_dir+'/'+params.dataset) 193 | print 'making dir: '+params.model_dir+'/'+params.dataset 194 | if params.cell=='SpectralRNN': 195 | self.save("%s/%s/%s.%s.%s.%s.%s" % (params.model_dir,params.dataset,params.cell,params.r_size,params.num_units,"init"+str(params.initial_learning_rate),"epoch"+str(epoch) )) 196 | else: 197 | self.save("%s/%s/%s.%s.%s.%s" % (params.model_dir,params.dataset,params.cell,params.num_units,"init"+str(params.initial_learning_rate),"epoch"+str(epoch) )) 198 | 199 | if np.isnan(train_error[-1]) or np.isinf(train_error[-1]) or np.isnan(test_error[-1]) or np.isinf(test_error[-1]): 200 | print("found nan or inf, stop training") 201 | break 202 | 203 | print("Optimization Finished!") 204 | 205 | return train_error, test_error 206 | 207 | """ 208 | @brief prediction 209 | @param params parameters 210 | """ 211 | def predict(self, x, batch_size=128): 212 | # Launch the graph 213 | pred = np.zeros((len(x), self.pred.get_shape().as_list()[1])) 214 | # run on batches 215 | for batch_begin in range(0, len(x), batch_size): 216 | # get batch x and y 217 | batch_x = 
x[batch_begin:min(batch_begin+batch_size, len(x))] 218 | # Run optimization op (backprop) 219 | pred[batch_begin:min(batch_begin+batch_size, len(x))] = self.session.run(self.pred, feed_dict={self.x: batch_x, 220 | self.train_flag: False}) 221 | return pred 222 | 223 | """ 224 | @brief validate prediction 225 | @params x feature 226 | @params y label 227 | @param batch_size batch size 228 | @return accuracy 229 | """ 230 | def validate(self, x, y, batch_size=128): 231 | # error 232 | cost = self.accuracy 233 | # relative error 234 | validate_cost = 0.0 235 | for batch_begin in range(0, len(x), batch_size): 236 | # get batch x and y 237 | batch_x = x[batch_begin:min(batch_begin+batch_size, len(x))] 238 | batch_y = y[batch_begin:min(batch_begin+batch_size, len(x))] 239 | feed_dict = {self.x: batch_x, 240 | self.y: batch_y, 241 | self.train_flag: False} 242 | # Calculate batch loss and accuracy 243 | validate_cost += self.session.run(cost, feed_dict=feed_dict)*len(batch_y) 244 | return validate_cost/len(x) 245 | 246 | """ 247 | @brief save model 248 | @param filename file name 249 | """ 250 | def save(self, filename): 251 | print "save model ", filename 252 | 253 | saver = tf.train.Saver() 254 | saver.save(self.session, filename) 255 | 256 | """ 257 | @brief load model 258 | @param filename model file name 259 | """ 260 | def load(self, filename): 261 | print "load model ", filename 262 | 263 | saver = tf.train.Saver() 264 | saver.restore(self.session, filename) 265 | 266 | graph = tf.get_default_graph() 267 | self.x = graph.get_tensor_by_name("Placeholder:0") 268 | self.y = graph.get_tensor_by_name("Placeholder_1:0") 269 | self.trainFlag = graph.get_tensor_by_name("train_flag:0") 270 | self.learningRate = graph.get_tensor_by_name("learning_rate:0") 271 | 272 | self.init_epoch = int(filename.split('epoch')[1])+1 273 | 274 | def get_un_init_vars(self): 275 | uninitialized_vars = [] 276 | for var in tf.all_variables(): 277 | try: 278 | self.session.run(var) 279 | except tf.errors.FailedPreconditionError: 280 | uninitialized_vars.append(var) 281 | return uninitialized_vars 282 | -------------------------------------------------------------------------------- /code/magma_svd_ops/svd_block_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #define EIGEN_USE_GPU 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "magma_v2.h" 7 | #include "magma_internal.h" 8 | #include "batched_kernel_param.h" 9 | #define THREAD_SIZE 512 10 | #define max_shared_bsiz 32 11 | 12 | #define RFT_MAG_GEM 13 | #define use_gemm_larft 14 | 15 | extern __shared__ float shared_data[]; 16 | 17 | __global__ void ZeroTriu(float* U, const int n_h, const int n_r) { 18 | int col = blockIdx.x; 19 | for(int row = threadIdx.x; row < col; row += blockDim.x){ 20 | U[col*n_h + row] = 0; 21 | } 22 | } 23 | 24 | __global__ void UpperTri(float* T, const int n_r, const int N) { 25 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 26 | if(idx < N and idx%n_r > idx/n_r ){ 27 | T[idx] = 0; 28 | } 29 | } 30 | 31 | __global__ void ConstSet(float* tau, const float a, const int N) { 32 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 33 | if(idx < N) tau[idx] = a; 34 | } 35 | 36 | __global__ void ConstDevide(float* tau, const float a, const int N) { 37 | int idx = blockIdx.x; 38 | if(idx < N) tau[idx] = a / tau[idx]; 39 | } 40 | 41 | __global__ void CalculateTau(float *tau, float* V, const int n_r, const int n_h, float init) { 42 | //int global_idx = blockIdx.x * blockDim.x + 
threadIdx.x; 43 | int col = blockIdx.x; 44 | __shared__ float sdata[THREAD_SIZE]; 45 | assert(blockDim.x == THREAD_SIZE); 46 | //=========================== 47 | // init tau to be zero 48 | // ========================== 49 | //if(threadIdx.x==0){tau[col] = init;} 50 | //=========================== 51 | // reduce col square 52 | //=========================== 53 | // compute local col square 54 | float temp = 0.0; 55 | for(int row=threadIdx.x; row < n_h; row += blockDim.x){ 56 | temp += V[ col*n_h + row] * V[col*n_h + row]; 57 | } 58 | sdata[threadIdx.x] = temp; 59 | __syncthreads(); 60 | // reduction within block (across all threads) 61 | int i = blockDim.x/2; 62 | while (i != 0){ 63 | if (threadIdx.x < i) 64 | sdata[threadIdx.x] += sdata[threadIdx.x + i]; 65 | __syncthreads(); 66 | i /= 2; 67 | } 68 | //========================= 69 | // compute tau 70 | // ======================= 71 | if(threadIdx.x == 0) 72 | tau[col] = 2.0 / sdata[0]; 73 | } 74 | 75 | 76 | __global__ void SetAddress(float** array, float* one_matrix) { 77 | int idx = blockIdx.x; 78 | array[idx] = one_matrix; 79 | } 80 | 81 | void printDeviceMatrix(const float* A, int col, int row, magma_queue_t queue){ 82 | float* hA; 83 | magma_smalloc_cpu(&hA, col*row); 84 | 85 | magma_sgetmatrix( row, col, A, row, hA, row, queue); // copy d_a -> r 86 | for(int i = 0; icuda_stream() >>> 160 | (m, n, tau_array, Trec_array, ldtrec, Ttri_array, ldttri); 161 | } 162 | 163 | /******************************************************************************/ 164 | extern "C" magma_int_t 165 | my_magma_slarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, 166 | float **v_array, magma_int_t ldv, 167 | float **tau_array, float **T_array, magma_int_t ldt, 168 | float **work_array, magma_int_t lwork, 169 | magma_int_t batchCount, magma_queue_t queue) 170 | { 171 | float c_one = MAGMA_S_ONE; 172 | float c_zero = MAGMA_S_ZERO; 173 | 174 | if ( k <= 0) return 0; 175 | if ( stair_T > 0 && k <= stair_T) return 0; 176 | 177 | magma_int_t maxnb = max_shared_bsiz; 178 | 179 | magma_int_t info = 0; 180 | if (stair_T > 0 && stair_T > maxnb) { 181 | info = -3; 182 | } 183 | else if (lwork < k*ldt) { 184 | info = -10; 185 | } 186 | if (info != 0) { 187 | magma_xerbla( __func__, -(info) ); 188 | return info; 189 | } 190 | 191 | magma_int_t DEBUG=0; 192 | magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T; 193 | 194 | magma_int_t i, j, prev_n, mycol, rows; 195 | 196 | float **dW1_displ = NULL; 197 | float **dW2_displ = NULL; 198 | float **dW3_displ = NULL; 199 | float **dTstep_array = NULL; 200 | 201 | magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); 202 | magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); 203 | magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); 204 | magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array)); 205 | 206 | //float *Tstep = k > nb ? work : T; 207 | if (k > nb) 208 | { 209 | magma_sdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue); 210 | } 211 | else 212 | { 213 | magma_sdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue); 214 | } 215 | 216 | //magma_int_t ldtstep = k > nb ? k : ldt; 217 | magma_int_t ldtstep = ldt; //a enlever 218 | // stair_T = 0 meaning all T 219 | // stair_T > 0 meaning the triangular portion of T has been computed. 
220 | // the value of stair_T is the nb of these triangulars 221 | 222 | 223 | //GEMV compute the whole triangular upper portion of T (phase 1) 224 | // TODO addcublas to check perf 225 | 226 | magma_sgemm_batched( MagmaConjTrans, MagmaNoTrans, 227 | k, k, n, 228 | c_one, v_array, ldv, 229 | v_array, ldv, 230 | c_zero, dTstep_array, ldtstep, 231 | batchCount, queue ); 232 | 233 | magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 234 | // no need for it as T is expected to be lower zero 235 | //if (k > nb) magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 236 | 237 | 238 | //TRMV 239 | //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k] 240 | // TRMV is split over block of column of size nb 241 | // the update should be done from top to bottom so: 242 | // 1- a gemm using the previous computed columns 243 | // of T to update rectangular upper protion above 244 | // the triangle of my columns 245 | // 2- the columns need to be updated by a serial 246 | // loop over of gemv over itself. since we limit the 247 | // shared memory to nb, this nb column 248 | // are split vertically by chunk of nb rows 249 | 250 | dim3 grid(1, 1, batchCount); 251 | 252 | for (j=0; j < k; j += nb) 253 | { 254 | prev_n = j; 255 | mycol = min(nb, k-j); 256 | // note that myrow = prev_n + mycol; 257 | if (prev_n > 0 && mycol > 0) { 258 | if (DEBUG == 3) { 259 | printf("doing gemm on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 260 | (long long) prev_n, (long long) mycol, (long long) 0, (long long) j ); 261 | } 262 | 263 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue); 264 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, 0, j, batchCount, queue); 265 | magma_sgemm_batched( MagmaNoTrans, MagmaNoTrans, 266 | prev_n, mycol, prev_n, 267 | c_one, T_array, ldt, 268 | dW1_displ, ldtstep, 269 | c_zero, dW2_displ, ldt, 270 | batchCount, queue ); 271 | 272 | // update my rectangular portion (prev_n,mycol) using sequence of gemv 273 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 274 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 275 | 276 | for (i=0; i < prev_n; i += nb) 277 | { 278 | rows = min(nb,prev_n-i); 279 | if (DEBUG == 3) { 280 | printf(" doing recstrmv on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 281 | (long long) rows, (long long) mycol, (long long) i, (long long) j ); 282 | } 283 | 284 | if (rows > 0 && mycol > 0) 285 | { 286 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue); 287 | my_magmablas_slarft_recstrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue); 288 | } 289 | } 290 | } 291 | 292 | // the upper rectangular protion is updated, now if needed update the triangular portion 293 | if (stair_T == 0) { 294 | if (DEBUG == 3) { 295 | printf("doing strmv on the triangular portion of size %lld %lld of T(%lld,%lld)\n", 296 | (long long) mycol, (long long) mycol, (long long) j, (long long) j ); 297 | } 298 | 299 | if (mycol > 0) 300 | { 301 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 302 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 303 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, j, j, batchCount, queue); 304 | magmablas_slarft_strmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, 
ldtstep, dW2_displ, ldt, batchCount, queue); 305 | } 306 | } 307 | }// end of j 308 | 309 | magma_free(dW1_displ); 310 | magma_free(dW2_displ); 311 | magma_free(dW3_displ); 312 | magma_free(dTstep_array); 313 | 314 | return 0; 315 | } 316 | 317 | struct workspace { 318 | magmaFloat_ptr *T_array; 319 | magmaFloat_ptr *Tau_array; 320 | magmaFloat_ptr *Twork_array; 321 | magmaFloat_ptr *V_array; 322 | magmaFloat_ptr T; 323 | magmaFloat_ptr tau; 324 | magmaFloat_ptr twork; 325 | magmaFloat_ptr dwork; 326 | magmaFloat_ptr dworkvt; 327 | }; 328 | 329 | int SvdBlockProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r, magma_queue_t queue, workspace ws, const bool isForward) { 330 | /* 331 | * Computes the Hprod(U, H) = Q * H or Q^T * H, according to isForward. 332 | * where Q = House(u_1)*House(u_2)*...*House(u_{n_r}) 333 | * 334 | * H : the hidden states, shape = (n_h, batch) 335 | * U : the elementary reflectors, lower triabgular, shape = (n_h ,n_r) 336 | * n_h : hidden dimension 337 | * batch : batch size 338 | * n_r : number of Householder reflectors 339 | * isForward: if doing transpose on Q 340 | * 341 | * According to MAGMA documents, the diagonal elements of U are supposed to be 1, but does not influence the results ?? 342 | * TODO : avoid the memcpy 343 | */ 344 | 345 | magma_int_t stat; 346 | int batchCount = 1; 347 | cudaDeviceSynchronize(); 348 | 349 | // printf("using magma queue at %p!\n", reinterpret_cast(queue)); 350 | 351 | // stat = magma_init(); 352 | // TODO: avoid creating the queue here 353 | // magma_queue_t queue; 354 | // magma_queue_create(0, &queue); 355 | // if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); return EXIT_FAILURE;} 356 | 357 | 358 | magmaFloat_ptr *T_array, *Tau_array, *Twork_array, *V_array; 359 | #if 0 360 | magma_malloc((void**)&T_array, batchCount * sizeof(*T_array)); 361 | magma_malloc((void**)&Tau_array, batchCount * sizeof(*Tau_array)); 362 | magma_malloc((void**)&Twork_array, batchCount * sizeof(*Twork_array)); 363 | magma_malloc((void**)&V_array, batchCount * sizeof(*V_array)); 364 | #else 365 | T_array = ws.T_array; 366 | Tau_array = ws.Tau_array; 367 | Twork_array = ws.Twork_array; 368 | V_array = ws.V_array; 369 | #endif 370 | #if 0 371 | // construct alpha and fill alpha with 2.0 (assume u_i are of unit norm) 372 | magmaFloat_ptr T, tau; 373 | /* 374 | if( magma_smalloc(&T, n_r*n_r) != MAGMA_SUCCESS){ 375 | printf("Error allocating T!\n"); 376 | return EXIT_FAILURE; 377 | } 378 | */ 379 | T = ws.T; 380 | /* 381 | if( magma_smalloc(&tau, n_r) != MAGMA_SUCCESS){ 382 | printf("Error allocating tau!\n"); 383 | return EXIT_FAILURE; 384 | } 385 | */ 386 | tau = ws.tau; 387 | 388 | // allocate workspace 389 | magmaFloat_ptr twork, dwork, dworkvt; 390 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 391 | /* 392 | if( magma_smalloc(&twork, n_r*n_r) != MAGMA_SUCCESS){ 393 | printf("Error allocating twork!\n"); 394 | return EXIT_FAILURE; 395 | } 396 | */ 397 | twork = ws.twork; 398 | /* 399 | if( magma_smalloc(&dwork, ldwork*n_r) != MAGMA_SUCCESS){ 400 | printf("Error allocating dwork!\n"); 401 | return EXIT_FAILURE; 402 | } 403 | */ 404 | dwork = ws.dwork; 405 | if( magma_smalloc(&dworkvt, ldworkvt*n_r) != MAGMA_SUCCESS){ 406 | printf("Error allocating dworkvt!\n"); 407 | return EXIT_FAILURE; 408 | } 409 | // dworkvt = ws.dworkvt; 410 | 411 | #else 412 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 413 | magmaFloat_ptr T = ws.T; 414 | magmaFloat_ptr tau = ws.tau; 415 | 
magmaFloat_ptr twork = ws.twork; 416 | magmaFloat_ptr dwork = ws.dwork; 417 | magmaFloat_ptr dworkvt = ws.dworkvt; 418 | /* 419 | magmaFloat_ptr dwork; 420 | if( magma_smalloc(&dwork, ldwork*n_r) != MAGMA_SUCCESS){ 421 | printf("Error allocating dwork!\n"); 422 | return EXIT_FAILURE; 423 | } 424 | */ 425 | 426 | #endif 427 | // copy H to H_out 428 | magmablas_slacpy(MagmaFull, n_h, batch, H, n_h, H_out, n_h, queue); 429 | // compute T = inv( striu(U'U) + 0.5 * diag(U'U)) 430 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, dwork, n_h, queue); 431 | 432 | // calculate tau[i] = 2.0/ dot(V[i],V[i]) 433 | CalculateTau<<< n_r, THREAD_SIZE>>>(tau, dwork, n_r, n_h, 0.0); // tau[i] = 2 / (u_i' u_i) 434 | //ConstSet<<< n_r, 1>>>(tau, 2.0, n_r); // tau = [u_i*u_i] 435 | ConstSet<<< n_r, n_r>>>(T, 0, n_r*n_r); // set T to zero 436 | 437 | SetAddress<<<batchCount, 1>>>(T_array, T); 438 | SetAddress<<<batchCount, 1>>>(Tau_array, tau); 439 | SetAddress<<<batchCount, 1>>>(V_array, dwork); 440 | SetAddress<<<batchCount, 1>>>(Twork_array, twork); 441 | 442 | stat = my_magma_slarft_batched( n_h, 443 | n_r, 444 | 0, // stair_T not sure what it does 445 | V_array, n_h, // 446 | Tau_array, // 447 | T_array, n_r, // 448 | Twork_array, n_r*n_r, 449 | 1, // batchCount 450 | queue); 451 | 452 | 453 | // compute H_out = Q * H or Q^T * H, according to isForward 454 | magma_trans_t isTrans = MagmaTrans; 455 | if(not isForward){ 456 | isTrans = MagmaNoTrans; 457 | } 458 | 459 | stat |= magma_slarfb_gpu_gemm( MagmaLeft, // side 460 | isTrans, // transpose 461 | MagmaForward, // Q = H(u_{n_r}) . . . H(u_2) H(u_1) (Backward) 462 | MagmaColumnwise,// elementary reflectors are stored columnwise 463 | n_h, // number of rows of H 464 | batch, // number of columns of H 465 | n_r, // number of Householder reflectors 466 | U, // U = (u_1, u_2,..., u_{n_r}) 467 | n_h, // The leading dimension of U 468 | T, // block Householder T 469 | n_r, // The leading dimension of T 470 | H_out, // H.shape = (n_h, batch) 471 | n_h, // leading dimension of H 472 | dwork, // workspace 473 | ldwork, // leading dimension of workspace 474 | dworkvt, // workspace 2 475 | ldworkvt, // leading dimension of workspace2 476 | queue 477 | ); 478 | 479 | // wait for all kernels in the queue 480 | magma_queue_sync(queue); 481 | cudaDeviceSynchronize(); 482 | 483 | // free memory 484 | #if 0 485 | magma_free(T_array); 486 | magma_free(Tau_array); 487 | magma_free(Twork_array); 488 | magma_free(V_array); 489 | 490 | magma_free(T); 491 | magma_free(tau); 492 | magma_free(twork); 493 | magma_free(dwork); 494 | magma_free(dworkvt); 495 | #endif 496 | assert(stat == MAGMA_SUCCESS); 497 | // magma_queue_destroy(queue); 498 | // magma_finalize(); 499 | 500 | return EXIT_SUCCESS; 501 | } 502 | 503 | -------------------------------------------------------------------------------- /code/magma_svd_ops/grad_svd_block_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #define EIGEN_USE_GPU 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "magma_v2.h" 7 | #include "magma_internal.h" 8 | #include "batched_kernel_param.h" 9 | #define THREAD_SIZE 512 10 | #define max_shared_bsiz 32 11 | 12 | #define RFT_MAG_GEM 13 | #define use_gemm_larft 14 | 15 | extern __shared__ float shared_data[]; 16 | 17 | __global__ void ZeroTriu(float* U, const int n_h, const int n_r) { 18 | int col = blockIdx.x; 19 | for(int row = threadIdx.x; row < col; row += blockDim.x){ 20 | U[col*n_h + row] = 0; 21 | } 22 | } 23 | 24 | __global__ void UpperTri(float* T, const int n_r, const int N) { 25 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 26 | 
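    // T is column-major with leading dimension n_r, so row = idx % n_r and col = idx / n_r;
    // zero the strictly lower entries so that only the upper triangle of T survives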
if(idx < N and idx%n_r > idx/n_r ){ 27 | T[idx] = 0; 28 | } 29 | } 30 | 31 | __global__ void ConstSet(float* tau, const float a, const int N) { 32 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 33 | if(idx < N) tau[idx] = a; 34 | } 35 | 36 | __global__ void ConstDevide(float* tau, const float a, const int N) { 37 | int idx = blockIdx.x; 38 | if(idx < N) tau[idx] = a / tau[idx]; 39 | } 40 | 41 | __global__ void CalculateTau(float *tau, float* V, const int n_r, const int n_h, float init) { 42 | //int global_idx = blockIdx.x * blockDim.x + threadIdx.x; 43 | int col = blockIdx.x; 44 | __shared__ float sdata[THREAD_SIZE]; 45 | assert(blockDim.x == THREAD_SIZE); 46 | //=========================== 47 | // init tau to be zero 48 | // ========================== 49 | //if(threadIdx.x==0){tau[col] = init;} 50 | //=========================== 51 | // reduce col square 52 | //=========================== 53 | // compute local col square 54 | float temp = 0.0; 55 | for(int row=threadIdx.x; row < n_h; row += blockDim.x){ 56 | temp += V[ col*n_h + row] * V[col*n_h + row]; 57 | } 58 | sdata[threadIdx.x] = temp; 59 | __syncthreads(); 60 | // reduction within block (across all threads) 61 | int i = blockDim.x/2; 62 | while (i != 0){ 63 | if (threadIdx.x < i) 64 | sdata[threadIdx.x] += sdata[threadIdx.x + i]; 65 | __syncthreads(); 66 | i /= 2; 67 | } 68 | //========================= 69 | // compute tau 70 | // ======================= 71 | if(threadIdx.x == 0) 72 | tau[col] = 2.0 / sdata[0]; 73 | } 74 | 75 | 76 | __global__ void SetAddress(float** array, float* one_matrix) { 77 | int idx = blockIdx.x; 78 | array[idx] = one_matrix; 79 | } 80 | 81 | void printDeviceMatrix(const float* A, int col, int row, magma_queue_t queue){ 82 | float* hA; 83 | magma_smalloc_cpu(&hA, col*row); 84 | 85 | magma_sgetmatrix( row, col, A, row, hA, row, queue); // copy d_a -> r 86 | for(int i = 0; icuda_stream() >>> 160 | (m, n, tau_array, Trec_array, ldtrec, Ttri_array, ldttri); 161 | } 162 | 163 | /******************************************************************************/ 164 | extern "C" magma_int_t 165 | my_magma_slarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, 166 | float **v_array, magma_int_t ldv, 167 | float **tau_array, float **T_array, magma_int_t ldt, 168 | float **work_array, magma_int_t lwork, 169 | magma_int_t batchCount, magma_queue_t queue) 170 | { 171 | float c_one = MAGMA_S_ONE; 172 | float c_zero = MAGMA_S_ZERO; 173 | 174 | if ( k <= 0) return 0; 175 | if ( stair_T > 0 && k <= stair_T) return 0; 176 | 177 | magma_int_t maxnb = max_shared_bsiz; 178 | 179 | magma_int_t info = 0; 180 | if (stair_T > 0 && stair_T > maxnb) { 181 | info = -3; 182 | } 183 | else if (lwork < k*ldt) { 184 | info = -10; 185 | } 186 | if (info != 0) { 187 | magma_xerbla( __func__, -(info) ); 188 | return info; 189 | } 190 | 191 | magma_int_t DEBUG=0; 192 | magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T; 193 | 194 | magma_int_t i, j, prev_n, mycol, rows; 195 | 196 | float **dW1_displ = NULL; 197 | float **dW2_displ = NULL; 198 | float **dW3_displ = NULL; 199 | float **dTstep_array = NULL; 200 | 201 | magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); 202 | magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); 203 | magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); 204 | magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array)); 205 | 206 | //float *Tstep = k > nb ? 
work : T; 207 | if (k > nb) 208 | { 209 | magma_sdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue); 210 | } 211 | else 212 | { 213 | magma_sdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue); 214 | } 215 | 216 | //magma_int_t ldtstep = k > nb ? k : ldt; 217 | magma_int_t ldtstep = ldt; // to be removed 218 | // stair_T = 0 meaning all T 219 | // stair_T > 0 meaning the triangular portion of T has been computed. 220 | // the value of stair_T is the nb of these triangulars 221 | 222 | 223 | //GEMV compute the whole triangular upper portion of T (phase 1) 224 | // TODO: add cublas to check perf 225 | 226 | magma_sgemm_batched( MagmaConjTrans, MagmaNoTrans, 227 | k, k, n, 228 | c_one, v_array, ldv, 229 | v_array, ldv, 230 | c_zero, dTstep_array, ldtstep, 231 | batchCount, queue ); 232 | 233 | magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 234 | // no need for it as T is expected to be lower zero 235 | //if (k > nb) magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 236 | 237 | 238 | //TRMV 239 | //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k] 240 | // TRMV is split over blocks of columns of size nb 241 | // the update should be done from top to bottom so: 242 | // 1- a gemm using the previously computed columns 243 | // of T to update the rectangular upper portion above 244 | // the triangle of my columns 245 | // 2- the columns need to be updated by a serial 246 | // loop of gemv over itself. since we limit the 247 | // shared memory to nb, these nb columns 248 | // are split vertically in chunks of nb rows 249 | 250 | dim3 grid(1, 1, batchCount); 251 | 252 | for (j=0; j < k; j += nb) 253 | { 254 | prev_n = j; 255 | mycol = min(nb, k-j); 256 | // note that myrow = prev_n + mycol; 257 | if (prev_n > 0 && mycol > 0) { 258 | if (DEBUG == 3) { 259 | printf("doing gemm on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 260 | (long long) prev_n, (long long) mycol, (long long) 0, (long long) j ); 261 | } 262 | 263 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue); 264 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, 0, j, batchCount, queue); 265 | magma_sgemm_batched( MagmaNoTrans, MagmaNoTrans, 266 | prev_n, mycol, prev_n, 267 | c_one, T_array, ldt, 268 | dW1_displ, ldtstep, 269 | c_zero, dW2_displ, ldt, 270 | batchCount, queue ); 271 | 272 | // update my rectangular portion (prev_n,mycol) using a sequence of gemv 273 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 274 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 275 | 276 | for (i=0; i < prev_n; i += nb) 277 | { 278 | rows = min(nb,prev_n-i); 279 | if (DEBUG == 3) { 280 | printf(" doing recstrmv on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 281 | (long long) rows, (long long) mycol, (long long) i, (long long) j ); 282 | } 283 | 284 | if (rows > 0 && mycol > 0) 285 | { 286 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue); 287 | my_magmablas_slarft_recstrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue); 288 | } 289 | } 290 | } 291 | 292 | // the upper rectangular portion is updated, now if needed update the triangular portion 293 | if (stair_T == 0) { 294 | if (DEBUG == 3) { 295 | printf("doing strmv on the triangular portion of size %lld %lld of T(%lld,%lld)\n", 296 | 
(long long) mycol, (long long) mycol, (long long) j, (long long) j ); 297 | } 298 | 299 | if (mycol > 0) 300 | { 301 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 302 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 303 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, j, j, batchCount, queue); 304 | magmablas_slarft_strmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue); 305 | } 306 | } 307 | }// end of j 308 | 309 | magma_free(dW1_displ); 310 | magma_free(dW2_displ); 311 | magma_free(dW3_displ); 312 | magma_free(dTstep_array); 313 | 314 | return 0; 315 | } 316 | 317 | struct grad_workspace { 318 | magmaFloat_ptr *T_array; 319 | magmaFloat_ptr *Tau_array; 320 | magmaFloat_ptr *Twork_array; 321 | magmaFloat_ptr *V_array; 322 | magmaFloat_ptr T; 323 | magmaFloat_ptr tau; 324 | magmaFloat_ptr twork; 325 | magmaFloat_ptr dwork; 326 | magmaFloat_ptr dworkvt; 327 | magmaFloat_ptr Q_grad; 328 | magmaFloat_ptr UT; 329 | }; 330 | 331 | int GradSvdBlockProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* U_grad, const int n_h, const int batch, const int n_r, magma_queue_t queue, grad_workspace ws, const bool isForward) { 332 | magma_int_t stat; 333 | int batchCount = 1; 334 | // stat = magma_init(); 335 | // magma_queue_t queue; 336 | // magma_queue_create(0, &queue); 337 | // if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); return EXIT_FAILURE;} 338 | 339 | // wait for all kernels in the queue 340 | // magma_queue_sync(queue); 341 | cudaDeviceSynchronize(); 342 | 343 | magmaFloat_ptr *T_array, *Tau_array, *Twork_array, *V_array; 344 | #if 0 345 | magma_malloc((void**)&T_array, batchCount * sizeof(*T_array)); 346 | magma_malloc((void**)&Tau_array, batchCount * sizeof(*Tau_array)); 347 | magma_malloc((void**)&Twork_array, batchCount * sizeof(*Twork_array)); 348 | magma_malloc((void**)&V_array, batchCount * sizeof(*V_array)); 349 | #else 350 | T_array = ws.T_array; 351 | Tau_array = ws.Tau_array; 352 | Twork_array = ws.Twork_array; 353 | V_array = ws.V_array; 354 | #endif 355 | // construct alpha and fill alpha with 2.0 (assume u_i are of unit norm) 356 | 357 | #if 0 358 | magmaFloat_ptr T, tau; 359 | if( magma_smalloc(&T, n_r*n_r) != MAGMA_SUCCESS){ 360 | printf("Error allocating T!\n"); 361 | return EXIT_FAILURE; 362 | } 363 | if( magma_smalloc(&tau, n_r) != MAGMA_SUCCESS){ 364 | printf("Error allocating tau!\n"); 365 | return EXIT_FAILURE; 366 | } 367 | // allocate workspace 368 | magmaFloat_ptr twork, dwork, dworkvt; 369 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 370 | if( magma_smalloc(&twork, n_r*n_r) != MAGMA_SUCCESS){ 371 | printf("Error allocating twork!\n"); 372 | return EXIT_FAILURE; 373 | } 374 | if( magma_smalloc(&dwork, ldwork*n_r) != MAGMA_SUCCESS){ 375 | printf("Error allocating dwork!\n"); 376 | return EXIT_FAILURE; 377 | } 378 | if( magma_smalloc(&dworkvt, ldworkvt*n_r) != MAGMA_SUCCESS){ 379 | printf("Error allocating dworkvt!\n"); 380 | return EXIT_FAILURE; 381 | } 382 | // calculating U_grad 383 | magmaFloat_ptr Q_grad, UT; 384 | if( magma_smalloc(&Q_grad, n_h*n_h) != MAGMA_SUCCESS){ 385 | printf("Error allocating Q_grad!\n"); 386 | return EXIT_FAILURE; 387 | } 388 | if( magma_smalloc(&UT, n_h*n_r) != MAGMA_SUCCESS){ 389 | printf("Error allocating UT!\n"); 390 | return EXIT_FAILURE; 391 | } 392 | 393 | #else 394 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 395 | magmaFloat_ptr T = ws.T; 396 | 
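    /*
     * Outline of the backward pass below, reusing the compact WY form
     * Q = I - U * T * U' that the forward op builds with larft:
     *   1. H_grad = op(Q) * G              (larfb, opposite transpose of the forward op)
     *   2. Q_grad = G * H'
     *   3. UT     = U * op(T)              (trmm with tTrans1)
     *   4. U_grad = - Q_grad' * UT
     *   5. UT     = U * op(T)              (trmm with tTrans2)
     *   6. twork  = - UT' * U_grad, then made symmetric from its useTri half
     *   7. U_grad = - Q_grad * UT + U_grad, then U_grad += U * twork
     *   8. ZeroTriu zeroes the strictly upper triangular part of U_grad
     */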
magmaFloat_ptr tau = ws.tau; 397 | magmaFloat_ptr twork = ws.twork; 398 | magmaFloat_ptr dwork = ws.dwork; 399 | magmaFloat_ptr dworkvt = ws.dworkvt; 400 | magmaFloat_ptr Q_grad = ws.Q_grad; 401 | magmaFloat_ptr UT = ws.UT; 402 | #endif 403 | 404 | // copy G to H_grad 405 | magmablas_slacpy(MagmaFull, n_h, batch, G, n_h, H_grad, n_h, queue); 406 | // compute T = inv( striu(U'U) + 0.5 * diag(U'U)) 407 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, dwork, n_h, queue); 408 | // calculate tau[i] = 2.0/ dot(V[i],V[i]) 409 | CalculateTau<<< n_r, THREAD_SIZE>>>(tau, dwork, n_r, n_h, 0.0); // tau[i] = 2 / (u_i' u_i) 410 | //ConstSet<<< n_r, 1>>>(tau, 2.0, n_r); // tau = [u_i*u_i] 411 | ConstSet<<< n_r, n_r>>>(T, 0, n_r*n_r); // set T to zero 412 | 413 | SetAddress<<<batchCount, 1>>>(T_array, T); 414 | SetAddress<<<batchCount, 1>>>(Tau_array, tau); 415 | SetAddress<<<batchCount, 1>>>(V_array, dwork); 416 | SetAddress<<<batchCount, 1>>>(Twork_array, twork); 417 | 418 | stat = my_magma_slarft_batched( n_h, 419 | n_r, 420 | 0, // stair_T not sure what it does 421 | V_array, n_h, // 422 | Tau_array, // 423 | T_array, n_r, // 424 | Twork_array, n_r*n_r, 425 | 1, // batchCount 426 | queue); 427 | 428 | // compute H_grad = Q * G or Q^T * G (opposite transpose of the forward op) 429 | magma_trans_t isTrans = MagmaNoTrans, tTrans1 = MagmaTrans, tTrans2 = MagmaNoTrans; 430 | magma_uplo_t useTri = MagmaUpper; 431 | if(not isForward){ // opposite of Hprod 432 | isTrans = MagmaTrans; 433 | useTri = MagmaLower; 434 | tTrans1 = MagmaNoTrans; 435 | tTrans2 = MagmaTrans; 436 | } 437 | 438 | stat |= magma_slarfb_gpu_gemm( MagmaLeft, // side 439 | isTrans, // transpose 440 | MagmaForward, // Q = H(u_{n_r}) . . . H(u_2) H(u_1) (Backward) 441 | MagmaColumnwise,// elementary reflectors are stored columnwise 442 | n_h, // number of rows of H 443 | batch, // number of columns of H 444 | n_r, // number of Householder reflectors 445 | U, // U = (u_1, u_2,..., u_{n_r}) 446 | n_h, // The leading dimension of U 447 | T, // block Householder T 448 | n_r, // The leading dimension of T 449 | H_grad, // H_grad.shape = (n_h, batch) 450 | n_h, // leading dimension of H 451 | dwork, // workspace 452 | ldwork, // leading dimension of workspace 453 | dworkvt, // workspace 2 454 | ldworkvt, // leading dimension of workspace2 455 | queue 456 | ); 457 | 458 | 459 | 460 | // Q_grad = G * H^T 461 | magma_sgemm ( MagmaNoTrans, 462 | MagmaTrans, 463 | n_h, 464 | n_h, 465 | batch, 466 | 1.0, 467 | G, 468 | n_h, 469 | H, 470 | n_h, 471 | 0.0, 472 | Q_grad, 473 | n_h, 474 | queue 475 | ); 476 | // UT = U * T; where T is upper triangular matrix 477 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, UT, n_h, queue); 478 | magma_strmm ( MagmaRight, 479 | MagmaUpper, 480 | tTrans1, 481 | MagmaNonUnit, 482 | n_h, 483 | n_r, 484 | 1.0, 485 | T, 486 | n_r, 487 | UT, 488 | n_h, 489 | queue 490 | ); 491 | // U_grad = - Q_grad^T * UT + 0*U_grad 492 | magma_sgemm ( MagmaTrans, 493 | MagmaNoTrans, 494 | n_h, 495 | n_r, 496 | n_h, 497 | -1.0, 498 | Q_grad, 499 | n_h, 500 | UT, 501 | n_h, 502 | 0.0, 503 | U_grad, 504 | n_h, 505 | queue 506 | ); 507 | // UT = U * T^T; where T is upper triangular matrix 508 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, UT, n_h, queue); 509 | magma_strmm ( MagmaRight, 510 | MagmaUpper, 511 | tTrans2, 512 | MagmaNonUnit, 513 | n_h, 514 | n_r, 515 | 1.0, 516 | T, 517 | n_r, 518 | UT, 519 | n_h, 520 | queue 521 | ); 522 | // twork = T * U^T * Q_grad^T * U * T = - T * U^T * U_grad = - UT^T * U_grad 523 | magma_sgemm ( MagmaTrans, 524 | MagmaNoTrans, 525 | n_r, 526 | n_r, 527 | n_h, 528 | -1.0, 529 | UT, 530 | n_h, 531 | U_grad, 532 | n_h, 533 | 0.0, 534 | 
twork, 535 | n_r, 536 | queue 537 | ); 538 | // make twork (M) symmetric by mirroring its useTri triangle onto the other half 539 | magmablas_ssymmetrize ( useTri, 540 | n_r, 541 | twork, 542 | n_r, 543 | queue 544 | ); 545 | // U_grad = - Q_grad * U * T^T + U_grad = -Q_grad * UT + U_grad 546 | magma_sgemm ( MagmaNoTrans, 547 | MagmaNoTrans, 548 | n_h, 549 | n_r, 550 | n_h, 551 | -1.0, 552 | Q_grad, 553 | n_h, 554 | UT, 555 | n_h, 556 | 1.0, 557 | U_grad, 558 | n_h, 559 | queue 560 | ); 561 | // U_grad = U * twork + U_grad 562 | magma_sgemm ( MagmaNoTrans, 563 | MagmaNoTrans, 564 | n_h, 565 | n_r, 566 | n_r, 567 | 1.0, 568 | U, 569 | n_h, 570 | twork, 571 | n_r, 572 | 1.0, 573 | U_grad, 574 | n_h, 575 | queue 576 | ); 577 | 578 | // zero out the strictly upper triangular part of U_grad 579 | ZeroTriu<<<n_r, THREAD_SIZE>>>(U_grad, n_h, n_r); 580 | 581 | // wait for all kernels in the queue 582 | magma_queue_sync(queue); 583 | cudaDeviceSynchronize(); 584 | 585 | #if 0 586 | // free memory 587 | magma_free(T_array); 588 | magma_free(Tau_array); 589 | magma_free(Twork_array); 590 | magma_free(V_array); 591 | 592 | magma_free(Q_grad); 593 | magma_free(UT); 594 | magma_free(T); 595 | magma_free(tau); 596 | magma_free(twork); 597 | magma_free(dwork); 598 | magma_free(dworkvt); 599 | #endif 600 | assert(stat == MAGMA_SUCCESS); 601 | // magma_queue_destroy(queue); 602 | // magma_finalize(); 603 | return EXIT_SUCCESS; 604 | } 605 | --------------------------------------------------------------------------------
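For sanity-checking the fused MAGMA path on small inputs (in the spirit of magma_svd_ops/gpu_unit_test.py), the block product can be compared against a plain CPU routine that applies one Householder reflector at a time. The sketch below is illustrative only and is not part of the repository: the function name hprod_reference is made up here, the column-major layout and the zero strictly-upper-triangular part of U mirror the conventions read off SvdBlockProdGpuKernelLauncher, and tau_i = 2 / (u_i' u_i) matches CalculateTau.

// hprod_reference.cc -- hypothetical CPU cross-check, not part of the repo.
// Computes H_out = Q^T * H when isForward is true, or Q * H otherwise, where
// Q = House(u_1) * House(u_2) * ... * House(u_{n_r}) and House(u) = I - (2 / u'u) * u * u'.
// H is n_h x batch and U is n_h x n_r, both column-major; column j of U is reflector u_j,
// with its strictly upper triangular entries assumed to be zero.
#include <cstddef>
#include <vector>
#include <algorithm>

void hprod_reference(const float* H, const float* U, float* H_out,
                     int n_h, int batch, int n_r, bool isForward) {
    std::vector<float> work(H, H + static_cast<std::size_t>(n_h) * batch);
    for (int step = 0; step < n_r; ++step) {
        // Q^T * H applies u_1, ..., u_{n_r} in order; Q * H applies them in reverse.
        const int j = isForward ? step : (n_r - 1 - step);
        const float* u = U + static_cast<std::size_t>(j) * n_h;
        float unorm2 = 0.f;
        for (int i = 0; i < n_h; ++i) unorm2 += u[i] * u[i];
        const float tau = 2.f / unorm2;                       // same scaling as CalculateTau
        for (int b = 0; b < batch; ++b) {
            float* h = work.data() + static_cast<std::size_t>(b) * n_h;
            float dot = 0.f;
            for (int i = 0; i < n_h; ++i) dot += u[i] * h[i];
            for (int i = 0; i < n_h; ++i) h[i] -= tau * dot * u[i];   // h -= tau * (u'h) * u
        }
    }
    std::copy(work.begin(), work.end(), H_out);
}

Comparing this routine with the GPU output elementwise (up to a few multiples of float epsilon scaled by n_r) after any change to the workspace or queue handling gives a quick regression check that is independent of larft/larfb.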