├── README.md ├── code ├── README.md ├── test.json ├── cuda_svd_ops │ ├── svd_inv_prod_gpu.cu.cc │ ├── svd_prod_gpu.cu.cc │ ├── Makefile │ ├── svd_prod_gpu.cc │ ├── svd_inv_prod_gpu.cc │ ├── gpu_unit_inv_test.py │ ├── gpu_unit_test.py │ ├── grad_svd_prod_gpu.cc │ ├── grad_svd_inv_prod_gpu.cc │ ├── grad_svd_prod_gpu.cu.cc │ └── grad_svd_inv_prod_gpu.cu.cc ├── magma_svd_ops │ ├── Makefile │ ├── svd_block_prod_gpu.cc │ ├── gpu_unit_test.py │ ├── grad_svd_block_prod_gpu.cc │ ├── svd_block_prod_gpu.cu.cc │ └── grad_svd_block_prod_gpu.cu.cc ├── main.py ├── load.py ├── Params.py ├── svd_ops.py ├── spectral_rnn.py └── rnn.py └── data └── Adding_task └── generate_data.py /README.md: -------------------------------------------------------------------------------- 1 | # Spectral-RNN 2 | Implementation of Spectral-RNN (Stabilizing Gradients for Deep Neural Networks via Efficient SVD Parameterization) 3 | 4 | by Jiong Zhang, Qi Lei, Inderjit S. Dhillon 5 | 6 | 7 | -------------------------------------------------------------------------------- /code/README.md: -------------------------------------------------------------------------------- 1 | #BLAS2 operator: 2 | 3 | Requires: 4 | CUDA, cuBLAS, cudnn, tensorflow-gpu 5 | 6 | Compile: 7 | cd ./cuda_svd_ops 8 | make 9 | 10 | #BLAS3 operator: 11 | 12 | Requires: 13 | CUDA, cuBLAS, cudnn, MAGMA, tensorflow-gpu 14 | 15 | Compile: 16 | cd ./magma_svd_ops 17 | make 18 | 19 | #Running: 20 | 21 | python main.py test.json 22 | 23 | 24 | -------------------------------------------------------------------------------- /code/test.json: -------------------------------------------------------------------------------- 1 | { 2 | "cell" : "LSTM", 3 | "initial_learning_rate" : 0.001, 4 | "lr_decay" : 0.99, 5 | "num_epochs" : 100, 6 | "dropout_keep_rate" : 1.0, 7 | "num_units" : 128, 8 | "r_size" : 16, 9 | "r_margin": 0.01, 10 | "gpu_flag" : 1, 11 | "batch_size" : 128, 12 | "random_seed" : 1000, 13 | "dataset" : "add", 14 | "time_steps" : 100, 15 | "model_dir" : "results/model", 16 | "pred_dir" : "results/pred", 17 | "load_model" : 0, 18 | "train_flag" : 1, 19 | "batch_norm" : 0, 20 | "display_epoch_num" : 10 21 | } 22 | -------------------------------------------------------------------------------- /data/Adding_task/generate_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, os 3 | 4 | cwd = os.getcwd() 5 | 6 | 7 | N = 101000 8 | L = 100 9 | 10 | data_file = cwd + '/data'+str(L) 11 | 12 | sigma = 1; mu = 0 13 | 14 | np.random.seed(0) 15 | 16 | dataF = np.random.rand(N,L) 17 | dataI = np.zeros((N,L)) 18 | dataY = np.zeros((N,)) 19 | print dataY.shape 20 | 21 | IdcLow = np.random.randint(0,L/2, size=N) 22 | IdcHigh = np.random.randint(L/2,L, size=N) 23 | for i in range(N): 24 | dataI[i,IdcLow[i]]=1.0 25 | dataI[i,IdcHigh[i]]=1.0 26 | dataY[i] = dataF[i,IdcLow[i]] + dataF[i,IdcHigh[i]] 27 | 28 | data = np.zeros((N,2*L+1)) 29 | data[:,0] = dataY 30 | data[:, 1::2] = dataF 31 | data[:, 2::2] = dataI 32 | 33 | try: 34 | os.remove(data_file) 35 | except OSError: 36 | pass 37 | 38 | np.savetxt(data_file, data, fmt='%.5f', delimiter = ',') 39 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_inv_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | #define EIGEN_USE_GPU 3 | #include 4 | #include 5 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 6 | 7 | 8 | int inline 
Hprod(cublasHandle_t handle, float* H, const float* u, float* alpha, const int k, const int n_h, const int batch) { 9 | 10 | cublasStatus_t stat; 11 | float aa = 0; 12 | float bb = 0; 13 | float cc = -1.0; 14 | // aa = 2.0 / u^T * u 15 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 16 | aa = 2.0 / aa; 17 | // make sure that leading (n_h-k) entrees of u are 0 18 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 19 | // compute alpha = aa * H^T * u 20 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 21 | &aa, H, n_h, 22 | u, 1, 23 | &bb, alpha, 1); 24 | // update H 25 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H, n_h); 26 | 27 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 28 | return EXIT_SUCCESS; 29 | } 30 | 31 | // host function for CUDA kernels 32 | int SvdInvProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r) { 33 | cublasStatus_t stat; 34 | cudaError_t cudaStat; 35 | cublasHandle_t handle; 36 | // creat handle 37 | stat = cublasCreate_v2(&handle); 38 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on SvdInvProd\n"); return EXIT_FAILURE; } 39 | // allocate alpha 40 | float* alpha; 41 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 42 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 43 | // begin computation 44 | stat = cublasScopy(handle, n_h * batch , H, 1, H_out, 1); // fill H_out with H 45 | 46 | for(int r=n_r-1; r >= 0; r--) { 47 | Hprod(handle, H_out, U + n_h*r, alpha, n_h - r, n_h, batch); 48 | } 49 | cudaFree(alpha); 50 | cublasDestroy(handle); 51 | return EXIT_SUCCESS; 52 | } 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | #define EIGEN_USE_GPU 3 | #include 4 | #include 5 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 6 | 7 | 8 | // CUDA kernel TODO 9 | __global__ void SvdProdGpuKernel(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r) { 10 | //for (int i = 1; i < N; i++) t_out(i) = 0; 11 | //T_out(0) = T_in(0); 12 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n_h*batch; i += blockDim.x * gridDim.x) { 13 | H_out[i] = 2.0 ; 14 | } 15 | } 16 | 17 | 18 | int inline Hprod(cublasHandle_t handle, float* H, const float* u, float* alpha, const int k, const int n_h, const int batch) { 19 | 20 | cublasStatus_t stat; 21 | float aa = 0; 22 | float bb = 0; 23 | float cc = -1.0; 24 | // aa = 2.0 / u^T * u 25 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 26 | aa = 2.0 / aa; 27 | // make sure that leading (n_h-k) entrees of u are 0 28 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 29 | // compute alpha = aa * H^T * u 30 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 31 | &aa, H, n_h, 32 | u, 1, 33 | &bb, alpha, 1); 34 | // update H 35 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H, n_h); 36 | 37 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 38 | return EXIT_SUCCESS; 39 | } 40 | 41 | // host function for CUDA kernels 42 | int SvdProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r) { 43 | cublasStatus_t stat; 44 | cudaError_t cudaStat; 45 | cublasHandle_t handle; 46 | // creat handle 47 | 
stat = cublasCreate_v2(&handle); 48 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on SvdProd\n"); return EXIT_FAILURE; } 49 | // allocate alpha 50 | float* alpha; 51 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 52 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 53 | // begin computation 54 | stat = cublasScopy(handle, n_h * batch , H, 1, H_out, 1); // fill H_out with H 55 | 56 | for(int r=0; r < n_r; r++) { 57 | Hprod(handle, H_out, U + n_h*r, alpha, n_h - r, n_h, batch); 58 | } 59 | cudaFree(alpha); 60 | cublasDestroy(handle); 61 | return EXIT_SUCCESS; 62 | } 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /code/magma_svd_ops/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | OS := $(shell uname) 3 | 4 | ifeq ($(OS),Darwin) # Mac OS X 5 | OSFLAGS = -undefined dynamic_lookup 6 | CXX = clang++ 7 | else 8 | CXX = g++ 9 | OSFLAGS = 10 | endif 11 | 12 | TF_INC = `python -c "import tensorflow; print(tensorflow.sysconfig.get_include())"` 13 | TF_LIB = `python -c "import tensorflow; print(tensorflow.sysconfig.get_lib())"` 14 | 15 | # specify your magma dir and MKL dir here 16 | MAGMA =/work/03941/jiongdys/maverick/magma 17 | CUDADIR =/opt/apps/cuda/8.0/ 18 | MKL = /opt/apps/intel/15/composer_xe_2015.3.187/mkl/include 19 | MKLLIB = /opt/apps/intel/15/composer_xe_2015.3.187/mkl/lib/intel64 20 | 21 | CC = gcc -O2 -pthread 22 | GPUCC = nvcc 23 | CFLAGS = -O3 -std=c++11 -L$(TF_LIB) -I$(TF_INC) -I$(TF_INC)/external/nsync/public -DADD_ -ltensorflow_framework 24 | 25 | 26 | MAGMA_I = -I${MAGMA}/include -I${MAGMA}/testing -I${MAGMA}/control -I${MKL} 27 | MAG_FLAGS = -m64 -DNDEBUG -O3 -Wall -Wshadow -DMAGMA_NOAFFINITY -pedantic -Wno-long-long -DHAVE_CUBLAS -DMIN_CUDA_ARCH=300 -c ${MAGMA_I} -I${CUDADIR}/include 28 | CMAG_FLAGS= -m64 -fPIC ${MAGMA_I} -Wl,-rpath,${MAGMA}/lib -L${MAGMA}/lib -L${MKLLIB} -lstdc++ -lm #-framework Accelerate -lblas_fix 29 | 30 | GPUCFLAGS = -c -arch=sm_30 --expt-relaxed-constexpr ${MAGMA_I} 31 | 32 | LFLAGS = -pthread -shared -fPIC 33 | GPULFLAGS = -x cu -Xcompiler -fPIC 34 | GPUDEF = -DGOOGLE_CUDA=1 35 | CGPUFLAGS = -lcuda -lcublas -lmagma # -lmagmablas 36 | 37 | SRC = svd_block_prod_gpu.cc 38 | GPUSRC = svd_block_prod_gpu.cu.cc 39 | PROD = svd_block_prod_gpu.so 40 | GPUPROD = svd_block_prod_cu_gpu.o 41 | 42 | GRAD_SRC = grad_svd_block_prod_gpu.cc 43 | GRAD_GPUSRC = grad_svd_block_prod_gpu.cu.cc 44 | GRAD_PROD = grad_svd_block_prod_gpu.so 45 | GRAD_GPUPROD = grad_svd_block_prod_cu_gpu.o 46 | 47 | default: gpu gpu-grad 48 | 49 | gpu: 50 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GPUPROD) 51 | $(CXX) $(CFLAGS) ${CMAG_FLAGS} $(SRC) $(GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(PROD) 52 | 53 | gpu-grad: 54 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GRAD_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GRAD_GPUPROD) 55 | $(CXX) $(CFLAGS) ${CMAG_FLAGS} $(GRAD_SRC) $(GRAD_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(GRAD_PROD) 56 | 57 | clean: 58 | rm -f $(TEST_PROD) $(TEST_FINAL) $(PROD) $(GPUPROD) $(GRAD_PROD) $(GRAD_GPUPROD) 59 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile 2 | OS := $(shell uname) 3 | 4 | ifeq ($(OS),Darwin) # Mac OS X 5 | OSFLAGS = -undefined 
dynamic_lookup 6 | CXX = clang++ 7 | else 8 | CXX = g++ 9 | OSFLAGS = 10 | endif 11 | 12 | TF_INC = `python -c "import tensorflow; print(tensorflow.sysconfig.get_include())"` 13 | TF_LIB= `python -c 'import tensorflow as tf; print(tf.sysconfig.get_lib())'` 14 | 15 | CC = gcc -O2 -pthread 16 | GPUCC = nvcc 17 | CFLAGS = -std=c++11 -I$(TF_INC) -L$(TF_LIB) -I$(TF_INC)/external/nsync/public -ltensorflow_framework 18 | GPUCFLAGS = -c --expt-relaxed-constexpr 19 | LFLAGS = -pthread -shared -fPIC 20 | GPULFLAGS = -x cu -Xcompiler -fPIC 21 | GPUDEF = -DGOOGLE_CUDA=1 22 | CGPUFLAGS = -lcuda -lcublas 23 | 24 | SRC = svd_prod_gpu.cc 25 | GPUSRC = svd_prod_gpu.cu.cc 26 | PROD = svd_prod_gpu.so 27 | GPUPROD = svd_prod_cu_gpu.o 28 | 29 | GRAD_SRC = grad_svd_prod_gpu.cc 30 | GRAD_GPUSRC = grad_svd_prod_gpu.cu.cc 31 | GRAD_PROD = grad_svd_prod_gpu.so 32 | GRAD_GPUPROD = grad_svd_prod_cu_gpu.o 33 | 34 | INV_SRC = svd_inv_prod_gpu.cc 35 | INV_GPUSRC = svd_inv_prod_gpu.cu.cc 36 | INV_PROD = svd_inv_prod_gpu.so 37 | INV_GPUPROD = svd_inv_prod_cu_gpu.o 38 | 39 | INV_GRAD_SRC = grad_svd_inv_prod_gpu.cc 40 | INV_GRAD_GPUSRC = grad_svd_inv_prod_gpu.cu.cc 41 | INV_GRAD_PROD = grad_svd_inv_prod_gpu.so 42 | INV_GRAD_GPUPROD = grad_svd_inv_prod_cu_gpu.o 43 | 44 | default: gpu gpu-grad inv-gpu inv-gpu-grad 45 | 46 | cpu: 47 | $(CXX) $(CFLAGS) $(SRC) $(LFLAGS) $(OSFLAGS) -o $(PROD) 48 | 49 | gpu: 50 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GPUPROD) 51 | $(CXX) $(CFLAGS) $(SRC) $(GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(PROD) 52 | 53 | gpu-grad: 54 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(GRAD_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(GRAD_GPUPROD) 55 | $(CXX) $(CFLAGS) $(GRAD_SRC) $(GRAD_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(GRAD_PROD) 56 | 57 | inv-gpu: 58 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(INV_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(INV_GPUPROD) 59 | $(CXX) $(CFLAGS) $(INV_SRC) $(INV_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(INV_PROD) 60 | 61 | inv-gpu-grad: 62 | $(GPUCC) $(CFLAGS) $(GPUCFLAGS) $(INV_GRAD_GPUSRC) $(GPULFLAGS) $(GPUDEF) -o $(INV_GRAD_GPUPROD) 63 | $(CXX) $(CFLAGS) $(INV_GRAD_SRC) $(INV_GRAD_GPUPROD) $(LFLAGS) $(CGPUFLAGS) $(OSFLAGS) $(GPUDEF) -o $(INV_GRAD_PROD) 64 | 65 | clean: 66 | rm -f $(PROD) $(GPUPROD) $(GRAD_PROD) $(GRAD_GPUPROD) $(INV_PROD) $(INV_GPUPROD) $(INV_GRAD_PROD) $(INV_GRAD_GPUPROD) 67 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("SvdProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Output("output_state: float") 13 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 14 | c->set_output(0, c->input(0)); 15 | return Status::OK(); 16 | }); 17 | 18 | 19 | int SvdProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r); 20 | 21 | class SvdProdGpuOp : public OpKernel { 22 | public: 23 | explicit SvdProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 24 | 25 | void Compute(OpKernelContext* context) override { 26 | // 
Check number of inputs 27 | OP_REQUIRES(context, context->num_inputs() == 2, 28 | errors::InvalidArgument("SvdProd expects 2 inputes.")); 29 | 30 | // Grab the input tensor 31 | const Tensor& H = context->input(0); 32 | const Tensor& U = context->input(1); 33 | 34 | // Shapes of input 35 | const TensorShape& H_shape = H.shape(); 36 | const TensorShape& U_shape = U.shape(); 37 | 38 | const int n_h = H_shape.dim_size(1); 39 | const int n_r = U_shape.dim_size(0); 40 | const int batch = H_shape.dim_size(0); 41 | // Perform dimension check 42 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 43 | errors::InvalidArgument("SvdProd expects H to be a 2-D matrix.")); 44 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 45 | errors::InvalidArgument("SvdProd expects U to be a 2-D matrix.")); 46 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 47 | errors::InvalidArgument("The second dimension of H and U does not match!")); 48 | 49 | // Create an output tensor 50 | Tensor* H_out = NULL; 51 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&H_out)); 52 | 53 | // obtain data 54 | const float* H_data = H.flat().data(); 55 | const float* U_data = U.flat().data(); 56 | float* H_out_data = H_out->flat().data(); 57 | /* 58 | // test 59 | int idx =0; 60 | std::printf( "Before:\n"); 61 | for(int i=0; i < H_shape.dim_size(0); i++){ 62 | for(int j=0; j < H_shape.dim_size(1); j++){ 63 | idx = i * H_shape.dim_size(1) + j; 64 | std::printf ("H(%d,%d)=%4.4f, %4.4f\n", i, j, H.flat()(idx), H_out_data[idx]); 65 | } 66 | } 67 | */ 68 | #if GOOGLE_CUDA 69 | int op_status; 70 | op_status = SvdProdGpuKernelLauncher(H_data, U_data, H_out_data, n_h, batch, n_r ); 71 | #endif 72 | } 73 | }; 74 | 75 | REGISTER_KERNEL_BUILDER(Name("SvdProdGpu").Device(DEVICE_GPU), SvdProdGpuOp); 76 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/svd_inv_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("SvdInvProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Output("output_state: float") 13 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 14 | c->set_output(0, c->input(0)); 15 | return Status::OK(); 16 | }); 17 | 18 | 19 | int SvdInvProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r); 20 | 21 | class SvdInvProdGpuOp : public OpKernel { 22 | public: 23 | explicit SvdInvProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 24 | 25 | void Compute(OpKernelContext* context) override { 26 | // Check number of inputs 27 | OP_REQUIRES(context, context->num_inputs() == 2, 28 | errors::InvalidArgument("SvdInvProd expects 2 inputes.")); 29 | 30 | // Grab the input tensor 31 | const Tensor& H = context->input(0); 32 | const Tensor& U = context->input(1); 33 | 34 | // Shapes of input 35 | const TensorShape& H_shape = H.shape(); 36 | const TensorShape& U_shape = U.shape(); 37 | 38 | const int n_h = H_shape.dim_size(1); 39 | const int n_r = U_shape.dim_size(0); 40 | const int batch = H_shape.dim_size(0); 41 | // Perform dimension check 42 | 
OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 43 | errors::InvalidArgument("SvdInvProd expects H to be a 2-D matrix.")); 44 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 45 | errors::InvalidArgument("SvdInvProd expects U to be a 2-D matrix.")); 46 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 47 | errors::InvalidArgument("The second dimension of H and U does not match!")); 48 | 49 | // Create an output tensor 50 | Tensor* H_out = NULL; 51 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&H_out)); 52 | 53 | // obtain data 54 | const float* H_data = H.flat().data(); 55 | const float* U_data = U.flat().data(); 56 | float* H_out_data = H_out->flat().data(); 57 | /* 58 | // test 59 | int idx =0; 60 | std::printf( "Before:\n"); 61 | for(int i=0; i < H_shape.dim_size(0); i++){ 62 | for(int j=0; j < H_shape.dim_size(1); j++){ 63 | idx = i * H_shape.dim_size(1) + j; 64 | std::printf ("H(%d,%d)=%4.4f, %4.4f\n", i, j, H.flat()(idx), H_out_data[idx]); 65 | } 66 | } 67 | */ 68 | #if GOOGLE_CUDA 69 | int op_status; 70 | op_status = SvdInvProdGpuKernelLauncher(H_data, U_data, H_out_data, n_h, batch, n_r ); 71 | #endif 72 | } 73 | }; 74 | 75 | REGISTER_KERNEL_BUILDER(Name("SvdInvProdGpu").Device(DEVICE_GPU), SvdInvProdGpuOp); 76 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/gpu_unit_inv_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import sparse_ops 6 | 7 | def Hgrad(H, u, G, k): 8 | # H.shape = (batch, n_h) 9 | # u.shape = (n_h,) 10 | # G.shape = (batch, n_h) 11 | alpha = 2* np.dot(H[:, -k:], u[-k:]) # alpha.shape = (batch,) 12 | beta = 2* np.dot(G[:, -k:], u[-k:]) # beta.shape = (batch,) 13 | u_bar = np.zeros_like(u) 14 | u_bar[-k:] += -np.dot(alpha,G[:,-k:]) - np.dot(beta,H[:,-k:]) + np.dot(alpha,beta)*u[-k:] # sum of gradient within the batch: averaging needed??? 15 | G_out = G.copy() 16 | G_out[:,-k:] -= np.outer(beta,u[-k:]) 17 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 18 | 19 | n_h = 3 20 | n_b = 2 21 | n_r = 2 22 | 23 | 24 | 25 | rng = np.random.RandomState(13) 26 | H_ = rng.uniform(-np.sqrt(6. / (n_b + n_h)), np.sqrt(6. / (n_b + n_h)), (n_b, n_h)).astype(np.float32) 27 | 28 | U_full = rng.normal(0, 0.01, (n_h, n_r)).astype(np.float32) 29 | U_ = np.tril(U_full) 30 | norms_U_ = np.linalg.norm(U_, axis=0) 31 | U_ = np.transpose(1. / norms_U_ * U_) 32 | 33 | print H_ 34 | print U_ 35 | 36 | 37 | 38 | H1 = [H_]*(n_r+1) 39 | 40 | for i in range(n_r-1,-1,-1): 41 | alpha = np.dot(H1[i+1], U_[i]) 42 | H1[i] = H1[i+1] - 2 * np.outer(alpha, U_[i]) 43 | 44 | H2 = H1[0] 45 | print H2 46 | 47 | for i in range(n_b): 48 | print np.dot(H2[i],H2[i]) - np.dot(H_[i], H_[i]) 49 | 50 | G = np.ones_like(H_) 51 | Grad_U = np.ones_like(U_) 52 | 53 | for i in range(0,n_r): 54 | G, Grad_U[i] = Hgrad(H1[i+1], U_[i], G, n_h-i) 55 | 56 | print G 57 | print Grad_U 58 | ############################################################ 59 | ############################################################ 60 | 61 | grad_svd_inv_prod_module = tf.load_op_library('./grad_svd_inv_prod_gpu.so') 62 | 63 | @ops.RegisterGradient("SvdInvProdGpu") 64 | def _svd_inv_prod_gpu_grad(op, grad): 65 | """The gradients for `svd_inv_prod_gpu`. 
66 | 67 | Args: 68 | op: The `svd_prod_gpu` `Operation` that we are differentiating, which we can use 69 | to find the inputs and outputs of the original op. 70 | grad: Gradient with respect to the output of the op. 71 | 72 | Returns: 73 | Gradients with respect to the inputs. 74 | """ 75 | H = op.inputs[0] 76 | U = op.inputs[1] 77 | 78 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 79 | ############################################################ 80 | 81 | 82 | svd_inv_prod_module = tf.load_op_library('./svd_inv_prod_gpu.so') 83 | with tf.Session() as sess: 84 | 85 | H = tf.constant(H_, dtype=tf.float32) 86 | U = tf.constant(U_, dtype = tf.float32) 87 | 88 | U = tf.matrix_band_part(U, 0, -1) # upper triangular 89 | 90 | z = svd_inv_prod_module.svd_inv_prod_gpu(H,U) 91 | gr = tf.gradients(z, [H,U]) 92 | tf.global_variables_initializer().run() 93 | 94 | print('H,U and product: ',H.eval(), U.eval(),z.eval()) 95 | print('grad_H, grad_U: ' ,gr[0].eval(), gr[1].eval()) 96 | -------------------------------------------------------------------------------- /code/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ Script for running RNNs with fixed parameters. """ 3 | 4 | import os 5 | import sys 6 | import time 7 | import math 8 | import numpy as np 9 | import csv 10 | import Params 11 | import load 12 | import rnn 13 | 14 | 15 | def train(params): 16 | 17 | print('%s starting......' % params.cell) 18 | sys.stdout.flush() 19 | 20 | if params.dataset.startswith('mnist'): 21 | train_X, test_X, train_y, test_y = load.load_mnist(params) 22 | elif params.dataset.startswith('add'): 23 | train_X, test_X, train_y, test_y = load.adding_task(params) 24 | else: 25 | assert 0, "unknown dataset %s" % (params.dataset) 26 | 27 | print "parameters = ", params 28 | 29 | model = rnn.RNNModel(params) 30 | 31 | # load model 32 | if params.load_model: 33 | model.load("%s" % (params.load_model_dir)) 34 | 35 | # train model 36 | train_error, test_error = model.train(params, train_X, train_y, test_X, test_y) 37 | 38 | # save model 39 | if params.model_dir: 40 | if os.path.isdir(os.path.dirname(params.model_dir)) == False: 41 | os.makedirs(params.model_dir) 42 | model.save("%s.%s" % (params.model_dir, params.cell)) 43 | 44 | # predict 45 | train_pred = model.predict(train_X, params.batch_size) 46 | test_pred = model.predict(test_X, params.batch_size) 47 | 48 | # must close model when finish 49 | model.close() 50 | 51 | # write prediction to file 52 | if params.pred_dir: 53 | if os.path.isdir(os.path.dirname(params.pred_dir)) == False: 54 | os.makedirs(params.pred_dir) 55 | with open("%s.%s.%s.y" % (params.pred_dir, params.dataset, params.cell), "w") as f: 56 | content = "" 57 | for pred in [train_pred, test_pred]: 58 | for entry in pred: 59 | for index, value in enumerate(entry): 60 | if index: 61 | content += "," 62 | content += "%f" % (value) 63 | content += "\n" 64 | f.write(content) 65 | with open("%s.%s.%s.X" % (params.pred_dir, params.dataset, params.cell), "w") as f: 66 | content = "" 67 | for X in [train_X, test_X]: 68 | for entry in X: 69 | for index, value in enumerate(entry.ravel()): 70 | if index: 71 | content += "," 72 | content += "%f" % (value) 73 | content += "\n" 74 | f.write(content) 75 | 76 | return train_error, test_error 77 | 78 | if __name__=='__main__': 79 | if len(sys.argv) < 2: 80 | print("input parameters in json format in required") 81 | exit() 82 | paramsArray = [] 83 | for i in range(1, len(sys.argv)): 
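# each command-line argument is a JSON parameter file (see test.json); it is
# parsed into a Params object below, and every resulting configuration is trained in turn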
84 | params = Params.Params() 85 | params.load(sys.argv[i]) 86 | paramsArray.append(params) 87 | print("parameters[%d] = %s" % (len(paramsArray), paramsArray)) 88 | 89 | tt = time.time() 90 | for params in paramsArray: 91 | train(params) 92 | print("program takes %.3f seconds" % (time.time()-tt)) 93 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/gpu_unit_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import sparse_ops 6 | 7 | def Hgrad(H, u, G, k): 8 | # H.shape = (batch, n_h) 9 | # u.shape = (n_h,) 10 | # G.shape = (batch, n_h) 11 | alpha = 2* np.dot(H[:, -k:], u[-k:]) # alpha.shape = (batch,) 12 | beta = 2* np.dot(G[:, -k:], u[-k:]) # beta.shape = (batch,) 13 | u_bar = np.zeros_like(u) 14 | u_bar[-k:] += -np.dot(alpha,G[:,-k:]) - np.dot(beta,H[:,-k:]) + np.dot(alpha,beta)*u[-k:] # sum of gradient within the batch: averaging needed??? 15 | G_out = G.copy() 16 | G_out[:,-k:] -= np.outer(beta,u[-k:]) 17 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 18 | 19 | n_h = 3; n_b = 2; n_r = 2 20 | rng = np.random.RandomState(13) 21 | H_ = rng.uniform(-np.sqrt(6. / (n_b + n_h)), np.sqrt(6. / (n_b + n_h)), (n_b, n_h)).astype(np.float32) 22 | 23 | U_full = rng.normal(0, 0.01, (n_h, n_r)).astype(np.float32) 24 | U_ = np.tril(U_full) 25 | norms_U_ = np.linalg.norm(U_, axis=0) 26 | U_ = np.transpose(1. / norms_U_ * U_) 27 | 28 | print H_ 29 | print U_ 30 | 31 | H1 = [H_]*(n_r+1) 32 | 33 | for i in range(0,n_r): 34 | alpha = np.dot(H1[i], U_[i]) 35 | print 'alpha: ', 2*alpha 36 | H1[i+1] = H1[i] - 2 * np.outer(alpha, U_[i]) 37 | 38 | H2 = H1[-1] 39 | print H2 40 | 41 | for i in range(n_b): 42 | print np.dot(H2[i],H2[i]) - np.dot(H_[i], H_[i]) 43 | 44 | G = np.ones_like(H_) 45 | Grad_U = np.ones_like(U_) 46 | 47 | for i in range(n_r-1, -1, -1): 48 | G, Grad_U[i] = Hgrad(H1[i], U_[i], G, n_h-i) 49 | 50 | print G 51 | print Grad_U 52 | ############################################################ 53 | ############################################################ 54 | 55 | grad_svd_prod_module = tf.load_op_library('./grad_svd_prod_gpu.so') 56 | 57 | @ops.RegisterGradient("SvdProdGpu") 58 | def _svd_prod_gpu_grad(op, grad): 59 | H = op.inputs[0] 60 | U = op.inputs[1] 61 | return grad_svd_prod_module.grad_svd_prod_gpu(H,U,grad) 62 | ############################################################ 63 | svd_prod_module = tf.load_op_library('./svd_prod_gpu.so') 64 | ############################################################ 65 | 66 | grad_svd_inv_prod_module = tf.load_op_library('./grad_svd_inv_prod_gpu.so') 67 | 68 | @ops.RegisterGradient("SvdInvProdGpu") 69 | def _svd_inv_prod_gpu_grad(op, grad): 70 | H = op.inputs[0] 71 | U = op.inputs[1] 72 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 73 | ############################################################ 74 | svd_inv_prod_module = tf.load_op_library('./svd_inv_prod_gpu.so') 75 | ############################################################ 76 | 77 | with tf.Session() as sess: 78 | 79 | H = tf.constant(H_, dtype=tf.float32) 80 | U = tf.constant(U_, dtype = tf.float32) 81 | 82 | U = tf.matrix_band_part(U, 0, -1) # upper triangular 83 | 84 | z = svd_prod_module.svd_prod_gpu(H,U) 85 | z2 = svd_inv_prod_module.svd_inv_prod_gpu(z,U) 86 | gr = tf.gradients(z, 
[H,U]) 87 | gr2 = tf.gradients(z2, [z,U]) 88 | tf.global_variables_initializer().run() 89 | 90 | print('H,U and product: ',H.eval(), U.eval(),z.eval()) 91 | print('grad_H, grad_U: ' ,gr[0].eval(), gr[1].eval()) 92 | 93 | print('H,U and product: ',H.eval(), U.eval(),z2.eval()) 94 | print('grad_H, grad_U: ' ,gr2[0].eval(), gr2[1].eval()) 95 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("GradSvdProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Input("gradient_backprop: float") 13 | .Output("grad_hidden_state: float") 14 | .Output("grad_householder_matrix: float") 15 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 16 | c->set_output(0, c->input(0)); 17 | c->set_output(1, c->input(1)); 18 | return Status::OK(); 19 | }); 20 | 21 | #include "tensorflow/core/framework/op_kernel.h" 22 | 23 | int GradSvdProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* G_grad, const int n_h, const int batch, const int n_r); 24 | 25 | class GradSvdProdGpuOp : public OpKernel { 26 | public: 27 | explicit GradSvdProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 28 | 29 | void Compute(OpKernelContext* context) override { 30 | // Check number of inputs 31 | OP_REQUIRES(context, context->num_inputs() == 3, 32 | errors::InvalidArgument("GradSvdProd expects 3 inputes.")); 33 | 34 | // Grab the input tensor 35 | const Tensor& H = context->input(0); 36 | const Tensor& U = context->input(1); 37 | const Tensor& G = context->input(2); 38 | auto input = H.flat(); 39 | 40 | // Shapes of input 41 | const TensorShape& H_shape = H.shape(); 42 | const TensorShape& U_shape = U.shape(); 43 | const TensorShape& G_shape = G.shape(); 44 | 45 | const int n_h = H_shape.dim_size(1); 46 | const int n_r = U_shape.dim_size(0); 47 | const int batch = H_shape.dim_size(0); 48 | // Perform dimension check 49 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 50 | errors::InvalidArgument("SvdProd expects H to be a 2-D matrix.")); 51 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 52 | errors::InvalidArgument("SvdProd expects U to be a 2-D matrix.")); 53 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(G_shape), 54 | errors::InvalidArgument("SvdProd expects G to be a 2-D matrix.")); 55 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 56 | errors::InvalidArgument("The second dimension of H and U does not match!")); 57 | OP_REQUIRES(context, G_shape.dim_size(0) == H_shape.dim_size(0), 58 | errors::InvalidArgument("The first dimension of G and H does not match!")); 59 | OP_REQUIRES(context, G_shape.dim_size(1) == H_shape.dim_size(1), 60 | errors::InvalidArgument("The second dimension of G and H does not match!")); 61 | 62 | // Create an output tensor 63 | Tensor* Grad_H = NULL; 64 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&Grad_H)); 65 | Tensor* Grad_U = NULL; 66 | OP_REQUIRES_OK(context, context->allocate_output(1, U_shape,&Grad_U)); 67 | 68 | // obtain data 69 | const float* H_data = 
H.flat().data(); 70 | const float* U_data = U.flat().data(); 71 | const float* G_data = G.flat().data(); 72 | float* Grad_H_data = Grad_H->flat().data(); 73 | float* Grad_U_data = Grad_U->flat().data(); 74 | #if GOOGLE_CUDA 75 | GradSvdProdGpuKernelLauncher(H_data, U_data, G_data, Grad_H_data, Grad_U_data, n_h, batch, n_r); 76 | #endif 77 | } 78 | }; 79 | 80 | REGISTER_KERNEL_BUILDER(Name("GradSvdProdGpu").Device(DEVICE_GPU), GradSvdProdGpuOp); 81 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_inv_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include "tensorflow/core/framework/op.h" 2 | #include "tensorflow/core/framework/shape_inference.h" 3 | #include "tensorflow/core/framework/op_kernel.h" 4 | #include "tensorflow/core/framework/tensor_shape.h" 5 | #include "tensorflow/core/platform/default/logging.h" 6 | 7 | using namespace tensorflow; 8 | 9 | REGISTER_OP("GradSvdInvProdGpu") 10 | .Input("hidden_state: float") 11 | .Input("householder_matrix: float") 12 | .Input("gradient_backprop: float") 13 | .Output("grad_hidden_state: float") 14 | .Output("grad_householder_matrix: float") 15 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 16 | c->set_output(0, c->input(0)); 17 | c->set_output(1, c->input(1)); 18 | return Status::OK(); 19 | }); 20 | 21 | #include "tensorflow/core/framework/op_kernel.h" 22 | 23 | int GradSvdInvProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* G_grad, const int n_h, const int batch, const int n_r); 24 | 25 | class GradSvdInvProdGpuOp : public OpKernel { 26 | public: 27 | explicit GradSvdInvProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {} 28 | 29 | void Compute(OpKernelContext* context) override { 30 | // Check number of inputs 31 | OP_REQUIRES(context, context->num_inputs() == 3, 32 | errors::InvalidArgument("GradSvdInvProd expects 3 inputes.")); 33 | 34 | // Grab the input tensor 35 | const Tensor& H = context->input(0); 36 | const Tensor& U = context->input(1); 37 | const Tensor& G = context->input(2); 38 | auto input = H.flat(); 39 | 40 | // Shapes of input 41 | const TensorShape& H_shape = H.shape(); 42 | const TensorShape& U_shape = U.shape(); 43 | const TensorShape& G_shape = G.shape(); 44 | 45 | const int n_h = H_shape.dim_size(1); 46 | const int n_r = U_shape.dim_size(0); 47 | const int batch = H_shape.dim_size(0); 48 | // Perform dimension check 49 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 50 | errors::InvalidArgument("SvdInvProd expects H to be a 2-D matrix.")); 51 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 52 | errors::InvalidArgument("SvdInvProd expects U to be a 2-D matrix.")); 53 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(G_shape), 54 | errors::InvalidArgument("SvdInvProd expects G to be a 2-D matrix.")); 55 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 56 | errors::InvalidArgument("The second dimension of H and U does not match!")); 57 | OP_REQUIRES(context, G_shape.dim_size(0) == H_shape.dim_size(0), 58 | errors::InvalidArgument("The first dimension of G and H does not match!")); 59 | OP_REQUIRES(context, G_shape.dim_size(1) == H_shape.dim_size(1), 60 | errors::InvalidArgument("The second dimension of G and H does not match!")); 61 | 62 | // Create an output tensor 63 | Tensor* Grad_H = NULL; 64 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&Grad_H)); 65 | Tensor* Grad_U = NULL; 66 
| OP_REQUIRES_OK(context, context->allocate_output(1, U_shape,&Grad_U)); 67 | 68 | // obtain data 69 | const float* H_data = H.flat().data(); 70 | const float* U_data = U.flat().data(); 71 | const float* G_data = G.flat().data(); 72 | float* Grad_H_data = Grad_H->flat().data(); 73 | float* Grad_U_data = Grad_U->flat().data(); 74 | #if GOOGLE_CUDA 75 | GradSvdInvProdGpuKernelLauncher(H_data, U_data, G_data, Grad_H_data, Grad_U_data, n_h, batch, n_r); 76 | #endif 77 | } 78 | }; 79 | 80 | REGISTER_KERNEL_BUILDER(Name("GradSvdInvProdGpu").Device(DEVICE_GPU), GradSvdInvProdGpuOp); 81 | -------------------------------------------------------------------------------- /code/load.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import time 5 | import math 6 | import numpy as np 7 | import csv 8 | import pickle 9 | import sklearn 10 | from sklearn.utils import shuffle 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.datasets import fetch_mldata 13 | import os 14 | import cPickle as pickle 15 | import urllib2 16 | 17 | datasets_dir = os.getcwd() + '/../data/' 18 | 19 | def load_mnist_local(): 20 | data_dir = os.path.join(datasets_dir,'mnist/') 21 | fd = open(os.path.join(data_dir,'train-images-idx3-ubyte')) 22 | loaded = np.fromfile(file=fd,dtype=np.uint8) 23 | trX = loaded[16:].reshape((60000,28*28)).astype(float) 24 | 25 | fd = open(os.path.join(data_dir,'train-labels-idx1-ubyte')) 26 | loaded = np.fromfile(file=fd,dtype=np.uint8) 27 | trY = loaded[8:].reshape((60000)) 28 | 29 | fd = open(os.path.join(data_dir,'t10k-images-idx3-ubyte')) 30 | loaded = np.fromfile(file=fd,dtype=np.uint8) 31 | teX = loaded[16:].reshape((10000,28*28)).astype(float) 32 | 33 | fd = open(os.path.join(data_dir,'t10k-labels-idx1-ubyte')) 34 | loaded = np.fromfile(file=fd,dtype=np.uint8) 35 | teY = loaded[8:].reshape((10000)) 36 | 37 | 38 | return np.concatenate((trX,teX)), np.concatenate((trY,teY)) 39 | 40 | 41 | 42 | 43 | ''' prepare dataset ''' 44 | def load_mnist(params, permute=False): 45 | mnist = fetch_mldata('MNIST original') 46 | mnist_X, mnist_y = shuffle(mnist.data, mnist.target, random_state=params.random_seed) 47 | #mnist_X, mnist_y = load_mnist_local() 48 | mnist_X = mnist_X / 255.0 49 | print mnist_X.shape, mnist_y.shape 50 | print("MNIST data prepared") 51 | 52 | mnist_X, mnist_y = mnist_X.astype('float32'), mnist_y.astype('int64') 53 | if permute: 54 | np.random.seed(0); permute = np.random.permutation(784) 55 | mnist_X = mnist_X[:, permute] 56 | def flatten_img(images): 57 | ''' 58 | images: shape => (n, rows, columns) 59 | output: shape => (n, rows*columns) 60 | ''' 61 | n_rows = images.shape[1] 62 | n_columns = images.shape[2] 63 | for num in range(n_rows): 64 | if num % 2 != 0: 65 | images[:, num, :] = images[:, num, :][:, ::-1] 66 | output = images.reshape(-1, n_rows*n_columns) 67 | return output 68 | 69 | time_steps = 28*28 70 | if len(params.dataset) > 6: # mnist.xx 71 | time_steps = int(params.dataset.split('.')[1]) 72 | mnist_X = mnist_X.reshape((-1, time_steps, 28*28/time_steps)) 73 | #mnist_X = flatten_img(mnist_X) # X.shape => (n_samples, seq_len) 74 | print "mnist_X.shape = ", mnist_X.shape 75 | #mnist_X = mnist_X[:, :, np.newaxis] # X.shape => (n_samples, seq_len, n_features) 76 | mnist_y_one_hot = np.zeros((mnist_y.shape[0], 10)) 77 | for i in xrange(len(mnist_y)): 78 | mnist_y_one_hot[i][mnist_y[i]] = 1 79 | print "mnist_y.shape = ", mnist_y_one_hot.shape 80 | 81 | 
train_X, test_X, train_y, test_y = train_test_split(mnist_X, mnist_y_one_hot, 82 | test_size=0.2, 83 | random_state=params.random_seed) 84 | # need to set parameters according to dataset 85 | params.time_steps = train_X.shape[1] 86 | params.input_size = train_X.shape[2] 87 | params.output_size = 10 88 | params.regression_flag = False 89 | return train_X, test_X, train_y, test_y 90 | 91 | 92 | def adding_task(params, fname=datasets_dir+'Adding_task/data', ntrain=50000, ntest=1000): 93 | filename = fname + str(params.time_steps) 94 | data = np.loadtxt(filename, delimiter=',').astype(np.float32) 95 | x = data[:,1:]; y = data[:,0] 96 | assert(ntrain+ntest <= x.shape[0]) 97 | train_X = x.reshape((x.shape[0], x.shape[1]//2, 2)) 98 | train_Y = y.reshape((y.shape[0], 1)) 99 | params.time_steps = train_X.shape[1] 100 | params.input_size = train_X.shape[2] 101 | params.output_size = 1 102 | params.regression_flag = True 103 | print("Adding task with %i time step prepared!"%params.time_steps) 104 | print "Adding X shape: ", train_X.shape 105 | print "Adding Y shape: ", train_Y.shape 106 | 107 | return train_X[0 : ntrain], train_X[ntrain : ntrain + ntest], train_Y[0 : ntrain], train_Y[ntrain : ntrain + ntest] 108 | 109 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | 3 | #define EIGEN_USE_GPU 4 | #include 5 | #include 6 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 7 | 8 | 9 | int inline Hprod(cublasHandle_t handle,const float* H_in, float* H_out, const float* u, float* alpha, const int k, const int n_h, const int batch) { 10 | 11 | cublasStatus_t stat; 12 | float aa = 0; 13 | float bb = 0; 14 | float cc = -1.0; 15 | // aa = 2.0 / u^T * u 16 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 17 | aa = 2.0 / aa; 18 | // make sure that leading (n_h-k) entrees of u are 0 19 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 20 | // compute alpha = aa * H^T * u 21 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 22 | &aa, H_in, n_h, 23 | u, 1, 24 | &bb, alpha, 1); 25 | // update H 26 | stat = cublasScopy(handle, n_h * batch , H_in, 1, H_out, 1); // fill H_out with H_in 27 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H_out, n_h); 28 | 29 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 30 | return EXIT_SUCCESS; 31 | } 32 | 33 | int inline Hgrad(cublasHandle_t handle,const float* H, const float* u, float* G, float* u_grad, float* alpha, float* beta, const int k, const int n_h, const int batch) { 34 | 35 | cublasStatus_t stat; 36 | float aa = 0; 37 | float zero = 0; 38 | float neg_one = -1.0; 39 | float pos_one = 1.0; 40 | float alpha_dot_beta = 0; 41 | // aa = 2.0 / u^T * u 42 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 43 | aa = 2.0 / aa; 44 | // make sure that leading (n_h-k) entrees of u are 0 45 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 46 | // compute alpha = aa * H^T * u 47 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 48 | &aa, H, n_h, 49 | u, 1, 50 | &zero, alpha, 1); 51 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha failed\n"); return EXIT_FAILURE; } 52 | // compute beta = aa * G^T * u 53 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 54 | &aa, G, n_h, 55 | u, 1, 56 | &zero, beta, 1); 57 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("beta failed\n"); return EXIT_FAILURE; } 58 | // compute dot(alpha, 
beta) 59 | stat = cublasSdot (handle, batch, alpha, 1, beta, 1, &alpha_dot_beta); 60 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha dot beta failed\n"); return EXIT_FAILURE; } 61 | // u_grad = - G * alpha + 0 * u_grad 62 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 63 | &neg_one, G, n_h, 64 | alpha, 1, 65 | &zero, u_grad, 1); 66 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad alpha failed\n"); return EXIT_FAILURE; } 67 | // u_grad = - G * alpha + 1 * u_grad 68 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 69 | &neg_one, H, n_h, 70 | beta, 1, 71 | &pos_one, u_grad, 1); 72 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad beta failed\n"); return EXIT_FAILURE; } 73 | // u_grad = alpha_dot_beta * u + 1 * u_grad 74 | stat = cublasSaxpy(handle, n_h, &alpha_dot_beta, u, 1, u_grad, 1); 75 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad u failed\n"); return EXIT_FAILURE; } 76 | // zero out first n_h - k entrees --- there is better way! 77 | stat = cublasSscal(handle, n_h - k, &zero, u_grad, 1); 78 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad zero out failed\n"); return EXIT_FAILURE; } 79 | // update G 80 | stat = cublasSger(handle, n_h, batch, &neg_one, u, 1, beta, 1, G, n_h); 81 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("G update failed\n"); return EXIT_FAILURE; } 82 | return EXIT_SUCCESS; 83 | } 84 | // host function for CUDA kernels 85 | int GradSvdProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* U_grad, const int n_h, const int batch, const int n_r) { 86 | cublasStatus_t stat; 87 | cudaError_t cudaStat; 88 | cublasHandle_t handle; 89 | // creat handle 90 | stat = cublasCreate_v2(&handle); 91 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on GradSvdProd\n"); return EXIT_FAILURE; } 92 | // allocate alpha 93 | float* alpha; 94 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 95 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 96 | float* beta; 97 | cudaStat = cudaMalloc ((void**)&beta, batch*sizeof(float)); 98 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of beta failed"); return EXIT_FAILURE; } 99 | // allocate H_hist 100 | float* H_hist; 101 | cudaStat = cudaMalloc ((void**)&H_hist, (n_r-1)*batch*n_h*sizeof(float)); 102 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of H_hist failed"); return EXIT_FAILURE; } 103 | // begin computation 104 | Hprod(handle, H, H_hist, U, alpha, n_h, n_h, batch); 105 | for(int r=1; r < n_r-1; r++) { 106 | Hprod(handle, H_hist + (r-1)*batch*n_h, H_hist + r*batch*n_h, U + n_h*r, alpha, n_h - r, n_h, batch); 107 | } 108 | 109 | stat = cublasScopy(handle, n_h * batch , G, 1, H_grad, 1); // fill H_out with H_in 110 | 111 | for(int r=n_r-1; r >0; r--) { 112 | Hgrad(handle, H_hist + (r-1)*batch*n_h, U + n_h*r, H_grad, U_grad + n_h*r, alpha, beta, n_h - r, n_h, batch); 113 | } 114 | Hgrad(handle, H, U, H_grad, U_grad, alpha, beta, n_h, n_h, batch); 115 | 116 | 117 | cudaFree(alpha); 118 | cudaFree(beta); 119 | cudaFree(H_hist); 120 | cublasDestroy(handle); 121 | return EXIT_SUCCESS; 122 | } 123 | #endif 124 | -------------------------------------------------------------------------------- /code/cuda_svd_ops/grad_svd_inv_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #if GOOGLE_CUDA 2 | 3 | #define EIGEN_USE_GPU 4 | #include 5 | #include 6 | #include 
"third_party/eigen3/unsupported/Eigen/CXX11/Tensor" 7 | 8 | 9 | int inline Hprod(cublasHandle_t handle,const float* H_in, float* H_out, const float* u, float* alpha, const int k, const int n_h, const int batch) { 10 | 11 | cublasStatus_t stat; 12 | float aa = 0; 13 | float bb = 0; 14 | float cc = -1.0; 15 | // aa = 2.0 / u^T * u 16 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 17 | aa = 2.0 / aa; 18 | // make sure that leading (n_h-k) entrees of u are 0 19 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 20 | // compute alpha = aa * H^T * u 21 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 22 | &aa, H_in, n_h, 23 | u, 1, 24 | &bb, alpha, 1); 25 | // update H 26 | stat = cublasScopy(handle, n_h * batch , H_in, 1, H_out, 1); // fill H_out with H_in 27 | stat = cublasSger(handle, n_h, batch, &cc, u, 1, alpha, 1, H_out, n_h); 28 | 29 | if (stat != CUBLAS_STATUS_SUCCESS) return EXIT_FAILURE; 30 | return EXIT_SUCCESS; 31 | } 32 | 33 | int inline Hgrad(cublasHandle_t handle,const float* H, const float* u, float* G, float* u_grad, float* alpha, float* beta, const int k, const int n_h, const int batch) { 34 | 35 | cublasStatus_t stat; 36 | float aa = 0; 37 | float zero = 0; 38 | float neg_one = -1.0; 39 | float pos_one = 1.0; 40 | float alpha_dot_beta = 0; 41 | // aa = 2.0 / u^T * u 42 | stat = cublasSdot (handle, k, u + n_h - k, 1, u + n_h - k, 1, &aa); 43 | aa = 2.0 / aa; 44 | // make sure that leading (n_h-k) entrees of u are 0 45 | //stat = cublasSscal(handle, n_h-k, &bb, u, 1); 46 | // compute alpha = aa * H^T * u 47 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 48 | &aa, H, n_h, 49 | u, 1, 50 | &zero, alpha, 1); 51 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha failed\n"); return EXIT_FAILURE; } 52 | // compute beta = aa * G^T * u 53 | stat = cublasSgemv(handle, CUBLAS_OP_T, n_h, batch, 54 | &aa, G, n_h, 55 | u, 1, 56 | &zero, beta, 1); 57 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("beta failed\n"); return EXIT_FAILURE; } 58 | // compute dot(alpha, beta) 59 | stat = cublasSdot (handle, batch, alpha, 1, beta, 1, &alpha_dot_beta); 60 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("alpha dot beta failed\n"); return EXIT_FAILURE; } 61 | // u_grad = - G * alpha + 0 * u_grad 62 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 63 | &neg_one, G, n_h, 64 | alpha, 1, 65 | &zero, u_grad, 1); 66 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad alpha failed\n"); return EXIT_FAILURE; } 67 | // u_grad = - G * alpha + 1 * u_grad 68 | stat = cublasSgemv(handle, CUBLAS_OP_N, n_h, batch, 69 | &neg_one, H, n_h, 70 | beta, 1, 71 | &pos_one, u_grad, 1); 72 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad beta failed\n"); return EXIT_FAILURE; } 73 | // u_grad = alpha_dot_beta * u + 1 * u_grad 74 | stat = cublasSaxpy(handle, n_h, &alpha_dot_beta, u, 1, u_grad, 1); 75 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad u failed\n"); return EXIT_FAILURE; } 76 | // zero out first n_h - k entrees --- there is better way! 
77 | stat = cublasSscal(handle, n_h - k, &zero, u_grad, 1); 78 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("u_grad zero out failed\n"); return EXIT_FAILURE; } 79 | // update G 80 | stat = cublasSger(handle, n_h, batch, &neg_one, u, 1, beta, 1, G, n_h); 81 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("G update failed\n"); return EXIT_FAILURE; } 82 | return EXIT_SUCCESS; 83 | } 84 | // host function for CUDA kernels 85 | int GradSvdInvProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* U_grad, const int n_h, const int batch, const int n_r) { 86 | cublasStatus_t stat; 87 | cudaError_t cudaStat; 88 | cublasHandle_t handle; 89 | // creat handle 90 | stat = cublasCreate_v2(&handle); 91 | if (stat != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS initialization failed on GradSvdInvProd\n"); return EXIT_FAILURE; } 92 | // allocate alpha 93 | float* alpha; 94 | cudaStat = cudaMalloc ((void**)&alpha, batch*sizeof(float)); 95 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of alpha failed"); return EXIT_FAILURE; } 96 | float* beta; 97 | cudaStat = cudaMalloc ((void**)&beta, batch*sizeof(float)); 98 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of beta failed"); return EXIT_FAILURE; } 99 | // allocate H_hist 100 | float* H_hist; 101 | cudaStat = cudaMalloc ((void**)&H_hist, (n_r-1)*batch*n_h*sizeof(float)); 102 | if (cudaStat != cudaSuccess) { printf ("device memory allocation of H_hist failed"); return EXIT_FAILURE; } 103 | // begin computation 104 | Hprod(handle, H, H_hist + (n_r-2)*batch*n_h, U + n_h*(n_r-1), alpha, n_h - n_r + 1, n_h, batch); 105 | for(int r=n_r-2; r > 0; r--) { 106 | Hprod(handle, H_hist + r*batch*n_h, H_hist + (r-1)*batch*n_h, U + n_h*r, alpha, n_h - r, n_h, batch); 107 | } 108 | 109 | stat = cublasScopy(handle, n_h * batch , G, 1, H_grad, 1); // fill H_out with H_in 110 | 111 | for(int r=0; r < n_r-1; r++) { 112 | Hgrad(handle, H_hist + r*batch*n_h, U + n_h*r, H_grad, U_grad + n_h*r, alpha, beta, n_h - r, n_h, batch); 113 | } 114 | Hgrad(handle, H, U + n_h*(n_r-1), H_grad, U_grad + n_h*(n_r-1), alpha, beta, n_h - n_r + 1, n_h, batch); 115 | 116 | 117 | cudaFree(alpha); 118 | cudaFree(beta); 119 | cudaFree(H_hist); 120 | cublasDestroy(handle); 121 | return EXIT_SUCCESS; 122 | } 123 | #endif 124 | -------------------------------------------------------------------------------- /code/Params.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | 4 | """ 5 | Parameter class 6 | """ 7 | class Params (object): 8 | def __init__(self): 9 | self.cell = None # RNN cell 10 | self.initial_learning_rate = math.exp(-10) # learning rate for SGD, [exp(-10), 1] 11 | self.lr_decay = 0.8 # the multiplier to multiply the learning rate every epoch 12 | self.num_epochs = 100 # number of epochs 13 | self.dropout_keep_rate = 0.5 # percent of output units that are kept during dropout, in range (0, 1] 14 | self.num_units = 200 # number of units 15 | self.num_layers = 1 # number of layers 16 | self.r_size = 60 # the number of Householder reflectors used in Spectral-RNN 17 | self.r_margin = 0.01 # the singular value margin in Spectral-RNN 18 | self.time_steps = None # time steps, time_steps*input_size = sequence length 19 | self.input_size = None # dimensionality of input features at each time step 20 | self.output_size = None # dimensionality of label 21 | self.gpu_flag = True # use GPU or not, Spectral-RNN only available in GPU mode 22 | self.random_seed = 1000 
# random seed 23 | self.dataset = 'mnist.28' # dataset name, mnist.[length] where length will overwrite self.time_steps 24 | self.batch_size = 128 # batch size 25 | self.regression_flag = True # regression or classification 26 | self.model_dir = '' # directory to save model, will append .cell_name 27 | self.load_model_dir = '' # directory to save model, will append .cell_name 28 | self.pred_dir = '' # directory for prediction results, will append .dataset.cell_name.[Xy] 29 | self.load_model = False # load model or not 30 | self.train_flag = True # train model or not 31 | self.batch_norm = False # batch normalization or not 32 | self.display_epoch_num = 1 # display how many evaluations per epoch 33 | """ 34 | convert to json 35 | """ 36 | def toJson(self): 37 | data = dict() 38 | data['cell'] = self.cell 39 | data['initial_learning_rate'] = self.initial_learning_rate 40 | data['lr_decay'] = self.lr_decay 41 | data['num_epochs'] = self.num_epochs 42 | data['dropout_keep_rate'] = self.dropout_keep_rate 43 | data['num_units'] = self.num_units 44 | data['num_layers'] = self.num_layers 45 | data['r_size'] = self.r_size 46 | data['r_margin'] = self.r_margin 47 | data['time_steps'] = self.time_steps 48 | data['input_size'] = self.input_size 49 | data['output_size'] = self.output_size 50 | data['gpu_flag'] = self.gpu_flag 51 | data['batch_size'] = self.batch_size 52 | data['random_seed'] = self.random_seed 53 | data['dataset'] = self.dataset 54 | data['regression_flag'] = self.regression_flag 55 | data['model_dir'] = self.model_dir 56 | data['load_model_dir'] = self.load_model_dir 57 | data['pred_dir'] = self.pred_dir 58 | data['load_model'] = self.load_model 59 | data['train_flag'] = self.train_flag 60 | data['batch_norm'] = self.batch_norm 61 | data['display_epoch_num'] = self.display_epoch_num 62 | return data 63 | """ 64 | load form json 65 | """ 66 | def fromJson(self, data): 67 | if 'cell' in data: self.cell = data['cell'] 68 | if 'initial_learning_rate' in data: self.initial_learning_rate = data['initial_learning_rate'] 69 | if 'lr_decay' in data: self.lr_decay = data['lr_decay'] 70 | if 'num_epochs' in data: self.num_epochs = data['num_epochs'] 71 | if 'dropout_keep_rate' in data: self.dropout_keep_rate = data['dropout_keep_rate'] 72 | if 'num_units' in data: self.num_units = data['num_units'] 73 | if 'num_layers' in data: self.num_layers = data['num_layers'] 74 | if 'r_size' in data: self.r_size = data['r_size'] 75 | if 'r_margin' in data: self.r_margin = data['r_margin'] 76 | if 'time_steps' in data: self.time_steps = data['time_steps'] 77 | if 'input_size' in data: self.input_size = data['input_size'] 78 | if 'output_size' in data: self.output_size = data['output_size'] 79 | if 'gpu_flag' in data: self.gpu_flag = data['gpu_flag'] 80 | if 'batch_size' in data: self.batch_size = data['batch_size'] 81 | if 'random_seed' in data: self.random_seed = data['random_seed'] 82 | if 'dataset' in data: self.dataset = data['dataset'] 83 | if 'regression_flag' in data: self.regression_flag = data['regression_flag'] 84 | if 'model_dir' in data: self.model_dir = data['model_dir'] 85 | if 'load_model_dir' in data: self.load_model_dir = data['load_model_dir'] 86 | if 'pred_dir' in data: self.pred_dir = data['pred_dir'] 87 | if 'load_model' in data: self.load_model = data['load_model'] 88 | if 'train_flag' in data: self.train_flag = data['train_flag'] 89 | if 'batch_norm' in data: self.batch_norm = data['batch_norm'] 90 | if 'display_epoch_num' in data: self.display_epoch_num = data['display_epoch_num'] 91 
| 92 | """ 93 | dump to json file 94 | """ 95 | def dump(self, filename): 96 | with open(filename, 'w') as f: 97 | meta = self.toJson() 98 | json.dump(dict((key, value) for key, value in meta.iteritems() if value != None), f) 99 | """ 100 | load from json file 101 | """ 102 | def load(self, filename): 103 | with open(filename, 'r') as f: 104 | self.fromJson(json.load(f)) 105 | """ 106 | string 107 | """ 108 | def __str__(self): 109 | return str(self.toJson()) 110 | """ 111 | print 112 | """ 113 | def __repr__(self): 114 | return self.__str__() 115 | -------------------------------------------------------------------------------- /code/svd_ops.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from tensorflow.python.framework import ops 3 | import tensorflow as tf 4 | 5 | def Hprod(H, u, k): 6 | # H.shape = (batch, n_h) 7 | # u.shape = (n_h,) 8 | alpha = 2* np.dot(H[:, -k:], u[-k:]) / np.dot(u[-k:],u[-k:]) # alpha.shape = (batch,) 9 | H_out = H.copy() 10 | H_out[:, -k:] -= np.outer(alpha, u[-k:]) 11 | return H_out 12 | 13 | def tf_Hprod(H, u, k): 14 | # H.shape = (batch, n_h) 15 | # u.shape = (n_h,) 16 | u_square = tf.tensordot(u[-k:],u[-k:],1) 17 | alpha = 2* tf.tensordot(H[:, -k:], u[-k:],1) / u_square # alpha.shape = (batch,) 18 | H_update = tf.identity(H[:,-k:]) 19 | #H_update = tf.subtract(H_update, tf.einsum('i,j->ij',alpha, u[-k:])) 20 | H_update = tf.subtract(H_update, tf.expand_dims(alpha,1) * tf.expand_dims(u[-k:],0)) 21 | 22 | H_out = tf.concat([H[:,0 :-k], H_update], axis=1) 23 | return H_out 24 | 25 | def Hgrad(H, u, G, k): # unused 26 | # H.shape = (batch, n_h) 27 | # u.shape = (n_h,) 28 | # G.shape = (batch, n_h) 29 | alpha = 2* np.dot(H[:, -k:], u[-k:]) # alpha.shape = (batch,) 30 | beta = 2* np.dot(G[:, -k:], u[-k:]) # beta.shape = (batch,) 31 | u_bar = -np.dot(alpha,G) - np.dot(beta,H) + np.dot(alpha,beta)*u # sum of gradient within the batch: averaging needed??? 32 | G_out = G.copy() 33 | G_out -= np.outer(beta,u) 34 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 35 | 36 | def tf_Hgrad(H, u, G, k): 37 | # H.shape = (batch, n_h) 38 | # u.shape = (n_h,) 39 | # G.shape = (batch, n_h) 40 | u_square = tf.tensordot(u[-k:],u[-k:],1) 41 | alpha = 2* tf.tensordot(H[:, -k:], u[-k:],1) / u_square # alpha.shape = (batch,) 42 | beta = 2* tf.tensordot(G[:, -k:], u[-k:],1) / u_square # beta.shape = (batch,) 43 | 44 | u_bar = -tf.tensordot(alpha,G[:,-k:],1) - tf.tensordot(beta,H[:,-k:],1) + tf.tensordot(alpha,beta,1)*u[-k:] # sum of gradient within the batch: averaging needed??? 
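# With H_out = H (I - 2 u u^T / u^T u), the backward pass gives
#   dL/du = -G^T alpha - H^T beta + (alpha . beta) u   and   dL/dH = G - beta u^T,
# where alpha = 2 H u / u^T u and beta = 2 G u / u^T u (as computed above); only
# the trailing k entries of u are active, so its leading n_h - k entries
# (zero by construction) are concatenated back in below.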
45 | u_bar = tf.concat([u[0 :-k],u_bar], axis=0) 46 | 47 | 48 | G_update = tf.identity(G[:,-k:]) 49 | delta_G = tf.expand_dims(beta, 1) * tf.reshape(u[-k: ], shape=(1 , k)) 50 | G_update = tf.subtract(G_update, delta_G) 51 | G_out = tf.concat([G[:,0 :-k], G_update], axis=1) 52 | 53 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 54 | ###### FP definition ######## 55 | 56 | def np_svdProd(H,U): 57 | #U_shape = U.get_shape().as_list() 58 | U_shape = U.shape 59 | n_r = U_shape[0]; n_h = U_shape[1] 60 | assert( H.shape[1] == n_h) 61 | H_copy = H.copy() 62 | for i in range(0, n_r): 63 | H_copy = Hprod(H_copy, U[i], n_h-i) 64 | return H_copy 65 | 66 | def np_svdProd_inv(H,U): 67 | #U_shape = U.get_shape().as_list() 68 | U_shape = U.shape 69 | n_r = U_shape[0]; n_h = U_shape[1] 70 | assert( H.shape[1] == n_h) 71 | H_copy = H.copy() 72 | for i in range(n_r-1,-1,-1): 73 | H_copy = Hprod(H_copy, U[i], n_h-i) 74 | return H_copy 75 | ###### BP definition ######### 76 | 77 | def svdProdGrad(op, grad): 78 | H = op.inputs[0] 79 | U = op.inputs[1] 80 | 81 | #return H, grad 82 | 83 | U_shape = U.get_shape().as_list() 84 | n_r = U_shape[0]; n_h = U_shape[1] 85 | #batch = H.get_shape().as_list()[0] 86 | #assert( H.get_shape().as_list()[1] == n_h) 87 | 88 | H_hist = [tf.zeros_like(H, dtype=tf.float32)]*n_r 89 | 90 | H_hist[0] = tf.add(H_hist[0], H) 91 | for i in range(0, n_r-1): 92 | H_hist[i+1] = tf_Hprod( H_hist[i], U[i,:], n_h-i) 93 | 94 | U_bar = [tf.zeros_like(U[0,:], dtype=tf.float32)] * n_r 95 | G = grad 96 | 97 | for i in range(n_r-1, -1, -1): 98 | G, U_bar[i] = tf_Hgrad(H_hist[i], U[i], G, n_h-i) 99 | U_grad = tf.stack(U_bar) 100 | 101 | return G, U_grad #the propagated gradient with respect to the first and second argument respectively 102 | 103 | def svdProdGrad_inv(op, grad): 104 | H = op.inputs[0] 105 | U = op.inputs[1] 106 | 107 | U_shape = U.get_shape().as_list() 108 | n_r = U_shape[0]; n_h = U_shape[1] 109 | 110 | H_hist = [tf.zeros_like(H, dtype=tf.float32)]*n_r 111 | 112 | H_hist[n_r-1] = tf.add(H_hist[n_r-1], H) 113 | for i in range(n_r-1, 0, -1): 114 | H_hist[i-1] = tf_Hprod( H_hist[i], U[i,:], n_h-i) 115 | 116 | U_bar = [tf.zeros_like(U[0,:], dtype=tf.float32)] * n_r 117 | G = grad 118 | 119 | for i in range(0, n_r): 120 | G, U_bar[i] = tf_Hgrad(H_hist[i], U[i], G, n_h-i) 121 | U_grad = tf.stack(U_bar) 122 | 123 | return G, U_grad #the propagated gradient with respect to the first and second argument respectively 124 | 125 | ###### TF operator definition ####### 126 | 127 | def py_func(func, inp, Tout, stateful=True, name=None, grad=None): 128 | 129 | # Need to generate a unique name to avoid duplicates: 130 | rnd_name = 'PyFuncGrad' + str(np.random.randint(0, 1E+8)) 131 | 132 | tf.RegisterGradient(rnd_name)(grad) # see _MySquareGrad for grad example 133 | g = tf.get_default_graph() 134 | with g.gradient_override_map({"PyFunc": rnd_name}): 135 | return tf.py_func(func, inp, Tout, stateful=stateful, name=name) 136 | 137 | 138 | def tf_svdProd(H,U, name=None): 139 | 140 | with ops.name_scope(name, "svdProd",[H,U] )as name: 141 | z = py_func(np_svdProd, 142 | [H,U], 143 | [tf.float32], 144 | name=name, 145 | grad=svdProdGrad) # <-- here's the call to the gradient 146 | return z[0] 147 | 148 | def tf_svdProd_inv(H,U, name=None): 149 | 150 | with ops.name_scope(name, "svdProd_inv",[H,U] )as name: 151 | z = py_func(np_svdProd_inv, 152 | [H,U], 153 | [tf.float32], 154 | name=name, 155 | grad=svdProdGrad_inv) # <-- here's the call to the gradient 156 | return z[0] 157 | 
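###### sanity check (added commentary) ######
# A small self-contained NumPy check, not part of the original file: because each
# Householder reflection House(u) is symmetric and orthogonal, applying the product of
# reflectors with np_svdProd and then the same product in reverse order with
# np_svdProd_inv must return H unchanged. This is the orthogonality that the SVD-style
# parameterization in spectral_rnn._svdlinear relies on. The sizes and seed below are
# arbitrary and only for illustration.
if __name__ == "__main__":
    n_h, n_r, batch = 8, 3, 4
    rng = np.random.RandomState(0)
    H_demo = rng.randn(batch, n_h)
    U_demo = np.triu(rng.randn(n_r, n_h))   # rows are reflectors, upper triangular as in _svdlinear
    H_back = np_svdProd_inv(np_svdProd(H_demo, U_demo), U_demo)
    assert np.allclose(H_demo, H_back)      # round trip through Q and Q^T recovers H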
-------------------------------------------------------------------------------- /code/magma_svd_ops/svd_block_prod_gpu.cc: --------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include "magma_v2.h"
4 | #include "tensorflow/core/framework/op.h"
5 | #include "tensorflow/core/framework/shape_inference.h"
6 | #include "tensorflow/core/framework/op_kernel.h"
7 | #include "tensorflow/core/framework/tensor_shape.h"
8 | #include "tensorflow/core/platform/default/logging.h"
9 | //#define PRINT_DEBUG
10 |
11 | using namespace tensorflow;
12 |
13 | REGISTER_OP("SvdBlockProdGpu")
14 | .Input("hidden_state: float")
15 | .Input("householder_matrix: float")
16 | .Attr("is_forward: bool = true")
17 | .Output("output_state: float")
18 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) {
19 | c->set_output(0, c->input(0));
20 | return Status::OK();
21 | });
22 |
23 |
24 | // TODO: move the two declarations to a .hpp
25 | struct workspace {
26 | magmaFloat_ptr *T_array;
27 | magmaFloat_ptr *Tau_array;
28 | magmaFloat_ptr *Twork_array;
29 | magmaFloat_ptr *V_array;
30 | magmaFloat_ptr T;
31 | magmaFloat_ptr tau;
32 | magmaFloat_ptr twork;
33 | magmaFloat_ptr dwork;
34 | magmaFloat_ptr dworkvt;
35 | };
36 |
37 | int SvdBlockProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r, magma_queue_t queue, workspace ws, const bool isForward=true);
38 |
39 | class SvdBlockProdGpuOp : public OpKernel {
40 | private:
41 | bool _isForward;
42 | magma_queue_t _queue;
43 | bool _queue_created;
44 | bool _persistent_tensor_created;
45 | PersistentTensor _T_array;
46 | PersistentTensor _Tau_array;
47 | PersistentTensor _Twork_array;
48 | PersistentTensor _V_array;
49 | PersistentTensor _T;
50 | PersistentTensor _tau;
51 | PersistentTensor _twork;
52 | int _batchCount;
53 | public:
54 | explicit SvdBlockProdGpuOp(OpKernelConstruction* context) : OpKernel(context) {
55 | // Get the index of the value to preserve
56 | OP_REQUIRES_OK(context,
57 | context->GetAttr("is_forward", &_isForward));
58 | // printf("Calling magma init...\n");
59 | magma_int_t stat = magma_init();
60 | if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); }
61 | _queue_created = false;
62 | _persistent_tensor_created = false;
63 | // create array space
64 | _batchCount = 1;
65 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_T_array, nullptr));
66 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Tau_array, nullptr));
67 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Twork_array, nullptr));
68 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_V_array, nullptr));
69 | }
70 |
71 | ~SvdBlockProdGpuOp() override {
72 | // printf("Calling magma finalize...\n");
73 | if (_queue_created) { // only destroy a queue that was actually created in Compute()
74 | // printf("destroying magma queue!\n");
75 | magma_queue_destroy(_queue);
76 | _queue_created = false;
77 | }
78 | magma_finalize();
79 | }
80 |
81 | void Compute(OpKernelContext* context) override {
82 | // Check number of inputs
83 | OP_REQUIRES(context, context->num_inputs() == 2,
84 | errors::InvalidArgument("SvdBlockProd expects 2 inputs."));
85 |
86 | // Grab the input tensor
87 | const Tensor& H = context->input(0);
88 | const Tensor& U = context->input(1);
89 | // Shapes of input
90 | const TensorShape& H_shape = H.shape();
91 | const TensorShape&
U_shape = U.shape(); 92 | 93 | const int n_h = H_shape.dim_size(1); 94 | const int n_r = U_shape.dim_size(0); 95 | const int batch = H_shape.dim_size(0); 96 | // Perform dimension check 97 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape), 98 | errors::InvalidArgument("SvdBlockProd expects H to be a 2-D matrix.")); 99 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape), 100 | errors::InvalidArgument("SvdBlockProd expects U to be a 2-D matrix.")); 101 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1), 102 | errors::InvalidArgument("The second dimension of H and U does not match!")); 103 | 104 | // Create an output tensor 105 | Tensor* H_out = NULL; 106 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&H_out)); 107 | 108 | // obtain data 109 | const float* H_data = H.flat().data(); 110 | const float* U_data = U.flat().data(); 111 | float* H_out_data = H_out->flat().data(); 112 | /* 113 | // test 114 | int idx =0; 115 | std::printf( "Before:\n"); 116 | for(int i=0; i < H_shape.dim_size(0); i++){ 117 | for(int j=0; j < H_shape.dim_size(1); j++){ 118 | idx = i * H_shape.dim_size(1) + j; 119 | std::printf ("H(%d,%d)=%4.4f, %4.4f\n", i, j, H.flat()(idx), H_out_data[idx]); 120 | } 121 | } 122 | */ 123 | // Allocate temp tensors 124 | Tensor *T_array = _T_array.AccessTensor(context); 125 | Tensor *Tau_array = _Tau_array.AccessTensor(context); 126 | Tensor *Twork_array = _Twork_array.AccessTensor(context); 127 | Tensor *V_array = _V_array.AccessTensor(context); 128 | Tensor dwork; 129 | Tensor dworkvt; 130 | int ldwork = n_h, ldworkvt = std::max(n_h,batch); 131 | if (!_persistent_tensor_created) { 132 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_T, nullptr)); 133 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r}), &_tau, nullptr)); 134 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_twork, nullptr)); 135 | _persistent_tensor_created = true; 136 | } 137 | Tensor *T = _T.AccessTensor(context); 138 | Tensor *tau = _tau.AccessTensor(context); 139 | Tensor *twork = _twork.AccessTensor(context); 140 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dwork)); 141 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dworkvt)); 142 | workspace ws; 143 | ws.T_array = reinterpret_cast(T_array->flat().data()); 144 | ws.Tau_array = reinterpret_cast(Tau_array->flat().data()); 145 | ws.Twork_array = reinterpret_cast(Twork_array->flat().data()); 146 | ws.V_array = reinterpret_cast(V_array->flat().data()); 147 | ws.T = T->flat().data(); 148 | ws.tau = tau->flat().data(); 149 | ws.twork = twork->flat().data(); 150 | ws.dwork = dwork.flat().data(); 151 | ws.dworkvt = dworkvt.flat().data(); 152 | #if GOOGLE_CUDA 153 | int op_status; 154 | if (!_queue_created) { 155 | _queue_created = true; 156 | magma_queue_create(0, &_queue); 157 | // printf("created magma queue at %p!\n", reinterpret_cast(_queue)); 158 | } 159 | op_status = SvdBlockProdGpuKernelLauncher(H_data, U_data, H_out_data, n_h, batch, n_r, _queue, ws, _isForward); 160 | #endif 161 | } 162 | }; 163 | 164 | REGISTER_KERNEL_BUILDER(Name("SvdBlockProdGpu").Device(DEVICE_GPU), SvdBlockProdGpuOp); 165 | -------------------------------------------------------------------------------- /code/magma_svd_ops/gpu_unit_test.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 
| import numpy as np 3 | from tensorflow.python.framework import ops 4 | from tensorflow.python.ops import array_ops 5 | from tensorflow.python.ops import sparse_ops 6 | np.set_printoptions(threshold=np.nan) 7 | n_h = 128; n_b = 512; n_r = 16 8 | print_res = (n_h * n_r < 100) 9 | 10 | 11 | def Hgrad(H, u, G, k): 12 | # H.shape = (batch, n_h) 13 | # u.shape = (n_h,) 14 | # G.shape = (batch, n_h) 15 | alpha = 2* np.dot(H[:, -k:], u[-k:]) / np.dot(u[-k:],u[-k:]) # alpha.shape = (batch,) 16 | beta = 2* np.dot(G[:, -k:], u[-k:]) / np.dot(u[-k:],u[-k:]) # beta.shape = (batch,) 17 | u_bar = np.zeros_like(u) 18 | u_bar[-k:] += -np.dot(alpha,G[:,-k:]) - np.dot(beta,H[:,-k:]) + np.dot(alpha,beta)*u[-k:] # sum of gradient within the batch: averaging needed??? 19 | G_out = G.copy() 20 | G_out[:,-k:] -= np.outer(beta,u[-k:]) 21 | return G_out, u_bar # G_out.shape = (batch, n_h); u_bar.shape = (n_h,) 22 | 23 | rng = np.random.RandomState(13) 24 | H_ = rng.uniform(-np.sqrt(6. / (n_b + n_h)), np.sqrt(6. / (n_b + n_h)), (n_b, n_h)).astype(np.float32) 25 | 26 | U_full = rng.normal(0, 0.01, (n_h, n_r)).astype(np.float32) 27 | U_ = np.tril(U_full) 28 | norms_U_ = np.linalg.norm(U_, axis=0) 29 | #U_ = np.transpose(1. / norms_U_ * U_) 30 | U_ = np.transpose(U_) 31 | 32 | T = np.triu( np.dot(U_, U_.T)) 33 | for ii in range(n_r): 34 | T[ii,ii] /=2 35 | 36 | T_inverse = np.linalg.inv(T) 37 | 38 | #for ii in range(n_r): 39 | # for jj in range(n_r): 40 | # print ' %6.3f'%(T_inverse[ii,jj]), 41 | # print '' 42 | 43 | 44 | 45 | if print_res: 46 | print "T: ", T 47 | print "T_inverse: ", np.linalg.inv(T) 48 | 49 | print "H: ",H_ 50 | print "U: ",U_ 51 | 52 | 53 | ############################################################ 54 | #Forward 55 | ############################################################ 56 | H1 = [H_]*(n_r+1) 57 | 58 | for i in range(0,n_r): 59 | alpha = np.dot(H1[i], U_[i]) 60 | #print 'alpha: ', 2*alpha 61 | H1[i+1] = H1[i] - 2 * np.outer(alpha, U_[i]) / np.dot(U_[i],U_[i]) 62 | 63 | H2 = H1[-1] 64 | 65 | 66 | G = np.ones_like(H_) 67 | Grad_U = np.ones_like(U_) 68 | 69 | for i in range(n_r-1, -1, -1): 70 | G, Grad_U[i] = Hgrad(H1[i], U_[i], G, n_h-i) 71 | 72 | if print_res: 73 | print "Hprod: ",H2 74 | print "Grad_H: ",G 75 | print "Grad_U: ",Grad_U 76 | 77 | ################ BLAS3 VER ######################## 78 | #Blas_G = np.ones_like(H_.T) 79 | #Blas_U = U_.T 80 | #Blas_H = H_.T 81 | #Grad_Q = np.dot( Blas_G , Blas_H.T) 82 | #print "Grad_Q: ", Grad_Q 83 | #print "U * T: ", np.dot(Blas_U, np.linalg.inv(T.T)) 84 | 85 | #R =np.dot( np.dot( Grad_Q.T , Blas_U ), np.linalg.inv(T.T)) 86 | #print "R: ", R 87 | #print "U * T^T: ", np.dot(Blas_U, np.linalg.inv(T.T).T) 88 | #S =np.dot( np.dot( Grad_Q , Blas_U ), np.linalg.inv(T.T).T) 89 | 90 | #M =np.dot(np.dot( np.linalg.inv(T.T) , Blas_U.T) , R) 91 | #print "M: ", M 92 | #i_lower = np.tril_indices(2, -1) 93 | #M[i_lower] = M.T[i_lower] 94 | 95 | #print "P: ", M 96 | 97 | #Hprod_BLAS3 = np.eye(3) - np.dot(np.dot( Blas_U, np.linalg.inv(T.T)), Blas_U.T) 98 | #Hprod_BLAS3 = np.dot( Hprod_BLAS3, Blas_H) 99 | 100 | #Grad_U_BLAS3 = np.dot( Blas_U, M )- S - R 101 | 102 | #print "Hprod_BLAS3: ", Hprod_BLAS3.T 103 | #print "Grad_U_BLAS3: ", Grad_U_BLAS3.T 104 | 105 | 106 | ############################################################ 107 | #Backward 108 | ############################################################ 109 | H1 = [H_]*(n_r+1) 110 | for i in range(n_r-1,-1,-1): 111 | alpha = np.dot(H1[i+1], U_[i]) 112 | H1[i] = H1[i+1] - 2 * np.outer(alpha, U_[i]) / 
np.dot(U_[i],U_[i]) 113 | 114 | 115 | H2_back = H1[0] 116 | 117 | 118 | G_back = np.ones_like(H_) 119 | Grad_U_back = np.ones_like(U_) 120 | 121 | for i in range(0,n_r): 122 | G_back, Grad_U_back[i] = Hgrad(H1[i+1], U_[i], G_back, n_h-i) 123 | 124 | if print_res: 125 | print "H_inv_prod: ",H2_back 126 | print "Grad_inv_H: ",G_back 127 | print "Grad_inv_U: ",Grad_U_back 128 | ############################################################ 129 | ############################################################ 130 | svd_block_prod_module = tf.load_op_library('./svd_block_prod_gpu.so') 131 | ############################################################ 132 | 133 | grad_svd_block_prod_module = tf.load_op_library('./grad_svd_block_prod_gpu.so') 134 | 135 | @ops.RegisterGradient("SvdBlockProdGpu") 136 | def _svd_block_prod_gpu_grad(op, grad): 137 | H = op.inputs[0] 138 | U = op.inputs[1] 139 | isForward = op.get_attr("is_forward") 140 | return grad_svd_block_prod_module.grad_svd_block_prod_gpu(H,U,grad, isForward) 141 | ############################################################ 142 | svd_prod_module = tf.load_op_library('../cuda_svd_ops/svd_prod_gpu.so') 143 | ############################################################ 144 | grad_svd_prod_module = tf.load_op_library('../cuda_svd_ops/grad_svd_prod_gpu.so') 145 | 146 | @ops.RegisterGradient("SvdProdGpu") 147 | def _svd_prod_gpu_grad(op, grad): 148 | H = op.inputs[0] 149 | U = op.inputs[1] 150 | return grad_svd_prod_module.grad_svd_prod_gpu(H,U,grad) 151 | ############################################################ 152 | svd_inv_prod_module = tf.load_op_library('../cuda_svd_ops/svd_inv_prod_gpu.so') 153 | grad_svd_inv_prod_module = tf.load_op_library('../cuda_svd_ops/grad_svd_inv_prod_gpu.so') 154 | 155 | @ops.RegisterGradient("SvdInvProdGpu") 156 | def _svd_inv_prod_gpu_grad(op, grad): 157 | H = op.inputs[0] 158 | U = op.inputs[1] 159 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 160 | ############################################################ 161 | with tf.Session() as sess: 162 | 163 | H = tf.constant(H_, dtype=tf.float32) 164 | U = tf.constant(U_, dtype = tf.float32) 165 | V = tf.constant(U_, dtype = tf.float32) 166 | 167 | U = tf.matrix_band_part(U, 0, -1) # upper triangular 168 | V = tf.matrix_band_part(V, 0, -1) # upper triangular 169 | 170 | z = svd_block_prod_module.svd_block_prod_gpu(H,U, True) 171 | blas2_z = svd_prod_module.svd_prod_gpu(H,U) 172 | 173 | z2 = svd_block_prod_module.svd_block_prod_gpu(H,V, False) 174 | blas2_z2 = svd_inv_prod_module.svd_inv_prod_gpu(H,V) 175 | 176 | gr = tf.gradients(z, [H,U]) 177 | blas2_gr = tf.gradients(blas2_z, [H,U]) 178 | 179 | gr2 = tf.gradients(z2, [H,V]) 180 | blas2_gr2 = tf.gradients(blas2_z2, [H,V]) 181 | 182 | tf.global_variables_initializer().run() 183 | 184 | if print_res: 185 | print('H,U and product: ',H.eval(), U.eval(),z.eval()) 186 | print('BLAS2 H,U and product: ',H.eval(), U.eval(),blas2_z.eval()) 187 | print('grad_H, grad_U: ' ,gr[0].eval(), gr[1].eval()) 188 | 189 | print('H,U and product: ',z.eval(), V.eval(),z2.eval()) 190 | print('grad_H, grad_U: ' ,gr2[0].eval(), gr2[1].eval()) 191 | 192 | 193 | print "Forward Hprod error:", np.amax( abs(H2 - z.eval())) 194 | print "Forward Hgrad G error:", np.amax( abs(G - gr[0].eval())) 195 | print "Forward Hgrad U error:", np.amax( abs(Grad_U - gr[1].eval())) 196 | print "BLAS2 Forward Hprod error:", np.amax( abs(H2 - blas2_z.eval())) 197 | print "BLAS2 Forward Hgrad G error:", np.amax( abs(G - blas2_gr[0].eval())) 198 | print 
"BLAS2 Forward Hgrad U error:", np.amax( abs(Grad_U - blas2_gr[1].eval())) 199 | print "Backward Hprod error:", np.amax( abs(H2_back - z2.eval())) 200 | print "Backward Hgrad G error:", np.amax( abs(G_back - gr2[0].eval())) 201 | print "Backward Hgrad U error:", np.amax( abs(Grad_U_back - gr2[1].eval())) 202 | print "BLAS2 Backward Hprod error:", np.amax( abs(H2_back - blas2_z2.eval())) 203 | print "BLAS2 Backward Hgrad G error:", np.amax( abs(G_back - blas2_gr2[0].eval())) 204 | print "BLAS2 Backward Hgrad U error:", np.amax( abs(Grad_U_back - blas2_gr2[1].eval())) 205 | -------------------------------------------------------------------------------- /code/magma_svd_ops/grad_svd_block_prod_gpu.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "magma_v2.h" 4 | #include "tensorflow/core/framework/op.h" 5 | #include "tensorflow/core/framework/shape_inference.h" 6 | #include "tensorflow/core/framework/op_kernel.h" 7 | #include "tensorflow/core/framework/tensor_shape.h" 8 | #include "tensorflow/core/platform/default/logging.h" 9 | 10 | using namespace tensorflow; 11 | 12 | REGISTER_OP("GradSvdBlockProdGpu") 13 | .Input("hidden_state: float") 14 | .Input("householder_matrix: float") 15 | .Input("gradient_backprop: float") 16 | .Attr("is_forward: bool = true") 17 | .Output("grad_hidden_state: float") 18 | .Output("grad_householder_matrix: float") 19 | .SetShapeFn([](::tensorflow::shape_inference::InferenceContext *c) { 20 | c->set_output(0, c->input(0)); 21 | c->set_output(1, c->input(1)); 22 | return Status::OK(); 23 | }); 24 | 25 | #include "tensorflow/core/framework/op_kernel.h" 26 | 27 | struct grad_workspace { 28 | magmaFloat_ptr *T_array; 29 | magmaFloat_ptr *Tau_array; 30 | magmaFloat_ptr *Twork_array; 31 | magmaFloat_ptr *V_array; 32 | magmaFloat_ptr T; 33 | magmaFloat_ptr tau; 34 | magmaFloat_ptr twork; 35 | magmaFloat_ptr dwork; 36 | magmaFloat_ptr dworkvt; 37 | magmaFloat_ptr Q_grad; 38 | magmaFloat_ptr UT; 39 | }; 40 | 41 | int GradSvdBlockProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* G_grad, const int n_h, const int batch, const int n_r, magma_queue_t queue, grad_workspace ws, const bool isForward); 42 | 43 | class GradSvdBlockProdGpuOp : public OpKernel { 44 | private: 45 | bool _isForward; 46 | magma_queue_t _queue; 47 | bool _queue_created; 48 | bool _persistent_tensor_created; 49 | PersistentTensor _T_array; 50 | PersistentTensor _Tau_array; 51 | PersistentTensor _Twork_array; 52 | PersistentTensor _V_array; 53 | PersistentTensor _T; 54 | PersistentTensor _tau; 55 | PersistentTensor _twork; 56 | PersistentTensor _Q_grad; 57 | PersistentTensor _UT; 58 | int _batchCount; 59 | public: 60 | explicit GradSvdBlockProdGpuOp(OpKernelConstruction* context) : OpKernel(context) { 61 | // Get the index of the value to preserve 62 | OP_REQUIRES_OK(context, 63 | context->GetAttr("is_forward", &_isForward)); 64 | // printf("Calling magma init in grad...\n"); 65 | magma_int_t stat = magma_init(); 66 | if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); } 67 | _queue_created = false; 68 | _persistent_tensor_created = false; 69 | // create array space 70 | _batchCount = 1; 71 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_T_array, nullptr)); 72 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Tau_array, nullptr)); 73 | OP_REQUIRES_OK(context, 
context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_Twork_array, nullptr));
74 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_INT64, TensorShape({_batchCount}), &_V_array, nullptr));
75 | }
76 |
77 | ~GradSvdBlockProdGpuOp() override {
78 | // printf("Calling magma finalize in grad...\n");
79 | if (_queue_created) { // only destroy a queue that was actually created in Compute()
80 | // printf("destroying magma queue!\n");
81 | magma_queue_destroy(_queue);
82 | _queue_created = false;
83 | }
84 | magma_finalize();
85 | }
86 |
87 | void Compute(OpKernelContext* context) override {
88 | // Check number of inputs
89 | OP_REQUIRES(context, context->num_inputs() == 3,
90 | errors::InvalidArgument("GradSvdBlockProd expects 3 inputs."));
91 |
92 | // Grab the input tensor
93 | const Tensor& H = context->input(0);
94 | const Tensor& U = context->input(1);
95 | const Tensor& G = context->input(2);
96 | auto input = H.flat();
97 |
98 | // Shapes of input
99 | const TensorShape& H_shape = H.shape();
100 | const TensorShape& U_shape = U.shape();
101 | const TensorShape& G_shape = G.shape();
102 |
103 | const int n_h = H_shape.dim_size(1);
104 | const int n_r = U_shape.dim_size(0);
105 | const int batch = H_shape.dim_size(0);
106 | // Perform dimension check
107 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(H_shape),
108 | errors::InvalidArgument("SvdBlockProd expects H to be a 2-D matrix."));
109 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(U_shape),
110 | errors::InvalidArgument("SvdBlockProd expects U to be a 2-D matrix."));
111 | OP_REQUIRES(context, TensorShapeUtils::IsMatrix(G_shape),
112 | errors::InvalidArgument("SvdBlockProd expects G to be a 2-D matrix."));
113 | OP_REQUIRES(context, H_shape.dim_size(1) == U_shape.dim_size(1),
114 | errors::InvalidArgument("The second dimension of H and U does not match!"));
115 | OP_REQUIRES(context, G_shape.dim_size(0) == H_shape.dim_size(0),
116 | errors::InvalidArgument("The first dimension of G and H does not match!"));
117 | OP_REQUIRES(context, G_shape.dim_size(1) == H_shape.dim_size(1),
118 | errors::InvalidArgument("The second dimension of G and H does not match!"));
119 |
120 | // Create an output tensor
121 | Tensor* Grad_H = NULL;
122 | OP_REQUIRES_OK(context, context->allocate_output(0, H_shape,&Grad_H));
123 | Tensor* Grad_U = NULL;
124 | OP_REQUIRES_OK(context, context->allocate_output(1, U_shape,&Grad_U));
125 |
126 | // obtain data
127 | const float* H_data = H.flat().data();
128 | const float* U_data = U.flat().data();
129 | const float* G_data = G.flat().data();
130 | float* Grad_H_data = Grad_H->flat().data();
131 | float* Grad_U_data = Grad_U->flat().data();
132 | // Allocate temp tensors
133 | Tensor *T_array = _T_array.AccessTensor(context);
134 | Tensor *Tau_array = _Tau_array.AccessTensor(context);
135 | Tensor *Twork_array = _Twork_array.AccessTensor(context);
136 | Tensor *V_array = _V_array.AccessTensor(context);
137 | Tensor dwork;
138 | Tensor dworkvt;
139 | int ldwork = n_h, ldworkvt = std::max(n_h,batch);
140 | if (!_persistent_tensor_created) {
141 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_T, nullptr));
142 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r}), &_tau, nullptr));
143 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_r*n_r}), &_twork, nullptr));
144 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, TensorShape({n_h*n_h}), &_Q_grad, nullptr));
145 | OP_REQUIRES_OK(context, context->allocate_persistent(DT_FLOAT, 
TensorShape({n_h*n_r}), &_UT, nullptr)); 146 | _persistent_tensor_created = true; 147 | } 148 | Tensor *T = _T.AccessTensor(context); 149 | Tensor *tau = _tau.AccessTensor(context); 150 | Tensor *twork = _twork.AccessTensor(context); 151 | Tensor *Q_grad = _Q_grad.AccessTensor(context); 152 | Tensor *UT = _UT.AccessTensor(context); 153 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dwork)); 154 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, TensorShape({ldworkvt*n_r}), &dworkvt)); 155 | grad_workspace ws; 156 | ws.T_array = reinterpret_cast(T_array->flat().data()); 157 | ws.Tau_array = reinterpret_cast(Tau_array->flat().data()); 158 | ws.Twork_array = reinterpret_cast(Twork_array->flat().data()); 159 | ws.V_array = reinterpret_cast(V_array->flat().data()); 160 | ws.T = T->flat().data(); 161 | ws.tau = tau->flat().data(); 162 | ws.twork = twork->flat().data(); 163 | ws.dwork = dwork.flat().data(); 164 | ws.dworkvt = dworkvt.flat().data(); 165 | ws.Q_grad = Q_grad->flat().data(); 166 | ws.UT = UT->flat().data(); 167 | #if GOOGLE_CUDA 168 | if (!_queue_created) { 169 | _queue_created = true; 170 | magma_queue_create(0, &_queue); 171 | // printf("created magma queue at %p!\n", reinterpret_cast(_queue)); 172 | } 173 | GradSvdBlockProdGpuKernelLauncher(H_data, U_data, G_data, Grad_H_data, Grad_U_data, n_h, batch, n_r, _queue, ws, _isForward); 174 | #endif 175 | } 176 | }; 177 | 178 | REGISTER_KERNEL_BUILDER(Name("GradSvdBlockProdGpu").Device(DEVICE_GPU), GradSvdBlockProdGpuOp); 179 | -------------------------------------------------------------------------------- /code/spectral_rnn.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.python.util import nest 3 | import numpy as np 4 | import os 5 | #from svd_ops import tf_svdProd, tf_svdProd_inv 6 | from tensorflow.python.framework import ops 7 | from tensorflow.python.ops import array_ops 8 | from tensorflow.python.ops import sparse_ops 9 | 10 | ############################################################ 11 | ############ BLAS3 version of SVD ops ###################### 12 | ############################################################ 13 | svd_block_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/magma_svd_ops/svd_block_prod_gpu.so') 14 | ############################################################ 15 | grad_svd_block_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/magma_svd_ops/grad_svd_block_prod_gpu.so') 16 | 17 | @ops.RegisterGradient("SvdBlockProdGpu") 18 | def _svd_block_prod_gpu_grad(op, grad): 19 | H = op.inputs[0] 20 | U = op.inputs[1] 21 | isForward = op.get_attr("is_forward") 22 | return grad_svd_block_prod_module.grad_svd_block_prod_gpu(H,U,grad, isForward) 23 | ############################################################ 24 | ############ BLAS2 version of SVD ops ###################### 25 | ############################################################ 26 | svd_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/svd_prod_gpu.so') 27 | grad_svd_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/grad_svd_prod_gpu.so') 28 | @ops.RegisterGradient("SvdProdGpu") 29 | def _svd_prod_gpu_grad(op, grad): 30 | H = op.inputs[0] 31 | U = op.inputs[1] 32 | return grad_svd_prod_module.grad_svd_prod_gpu(H,U,grad) 33 | 34 | 
############################################################ 35 | svd_inv_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/svd_inv_prod_gpu.so') 36 | grad_svd_inv_prod_module = tf.load_op_library(os.path.dirname(os.path.abspath(__file__)) + '/cuda_svd_ops/grad_svd_inv_prod_gpu.so') 37 | @ops.RegisterGradient("SvdInvProdGpu") 38 | def _svd_inv_prod_gpu_grad(op, grad): 39 | H = op.inputs[0] 40 | U = op.inputs[1] 41 | return grad_svd_inv_prod_module.grad_svd_inv_prod_gpu(H,U,grad) 42 | 43 | ############################################################ 44 | 45 | class SpectralRNNCell(tf.contrib.rnn.RNNCell): 46 | """Implements a simple distribution based recurrent unit that keeps moving 47 | averages of the mean map embeddings of features of inputs. 48 | """ 49 | """ 50 | n_h: hidden state size 51 | n_o: output size 52 | n_r: reflector size 53 | variables: pass a dictionary of Variables, and we will not create new ones 54 | backend: blas3, blas2 or python 55 | """ 56 | 57 | def __init__(self, n_h, n_r = None, r_margin = 0.01, 58 | linear_out=False, activation=tf.nn.relu, variables=None, backend="blas3"): 59 | self._n_h = n_h 60 | self._n_r = n_r or n_h//4 61 | self._r_margin = r_margin 62 | 63 | self._linear_out = linear_out 64 | self._activation = activation 65 | self._variables = variables 66 | self._backend = backend 67 | 68 | @property 69 | def state_size(self): 70 | return self._n_h 71 | 72 | @property 73 | def reflector_size(self): 74 | return self._n_r 75 | 76 | @property 77 | def output_size(self): 78 | return self._n_h 79 | 80 | def __call__(self, inputs, state, scope=None): 81 | """ 82 | recur*: r 83 | state*: mu 84 | stats*: phi 85 | _mavg_alphas: alpha vector 86 | """ 87 | with tf.variable_scope(scope or type(self).__name__): 88 | # Compute the output. 89 | """ 90 | o_t = W^o mu_t + b^o 91 | """ 92 | output = _svdlinear([inputs, state], self._n_h, self._n_r, True, r=self._r_margin, scope='output', variables=self._variables, backend=self._backend) 93 | #output = _linear([inputs, state], self._n_h, True, scope='output') 94 | 95 | 96 | if not self._linear_out: 97 | output = self._activation(output, name='output_act') 98 | """ 99 | o_t and mu_t 100 | """ 101 | return (output, output) 102 | 103 | 104 | # No longer publicly expose function in tensorflow. 105 | def _svdlinear(args, output_size, reflector_size, bias, bias_start=0.0, sig_mean = 1.0, r = 0.01, scope=None, variables=None, backend="blas3"): 106 | """Linear map with svd operator 107 | 108 | Args: 109 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 110 | output_size: int, second dimension of W[i]. 111 | bias: boolean, whether to add a bias term or not. 112 | bias_start: starting value to initialize the bias; 0 by default. 113 | sig_mean: initial and "mean" value of singular values, usually set to 1.0, 114 | for ResNet should be set to 0.0 115 | r: singular margin, the allowed margin for singular values 116 | scope: VariableScope for the created subgraph; defaults to "Linear". 
117 | variables: pass a dictionary of Variables, and we will not create new ones 118 | backend: blas3, blas2 or python 119 | 120 | Returns: 121 | A 2D Tensor with shape [batch x output_size] 122 | 123 | Raises: 124 | ValueError: if some of the arguments has unspecified or wrong shape or unknown backend is passed 125 | """ 126 | if args is None or (nest.is_sequence(args) and not args): 127 | raise ValueError("`args` must be specified") 128 | if not nest.is_sequence(args): 129 | args = [args] 130 | 131 | dtype = [a.dtype for a in args][0] 132 | # computation for svd:Hprod 133 | with tf.variable_scope(scope or "svdHprod"): 134 | if variables: 135 | U_full = variables["Householder_U_full"] 136 | else: 137 | U_full = tf.get_variable( 138 | "Householder_U_full", [reflector_size, output_size], dtype=dtype) 139 | U = tf.matrix_band_part(U_full, 0, -1) # upper triangular 140 | if variables: 141 | p = variables["p"] 142 | else: 143 | p = tf.get_variable( 144 | "p", [ output_size], dtype=dtype, 145 | initializer=tf.constant_initializer(np.zeros(output_size))) 146 | Sig = 2*r*(tf.sigmoid(p) - 0.5) + sig_mean 147 | if variables: 148 | V_full = variables["Householder_V_full"] 149 | else: 150 | V_full = tf.get_variable( 151 | "Householder_V_full", [reflector_size, output_size], dtype=dtype) 152 | V = tf.matrix_band_part(V_full, 0, -1) # upper triangular 153 | 154 | 155 | if backend == "python": 156 | svd_term = tf_svdProd( args[1], V) # python operator 157 | svd_term = tf.multiply(svd_term, Sig) 158 | svd_term = tf_svdProd_inv( svd_term, U) # python operator 159 | elif backend == "blas2": 160 | svd_term = svd_prod_module.svd_prod_gpu( args[1], V) # BLAS2 operator 161 | svd_term = tf.multiply(svd_term, Sig) 162 | svd_term = svd_inv_prod_module.svd_inv_prod_gpu( svd_term, U) # BLAS2 operator 163 | elif backend == "blas3": 164 | svd_term = svd_block_prod_module.svd_block_prod_gpu( args[1], V, True) # BLAS3 operator 165 | svd_term = tf.multiply(svd_term, Sig) 166 | svd_term = svd_block_prod_module.svd_block_prod_gpu( svd_term, U, False) # BLAS3 operator 167 | else: 168 | raise ValueError("Unknown backend " + backend) 169 | 170 | 171 | 172 | # Now the computation for the rest 173 | with tf.variable_scope(scope or "svdLinear"): 174 | if variables: 175 | matrix = variables["Matrix"] 176 | else: 177 | matrix = tf.get_variable( 178 | "Matrix", [args[0].shape[1].value, output_size], dtype=dtype) 179 | res = tf.matmul(args[0], matrix) 180 | if not bias: 181 | return res + svd_term 182 | if variables: 183 | bias_term = variables["Bias"] 184 | else: 185 | bias_term = tf.get_variable( 186 | "Bias", [output_size], 187 | dtype=dtype, 188 | initializer=tf.constant_initializer(bias_start, dtype=dtype) 189 | ) 190 | return res + bias_term + svd_term 191 | 192 | def _linear(args, output_size, bias, bias_start=0.0, scope=None): 193 | """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable. 194 | 195 | Args: 196 | args: a 2D Tensor or a list of 2D, batch x n, Tensors. 197 | output_size: int, second dimension of W[i]. 198 | bias: boolean, whether to add a bias term or not. 199 | bias_start: starting value to initialize the bias; 0 by default. 200 | scope: VariableScope for the created subgraph; defaults to "Linear". 201 | 202 | Returns: 203 | A 2D Tensor with shape [batch x output_size] equal to 204 | sum_i(args[i] * W[i]), where W[i]s are newly created matrices. 205 | 206 | Raises: 207 | ValueError: if some of the arguments has unspecified or wrong shape. 
208 | """ 209 | if args is None or (nest.is_sequence(args) and not args): 210 | raise ValueError("`args` must be specified") 211 | if not nest.is_sequence(args): 212 | args = [args] 213 | 214 | # Calculate the total size of arguments on dimension 1. 215 | total_arg_size = 0 216 | shapes = [a.get_shape().as_list() for a in args] 217 | for shape in shapes: 218 | if len(shape) != 2: 219 | raise ValueError( 220 | "Linear is expecting 2D arguments: %s" % 221 | str(shapes)) 222 | if not shape[1]: 223 | raise ValueError( 224 | "Linear expects shape[1] of arguments: %s" % 225 | str(shapes)) 226 | else: 227 | total_arg_size += shape[1] 228 | 229 | dtype = [a.dtype for a in args][0] 230 | 231 | # Now the computation. 232 | with tf.variable_scope(scope or "Linear"): 233 | matrix = tf.get_variable( 234 | "Matrix", [total_arg_size, output_size], dtype=dtype) 235 | if len(args) == 1: 236 | res = tf.matmul(args[0], matrix) 237 | else: 238 | res = tf.matmul(tf.concat(args, 1), matrix) 239 | if not bias: 240 | return res 241 | bias_term = tf.get_variable( 242 | "Bias", [output_size], 243 | dtype=dtype, 244 | initializer=tf.constant_initializer(bias_start, dtype=dtype) 245 | ) 246 | return res + bias_term 247 | -------------------------------------------------------------------------------- /code/rnn.py: -------------------------------------------------------------------------------- 1 | import math, time 2 | import tensorflow as tf 3 | import numpy as np 4 | import spectral_rnn 5 | import Params 6 | import sys,os 7 | from tensorflow.python.framework import ops 8 | from tensorflow.python.ops import array_ops 9 | from tensorflow.python.ops import sparse_ops 10 | 11 | 12 | class RNNModel (object): 13 | def __init__(self, params): 14 | self.rnn_cell = None 15 | # feature 16 | self.x = tf.placeholder("float", [None, params.time_steps, params.input_size]) 17 | # label 18 | self.y = tf.placeholder("float", [None, params.output_size]) 19 | # train_flag placeholder 20 | self.train_flag = tf.placeholder(tf.bool, [], name="train_flag") 21 | # learning rate placeholder 22 | self.learning_rate = tf.placeholder(tf.float32, [], name='learning_rate') 23 | 24 | self.init_epoch = 0 25 | print 'Var names: ', self.x.name, self.y.name, self.train_flag.name, self.learning_rate.name 26 | 27 | sys.stdout.flush() 28 | # set random seed before build the graph 29 | tf.set_random_seed(params.random_seed) 30 | 31 | # build graph 32 | logits = self.build(params) 33 | 34 | # prediction 35 | # Define loss and optimizer 36 | # evaluation 37 | if params.regression_flag: 38 | self.pred = logits 39 | self.loss_op = tf.reduce_mean(tf.pow(self.pred-self.y, 2)) 40 | self.accuracy = self.loss_op 41 | else: 42 | self.pred = tf.nn.softmax(logits) 43 | self.loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits( 44 | logits=logits, labels=self.y)) 45 | correct_pred = tf.equal(tf.argmax(self.pred, 1), tf.argmax(self.y, 1)) 46 | self.accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 47 | 48 | config = tf.ConfigProto(device_count={'GPU' : int(params.gpu_flag)}) 49 | config.gpu_options.allow_growth = True 50 | config.gpu_options.per_process_gpu_memory_fraction = 0.9 51 | # running session 52 | self.session = tf.Session(config=config) 53 | self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate) 54 | self.train_op = self.optimizer.minimize(self.loss_op) 55 | 56 | def __enter__(self): 57 | return self 58 | def __exit__(self, exc_type, exc_value, traceback): 59 | self.close() 60 | 61 | """ 62 | call this function to 
destroy globally defined variables in tensorflow 63 | """ 64 | def close(self): 65 | self.session.close() 66 | tf.reset_default_graph() 67 | 68 | 69 | def set_cell(self, params): 70 | if params.cell == "LSTM": 71 | self.rnn_cell = tf.contrib.rnn.BasicLSTMCell( 72 | num_units=params.num_units 73 | ) 74 | elif params.cell == "RNN": 75 | self.rnn_cell = tf.contrib.rnn.BasicRNNCell( 76 | num_units=params.num_units 77 | ) 78 | elif params.cell == "SpectralRNN": 79 | self.rnn_cell = spectral_rnn.SpectralRNNCell( 80 | n_h=params.num_units, 81 | n_r=params.r_size, 82 | r_margin = params.r_margin 83 | ) 84 | else: 85 | assert 0, "unsupported cell %s" % (params.cell) 86 | 87 | def build(self, params): 88 | 89 | self.set_cell(params) 90 | # last linear layer 91 | last_w = tf.get_variable("last_w", initializer=tf.truncated_normal([self.rnn_cell.output_size, params.output_size], stddev=0.1)) 92 | last_b = tf.get_variable("last_b", initializer=tf.truncated_normal([params.output_size], stddev=0.1)) 93 | 94 | # Unstack to get a list of 'time_steps' tensors of shape (batch_size, n_input) 95 | # assume time_steps is on axis 1 96 | x = tf.unstack(self.x, params.time_steps, 1) 97 | # get RNN cell output 98 | output, states = tf.contrib.rnn.static_rnn(self.rnn_cell, x, dtype=np.float32) 99 | # Apply Dropout 100 | output = tf.cond(self.train_flag, lambda: tf.nn.dropout(output, params.dropout_keep_rate), lambda: tf.identity(output)) 101 | # linear activation, using rnn inner loop last output 102 | logits = tf.matmul(output[-1], last_w) + last_b 103 | print "output[-1].shape = ", output[-1].get_shape() 104 | print "last_w.shape = ", last_w.get_shape() 105 | 106 | self.vars = tf.trainable_variables() 107 | self.normalize_vars = [v for v in tf.trainable_variables() if 'Householder' in v.name] 108 | 109 | self.validate_batch_size = params.batch_size * 4 110 | print "trainable_variables = ", [v.name for v in self.vars] 111 | print "normalize_variables = ", [v.name for v in self.normalize_vars] 112 | 113 | sys.stdout.flush() 114 | return logits 115 | 116 | """ 117 | @brief model training 118 | @param params parameters 119 | """ 120 | def train(self, params, train_x, train_y, test_x, test_y): 121 | if params.regression_flag: 122 | metric = "RMS" 123 | else: 124 | metric = "accuracy" 125 | #optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) 126 | 127 | normalize_op = [tf.assign(v,tf.nn.l2_normalize(tf.matrix_band_part(v,0,-1),1)) for v in self.normalize_vars] 128 | # Initialize the variables (i.e. assign their default value) 129 | init = tf.global_variables_initializer() 130 | 131 | # Start training 132 | if not params.load_model: 133 | self.session.run(init) 134 | else: 135 | uninitialized_vars = self.get_un_init_vars() 136 | if len(uninitialized_vars) > 0: 137 | print "Sth not right, these vars are not loaded: ", [x.name for x in uninitialized_vars] 138 | # only initialize if not train 139 | if not params.train_flag: 140 | print("model not trained") 141 | return None, None 142 | 143 | print "start trainging! 
" 144 | sys.stdout.flush() 145 | train_error = [] 146 | test_error = [] 147 | iterations = 0 148 | time_used = 0 149 | num_batches = math.ceil(len(train_x)/float(params.batch_size)) 150 | for epoch in range(self.init_epoch, params.num_epochs): 151 | # reduce learning rate by epoch 152 | learning_rate = params.initial_learning_rate*math.pow(params.lr_decay, int(epoch)) 153 | if epoch == self.init_epoch: 154 | train_error.append(self.validate(train_x, train_y, batch_size=self.validate_batch_size)) 155 | test_error.append(self.validate(test_x, test_y, batch_size=self.validate_batch_size)) 156 | print("Epoch %d, iterations = %d, time = %.6f, training %s = %.6f, testing %s = %.6f" % (self.init_epoch-1, iterations, time_used, metric, train_error[-1], metric, test_error[-1])) 157 | sys.stdout.flush() 158 | t0 = time.time() 159 | # permuate batches 160 | perm = np.random.permutation(len(train_x)) 161 | 162 | # run on batches 163 | batch_index = 0 164 | for batch_begin in range(0, len(train_x), params.batch_size): 165 | # get batch x and y 166 | batch_x = train_x[perm[batch_begin:min(batch_begin+params.batch_size, len(train_x))]] 167 | batch_y = train_y[perm[batch_begin:min(batch_begin+params.batch_size, len(train_x))]] 168 | feed_dict = {self.x: batch_x, 169 | self.y: batch_y, 170 | self.train_flag: True, 171 | self.learning_rate: learning_rate} 172 | # Run optimization op (backprop) 173 | self.session.run(self.train_op, feed_dict=feed_dict) 174 | if params.cell=='SpectralRNN': 175 | self.session.run(normalize_op) 176 | 177 | batch_index += 1 178 | iterations += 1 179 | 180 | # decay the display intervals for speedup 181 | if batch_index % (num_batches//params.display_epoch_num) == 0: 182 | time_used += time.time() - t0 183 | train_error.append(self.validate(train_x, train_y, batch_size=self.validate_batch_size)) 184 | test_error.append(self.validate(test_x, test_y, batch_size=self.validate_batch_size)) 185 | print("Epoch %.6f, iterations = %s, time = %.6f, training %s = %.6f, testing %s = %.6f, learning rate = %f" % 186 | ( self.init_epoch+float(iterations)/num_batches, '{:05}'.format(iterations), time_used, metric, train_error[-1], metric, test_error[-1], learning_rate)) 187 | sys.stdout.flush() 188 | t0 = time.time() 189 | # save model 190 | if params.model_dir and iterations%(5*num_batches)==0: 191 | if os.path.isdir(os.path.dirname(params.model_dir+'/'+params.dataset)) == False: 192 | os.makedirs(params.model_dir+'/'+params.dataset) 193 | print 'making dir: '+params.model_dir+'/'+params.dataset 194 | if params.cell=='SpectralRNN': 195 | self.save("%s/%s/%s.%s.%s.%s.%s" % (params.model_dir,params.dataset,params.cell,params.r_size,params.num_units,"init"+str(params.initial_learning_rate),"epoch"+str(epoch) )) 196 | else: 197 | self.save("%s/%s/%s.%s.%s.%s" % (params.model_dir,params.dataset,params.cell,params.num_units,"init"+str(params.initial_learning_rate),"epoch"+str(epoch) )) 198 | 199 | if np.isnan(train_error[-1]) or np.isinf(train_error[-1]) or np.isnan(test_error[-1]) or np.isinf(test_error[-1]): 200 | print("found nan or inf, stop training") 201 | break 202 | 203 | print("Optimization Finished!") 204 | 205 | return train_error, test_error 206 | 207 | """ 208 | @brief prediction 209 | @param params parameters 210 | """ 211 | def predict(self, x, batch_size=128): 212 | # Launch the graph 213 | pred = np.zeros((len(x), self.pred.get_shape().as_list()[1])) 214 | # run on batches 215 | for batch_begin in range(0, len(x), batch_size): 216 | # get batch x and y 217 | batch_x = 
x[batch_begin:min(batch_begin+batch_size, len(x))] 218 | # Run optimization op (backprop) 219 | pred[batch_begin:min(batch_begin+batch_size, len(x))] = self.session.run(self.pred, feed_dict={self.x: batch_x, 220 | self.train_flag: False}) 221 | return pred 222 | 223 | """ 224 | @brief validate prediction 225 | @params x feature 226 | @params y label 227 | @param batch_size batch size 228 | @return accuracy 229 | """ 230 | def validate(self, x, y, batch_size=128): 231 | # error 232 | cost = self.accuracy 233 | # relative error 234 | validate_cost = 0.0 235 | for batch_begin in range(0, len(x), batch_size): 236 | # get batch x and y 237 | batch_x = x[batch_begin:min(batch_begin+batch_size, len(x))] 238 | batch_y = y[batch_begin:min(batch_begin+batch_size, len(x))] 239 | feed_dict = {self.x: batch_x, 240 | self.y: batch_y, 241 | self.train_flag: False} 242 | # Calculate batch loss and accuracy 243 | validate_cost += self.session.run(cost, feed_dict=feed_dict)*len(batch_y) 244 | return validate_cost/len(x) 245 | 246 | """ 247 | @brief save model 248 | @param filename file name 249 | """ 250 | def save(self, filename): 251 | print "save model ", filename 252 | 253 | saver = tf.train.Saver() 254 | saver.save(self.session, filename) 255 | 256 | """ 257 | @brief load model 258 | @param filename model file name 259 | """ 260 | def load(self, filename): 261 | print "load model ", filename 262 | 263 | saver = tf.train.Saver() 264 | saver.restore(self.session, filename) 265 | 266 | graph = tf.get_default_graph() 267 | self.x = graph.get_tensor_by_name("Placeholder:0") 268 | self.y = graph.get_tensor_by_name("Placeholder_1:0") 269 | self.trainFlag = graph.get_tensor_by_name("train_flag:0") 270 | self.learningRate = graph.get_tensor_by_name("learning_rate:0") 271 | 272 | self.init_epoch = int(filename.split('epoch')[1])+1 273 | 274 | def get_un_init_vars(self): 275 | uninitialized_vars = [] 276 | for var in tf.all_variables(): 277 | try: 278 | self.session.run(var) 279 | except tf.errors.FailedPreconditionError: 280 | uninitialized_vars.append(var) 281 | return uninitialized_vars 282 | -------------------------------------------------------------------------------- /code/magma_svd_ops/svd_block_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #define EIGEN_USE_GPU 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "magma_v2.h" 7 | #include "magma_internal.h" 8 | #include "batched_kernel_param.h" 9 | #define THREAD_SIZE 512 10 | #define max_shared_bsiz 32 11 | 12 | #define RFT_MAG_GEM 13 | #define use_gemm_larft 14 | 15 | extern __shared__ float shared_data[]; 16 | 17 | __global__ void ZeroTriu(float* U, const int n_h, const int n_r) { 18 | int col = blockIdx.x; 19 | for(int row = threadIdx.x; row < col; row += blockDim.x){ 20 | U[col*n_h + row] = 0; 21 | } 22 | } 23 | 24 | __global__ void UpperTri(float* T, const int n_r, const int N) { 25 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 26 | if(idx < N and idx%n_r > idx/n_r ){ 27 | T[idx] = 0; 28 | } 29 | } 30 | 31 | __global__ void ConstSet(float* tau, const float a, const int N) { 32 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 33 | if(idx < N) tau[idx] = a; 34 | } 35 | 36 | __global__ void ConstDevide(float* tau, const float a, const int N) { 37 | int idx = blockIdx.x; 38 | if(idx < N) tau[idx] = a / tau[idx]; 39 | } 40 | 41 | __global__ void CalculateTau(float *tau, float* V, const int n_r, const int n_h, float init) { 42 | //int global_idx = blockIdx.x * blockDim.x + 
threadIdx.x; 43 | int col = blockIdx.x; 44 | __shared__ float sdata[THREAD_SIZE]; 45 | assert(blockDim.x == THREAD_SIZE); 46 | //=========================== 47 | // init tau to be zero 48 | // ========================== 49 | //if(threadIdx.x==0){tau[col] = init;} 50 | //=========================== 51 | // reduce col square 52 | //=========================== 53 | // compute local col square 54 | float temp = 0.0; 55 | for(int row=threadIdx.x; row < n_h; row += blockDim.x){ 56 | temp += V[ col*n_h + row] * V[col*n_h + row]; 57 | } 58 | sdata[threadIdx.x] = temp; 59 | __syncthreads(); 60 | // reduction within block (across all threads) 61 | int i = blockDim.x/2; 62 | while (i != 0){ 63 | if (threadIdx.x < i) 64 | sdata[threadIdx.x] += sdata[threadIdx.x + i]; 65 | __syncthreads(); 66 | i /= 2; 67 | } 68 | //========================= 69 | // compute tau 70 | // ======================= 71 | if(threadIdx.x == 0) 72 | tau[col] = 2.0 / sdata[0]; 73 | } 74 | 75 | 76 | __global__ void SetAddress(float** array, float* one_matrix) { 77 | int idx = blockIdx.x; 78 | array[idx] = one_matrix; 79 | } 80 | 81 | void printDeviceMatrix(const float* A, int col, int row, magma_queue_t queue){ 82 | float* hA; 83 | magma_smalloc_cpu(&hA, col*row); 84 | 85 | magma_sgetmatrix( row, col, A, row, hA, row, queue); // copy d_a -> r 86 | for(int i = 0; icuda_stream() >>> 160 | (m, n, tau_array, Trec_array, ldtrec, Ttri_array, ldttri); 161 | } 162 | 163 | /******************************************************************************/ 164 | extern "C" magma_int_t 165 | my_magma_slarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, 166 | float **v_array, magma_int_t ldv, 167 | float **tau_array, float **T_array, magma_int_t ldt, 168 | float **work_array, magma_int_t lwork, 169 | magma_int_t batchCount, magma_queue_t queue) 170 | { 171 | float c_one = MAGMA_S_ONE; 172 | float c_zero = MAGMA_S_ZERO; 173 | 174 | if ( k <= 0) return 0; 175 | if ( stair_T > 0 && k <= stair_T) return 0; 176 | 177 | magma_int_t maxnb = max_shared_bsiz; 178 | 179 | magma_int_t info = 0; 180 | if (stair_T > 0 && stair_T > maxnb) { 181 | info = -3; 182 | } 183 | else if (lwork < k*ldt) { 184 | info = -10; 185 | } 186 | if (info != 0) { 187 | magma_xerbla( __func__, -(info) ); 188 | return info; 189 | } 190 | 191 | magma_int_t DEBUG=0; 192 | magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T; 193 | 194 | magma_int_t i, j, prev_n, mycol, rows; 195 | 196 | float **dW1_displ = NULL; 197 | float **dW2_displ = NULL; 198 | float **dW3_displ = NULL; 199 | float **dTstep_array = NULL; 200 | 201 | magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); 202 | magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); 203 | magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); 204 | magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array)); 205 | 206 | //float *Tstep = k > nb ? work : T; 207 | if (k > nb) 208 | { 209 | magma_sdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue); 210 | } 211 | else 212 | { 213 | magma_sdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue); 214 | } 215 | 216 | //magma_int_t ldtstep = k > nb ? k : ldt; 217 | magma_int_t ldtstep = ldt; //a enlever 218 | // stair_T = 0 meaning all T 219 | // stair_T > 0 meaning the triangular portion of T has been computed. 
220 | // the value of stair_T is the nb of these triangulars 221 | 222 | 223 | //GEMV compute the whole triangular upper portion of T (phase 1) 224 | // TODO addcublas to check perf 225 | 226 | magma_sgemm_batched( MagmaConjTrans, MagmaNoTrans, 227 | k, k, n, 228 | c_one, v_array, ldv, 229 | v_array, ldv, 230 | c_zero, dTstep_array, ldtstep, 231 | batchCount, queue ); 232 | 233 | magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 234 | // no need for it as T is expected to be lower zero 235 | //if (k > nb) magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 236 | 237 | 238 | //TRMV 239 | //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k] 240 | // TRMV is split over block of column of size nb 241 | // the update should be done from top to bottom so: 242 | // 1- a gemm using the previous computed columns 243 | // of T to update rectangular upper protion above 244 | // the triangle of my columns 245 | // 2- the columns need to be updated by a serial 246 | // loop over of gemv over itself. since we limit the 247 | // shared memory to nb, this nb column 248 | // are split vertically by chunk of nb rows 249 | 250 | dim3 grid(1, 1, batchCount); 251 | 252 | for (j=0; j < k; j += nb) 253 | { 254 | prev_n = j; 255 | mycol = min(nb, k-j); 256 | // note that myrow = prev_n + mycol; 257 | if (prev_n > 0 && mycol > 0) { 258 | if (DEBUG == 3) { 259 | printf("doing gemm on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 260 | (long long) prev_n, (long long) mycol, (long long) 0, (long long) j ); 261 | } 262 | 263 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue); 264 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, 0, j, batchCount, queue); 265 | magma_sgemm_batched( MagmaNoTrans, MagmaNoTrans, 266 | prev_n, mycol, prev_n, 267 | c_one, T_array, ldt, 268 | dW1_displ, ldtstep, 269 | c_zero, dW2_displ, ldt, 270 | batchCount, queue ); 271 | 272 | // update my rectangular portion (prev_n,mycol) using sequence of gemv 273 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 274 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 275 | 276 | for (i=0; i < prev_n; i += nb) 277 | { 278 | rows = min(nb,prev_n-i); 279 | if (DEBUG == 3) { 280 | printf(" doing recstrmv on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 281 | (long long) rows, (long long) mycol, (long long) i, (long long) j ); 282 | } 283 | 284 | if (rows > 0 && mycol > 0) 285 | { 286 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue); 287 | my_magmablas_slarft_recstrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue); 288 | } 289 | } 290 | } 291 | 292 | // the upper rectangular protion is updated, now if needed update the triangular portion 293 | if (stair_T == 0) { 294 | if (DEBUG == 3) { 295 | printf("doing strmv on the triangular portion of size %lld %lld of T(%lld,%lld)\n", 296 | (long long) mycol, (long long) mycol, (long long) j, (long long) j ); 297 | } 298 | 299 | if (mycol > 0) 300 | { 301 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 302 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 303 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, j, j, batchCount, queue); 304 | magmablas_slarft_strmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, 
ldtstep, dW2_displ, ldt, batchCount, queue); 305 | } 306 | } 307 | }// end of j 308 | 309 | magma_free(dW1_displ); 310 | magma_free(dW2_displ); 311 | magma_free(dW3_displ); 312 | magma_free(dTstep_array); 313 | 314 | return 0; 315 | } 316 | 317 | struct workspace { 318 | magmaFloat_ptr *T_array; 319 | magmaFloat_ptr *Tau_array; 320 | magmaFloat_ptr *Twork_array; 321 | magmaFloat_ptr *V_array; 322 | magmaFloat_ptr T; 323 | magmaFloat_ptr tau; 324 | magmaFloat_ptr twork; 325 | magmaFloat_ptr dwork; 326 | magmaFloat_ptr dworkvt; 327 | }; 328 | 329 | int SvdBlockProdGpuKernelLauncher(const float* H, const float* U, float* H_out, const int n_h, const int batch, const int n_r, magma_queue_t queue, workspace ws, const bool isForward) { 330 | /* 331 | * Computes the Hprod(U, H) = Q * H or Q^T * H, according to isForward. 332 | * where Q = House(u_1)*House(u_2)*...*House(u_{n_r}) 333 | * 334 | * H : the hidden states, shape = (n_h, batch) 335 | * U : the elementary reflectors, lower triabgular, shape = (n_h ,n_r) 336 | * n_h : hidden dimension 337 | * batch : batch size 338 | * n_r : number of Householder reflectors 339 | * isForward: if doing transpose on Q 340 | * 341 | * According to MAGMA documents, the diagonal elements of U are supposed to be 1, but does not influence the results ?? 342 | * TODO : avoid the memcpy 343 | */ 344 | 345 | magma_int_t stat; 346 | int batchCount = 1; 347 | cudaDeviceSynchronize(); 348 | 349 | // printf("using magma queue at %p!\n", reinterpret_cast(queue)); 350 | 351 | // stat = magma_init(); 352 | // TODO: avoid creating the queue here 353 | // magma_queue_t queue; 354 | // magma_queue_create(0, &queue); 355 | // if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); return EXIT_FAILURE;} 356 | 357 | 358 | magmaFloat_ptr *T_array, *Tau_array, *Twork_array, *V_array; 359 | #if 0 360 | magma_malloc((void**)&T_array, batchCount * sizeof(*T_array)); 361 | magma_malloc((void**)&Tau_array, batchCount * sizeof(*Tau_array)); 362 | magma_malloc((void**)&Twork_array, batchCount * sizeof(*Twork_array)); 363 | magma_malloc((void**)&V_array, batchCount * sizeof(*V_array)); 364 | #else 365 | T_array = ws.T_array; 366 | Tau_array = ws.Tau_array; 367 | Twork_array = ws.Twork_array; 368 | V_array = ws.V_array; 369 | #endif 370 | #if 0 371 | // construct alpha and fill alpha with 2.0 (assume u_i are of unit norm) 372 | magmaFloat_ptr T, tau; 373 | /* 374 | if( magma_smalloc(&T, n_r*n_r) != MAGMA_SUCCESS){ 375 | printf("Error allocating T!\n"); 376 | return EXIT_FAILURE; 377 | } 378 | */ 379 | T = ws.T; 380 | /* 381 | if( magma_smalloc(&tau, n_r) != MAGMA_SUCCESS){ 382 | printf("Error allocating tau!\n"); 383 | return EXIT_FAILURE; 384 | } 385 | */ 386 | tau = ws.tau; 387 | 388 | // allocate workspace 389 | magmaFloat_ptr twork, dwork, dworkvt; 390 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 391 | /* 392 | if( magma_smalloc(&twork, n_r*n_r) != MAGMA_SUCCESS){ 393 | printf("Error allocating twork!\n"); 394 | return EXIT_FAILURE; 395 | } 396 | */ 397 | twork = ws.twork; 398 | /* 399 | if( magma_smalloc(&dwork, ldwork*n_r) != MAGMA_SUCCESS){ 400 | printf("Error allocating dwork!\n"); 401 | return EXIT_FAILURE; 402 | } 403 | */ 404 | dwork = ws.dwork; 405 | if( magma_smalloc(&dworkvt, ldworkvt*n_r) != MAGMA_SUCCESS){ 406 | printf("Error allocating dworkvt!\n"); 407 | return EXIT_FAILURE; 408 | } 409 | // dworkvt = ws.dworkvt; 410 | 411 | #else 412 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 413 | magmaFloat_ptr T = ws.T; 414 | magmaFloat_ptr tau = ws.tau; 415 | 
magmaFloat_ptr twork = ws.twork; 416 | magmaFloat_ptr dwork = ws.dwork; 417 | magmaFloat_ptr dworkvt = ws.dworkvt; 418 | /* 419 | magmaFloat_ptr dwork; 420 | if( magma_smalloc(&dwork, ldwork*n_r) != MAGMA_SUCCESS){ 421 | printf("Error allocating dwork!\n"); 422 | return EXIT_FAILURE; 423 | } 424 | */ 425 | 426 | #endif 427 | // copy H to H_out 428 | magmablas_slacpy(MagmaFull, n_h, batch, H, n_h, H_out, n_h, queue); 429 | // compute T = inv( striu(U'U) + 0.5 * diag(U'U)) 430 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, dwork, n_h, queue); 431 | 432 | // calculate tau[i] = 2.0/ dot(V[i],V[i]) 433 | CalculateTau<<< n_r, THREAD_SIZE>>>(tau, dwork, n_r, n_h, 0.0); // tau[i] = 2 / (u_i' u_i) 434 | //ConstSet<<< n_r, 1>>>(tau, 2.0, n_r); // tau = [u_i*u_i] 435 | ConstSet<<< n_r, n_r>>>(T, 0, n_r*n_r); // set T to zero 436 | 437 | SetAddress<<<batchCount, 1>>>(T_array, T); 438 | SetAddress<<<batchCount, 1>>>(Tau_array, tau); 439 | SetAddress<<<batchCount, 1>>>(V_array, dwork); 440 | SetAddress<<<batchCount, 1>>>(Twork_array, twork); 441 | 442 | stat = my_magma_slarft_batched( n_h, 443 | n_r, 444 | 0, // stair_T not sure what it does 445 | V_array, n_h, // 446 | Tau_array, // 447 | T_array, n_r, // 448 | Twork_array, n_r*n_r, 449 | 1, // batchCount 450 | queue); 451 | 452 | 453 | // compute H_out = Q * H or Q^T * H, according to isForward 454 | magma_trans_t isTrans = MagmaTrans; 455 | if(not isForward){ 456 | isTrans = MagmaNoTrans; 457 | } 458 | 459 | stat |= magma_slarfb_gpu_gemm( MagmaLeft, // side 460 | isTrans, // transpose 461 | MagmaForward, // Q = H(u_{n_r}) . . . H(u_2) H(u_1) (Backward) 462 | MagmaColumnwise,// elementary reflectors are stored columnwise 463 | n_h, // number of rows of H 464 | batch, // number of columns of H 465 | n_r, // number of Householder reflectors 466 | U, // U = (u_1, u_2,..., u_{n_r}) 467 | n_h, // The leading dimension of U 468 | T, // block Householder T 469 | n_r, // The leading dimension of T 470 | H_out, // H.shape = (n_h, batch) 471 | n_h, // leading dimension of H 472 | dwork, // workspace 473 | ldwork, // leading dimension of workspace 474 | dworkvt, // workspace 2 475 | ldworkvt, // leading dimension of workspace2 476 | queue 477 | ); 478 | 479 | // wait for all kernels in the queue 480 | magma_queue_sync(queue); 481 | cudaDeviceSynchronize(); 482 | 483 | // free memory 484 | #if 0 485 | magma_free(T_array); 486 | magma_free(Tau_array); 487 | magma_free(Twork_array); 488 | magma_free(V_array); 489 | 490 | magma_free(T); 491 | magma_free(tau); 492 | magma_free(twork); 493 | magma_free(dwork); 494 | magma_free(dworkvt); 495 | #endif 496 | assert(stat == MAGMA_SUCCESS); 497 | // magma_queue_destroy(queue); 498 | // magma_finalize(); 499 | 500 | return EXIT_SUCCESS; 501 | } 502 | 503 | -------------------------------------------------------------------------------- /code/magma_svd_ops/grad_svd_block_prod_gpu.cu.cc: -------------------------------------------------------------------------------- 1 | #define EIGEN_USE_GPU 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include "magma_v2.h" 7 | #include "magma_internal.h" 8 | #include "batched_kernel_param.h" 9 | #define THREAD_SIZE 512 10 | #define max_shared_bsiz 32 11 | 12 | #define RFT_MAG_GEM 13 | #define use_gemm_larft 14 | 15 | extern __shared__ float shared_data[]; 16 | 17 | __global__ void ZeroTriu(float* U, const int n_h, const int n_r) { 18 | int col = blockIdx.x; 19 | for(int row = threadIdx.x; row < col; row += blockDim.x){ 20 | U[col*n_h + row] = 0; 21 | } 22 | } 23 | 24 | __global__ void UpperTri(float* T, const int n_r, const int N) { 25 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 26 | 
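    // T is column-major with leading dimension n_r, so row = idx % n_r and col = idx / n_r;
    // zero the strictly lower entries so that only the upper triangle of T survives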
if(idx < N and idx%n_r > idx/n_r ){ 27 | T[idx] = 0; 28 | } 29 | } 30 | 31 | __global__ void ConstSet(float* tau, const float a, const int N) { 32 | int idx = threadIdx.x + blockIdx.x * blockDim.x; 33 | if(idx < N) tau[idx] = a; 34 | } 35 | 36 | __global__ void ConstDevide(float* tau, const float a, const int N) { 37 | int idx = blockIdx.x; 38 | if(idx < N) tau[idx] = a / tau[idx]; 39 | } 40 | 41 | __global__ void CalculateTau(float *tau, float* V, const int n_r, const int n_h, float init) { 42 | //int global_idx = blockIdx.x * blockDim.x + threadIdx.x; 43 | int col = blockIdx.x; 44 | __shared__ float sdata[THREAD_SIZE]; 45 | assert(blockDim.x == THREAD_SIZE); 46 | //=========================== 47 | // init tau to be zero 48 | // ========================== 49 | //if(threadIdx.x==0){tau[col] = init;} 50 | //=========================== 51 | // reduce col square 52 | //=========================== 53 | // compute local col square 54 | float temp = 0.0; 55 | for(int row=threadIdx.x; row < n_h; row += blockDim.x){ 56 | temp += V[ col*n_h + row] * V[col*n_h + row]; 57 | } 58 | sdata[threadIdx.x] = temp; 59 | __syncthreads(); 60 | // reduction within block (across all threads) 61 | int i = blockDim.x/2; 62 | while (i != 0){ 63 | if (threadIdx.x < i) 64 | sdata[threadIdx.x] += sdata[threadIdx.x + i]; 65 | __syncthreads(); 66 | i /= 2; 67 | } 68 | //========================= 69 | // compute tau 70 | // ======================= 71 | if(threadIdx.x == 0) 72 | tau[col] = 2.0 / sdata[0]; 73 | } 74 | 75 | 76 | __global__ void SetAddress(float** array, float* one_matrix) { 77 | int idx = blockIdx.x; 78 | array[idx] = one_matrix; 79 | } 80 | 81 | void printDeviceMatrix(const float* A, int col, int row, magma_queue_t queue){ 82 | float* hA; 83 | magma_smalloc_cpu(&hA, col*row); 84 | 85 | magma_sgetmatrix( row, col, A, row, hA, row, queue); // copy d_a -> r 86 | for(int i = 0; icuda_stream() >>> 160 | (m, n, tau_array, Trec_array, ldtrec, Ttri_array, ldttri); 161 | } 162 | 163 | /******************************************************************************/ 164 | extern "C" magma_int_t 165 | my_magma_slarft_batched(magma_int_t n, magma_int_t k, magma_int_t stair_T, 166 | float **v_array, magma_int_t ldv, 167 | float **tau_array, float **T_array, magma_int_t ldt, 168 | float **work_array, magma_int_t lwork, 169 | magma_int_t batchCount, magma_queue_t queue) 170 | { 171 | float c_one = MAGMA_S_ONE; 172 | float c_zero = MAGMA_S_ZERO; 173 | 174 | if ( k <= 0) return 0; 175 | if ( stair_T > 0 && k <= stair_T) return 0; 176 | 177 | magma_int_t maxnb = max_shared_bsiz; 178 | 179 | magma_int_t info = 0; 180 | if (stair_T > 0 && stair_T > maxnb) { 181 | info = -3; 182 | } 183 | else if (lwork < k*ldt) { 184 | info = -10; 185 | } 186 | if (info != 0) { 187 | magma_xerbla( __func__, -(info) ); 188 | return info; 189 | } 190 | 191 | magma_int_t DEBUG=0; 192 | magma_int_t nb = stair_T == 0 ? min(k,maxnb) : stair_T; 193 | 194 | magma_int_t i, j, prev_n, mycol, rows; 195 | 196 | float **dW1_displ = NULL; 197 | float **dW2_displ = NULL; 198 | float **dW3_displ = NULL; 199 | float **dTstep_array = NULL; 200 | 201 | magma_malloc((void**)&dW1_displ, batchCount * sizeof(*dW1_displ)); 202 | magma_malloc((void**)&dW2_displ, batchCount * sizeof(*dW2_displ)); 203 | magma_malloc((void**)&dW3_displ, batchCount * sizeof(*dW3_displ)); 204 | magma_malloc((void**)&dTstep_array, batchCount * sizeof(*dTstep_array)); 205 | 206 | //float *Tstep = k > nb ? 
work : T; 207 | if (k > nb) 208 | { 209 | magma_sdisplace_pointers(dTstep_array, work_array, lwork, 0, 0, batchCount, queue); 210 | } 211 | else 212 | { 213 | magma_sdisplace_pointers(dTstep_array, T_array, ldt, 0, 0, batchCount, queue); 214 | } 215 | 216 | //magma_int_t ldtstep = k > nb ? k : ldt; 217 | magma_int_t ldtstep = ldt; // to be removed 218 | // stair_T = 0 meaning all T 219 | // stair_T > 0 meaning the triangular portion of T has been computed. 220 | // the value of stair_T is the nb of these triangulars 221 | 222 | 223 | //GEMV compute the whole triangular upper portion of T (phase 1) 224 | // TODO: add cublas to check perf 225 | 226 | magma_sgemm_batched( MagmaConjTrans, MagmaNoTrans, 227 | k, k, n, 228 | c_one, v_array, ldv, 229 | v_array, ldv, 230 | c_zero, dTstep_array, ldtstep, 231 | batchCount, queue ); 232 | 233 | magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 234 | // no need for it as T is expected to be lower zero 235 | //if (k > nb) magmablas_slaset_batched( MagmaLower, k, k, MAGMA_S_ZERO, MAGMA_S_ZERO, dTstep_array, ldtstep, batchCount, queue ); 236 | 237 | 238 | //TRMV 239 | //T(1:i-1,i) := T(1:i-1,1:i-1) * W(1:i-1) i=[1:k] 240 | // TRMV is split over blocks of columns of size nb 241 | // the update should be done from top to bottom so: 242 | // 1- a gemm using the previously computed columns 243 | // of T to update the rectangular upper portion above 244 | // the triangle of my columns 245 | // 2- the columns need to be updated by a serial 246 | // loop of gemv over itself. since we limit the 247 | // shared memory to nb, these nb columns 248 | // are split vertically in chunks of nb rows 249 | 250 | dim3 grid(1, 1, batchCount); 251 | 252 | for (j=0; j < k; j += nb) 253 | { 254 | prev_n = j; 255 | mycol = min(nb, k-j); 256 | // note that myrow = prev_n + mycol; 257 | if (prev_n > 0 && mycol > 0) { 258 | if (DEBUG == 3) { 259 | printf("doing gemm on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 260 | (long long) prev_n, (long long) mycol, (long long) 0, (long long) j ); 261 | } 262 | 263 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, 0, j, batchCount, queue); 264 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, 0, j, batchCount, queue); 265 | magma_sgemm_batched( MagmaNoTrans, MagmaNoTrans, 266 | prev_n, mycol, prev_n, 267 | c_one, T_array, ldt, 268 | dW1_displ, ldtstep, 269 | c_zero, dW2_displ, ldt, 270 | batchCount, queue ); 271 | 272 | // update my rectangular portion (prev_n,mycol) using a sequence of gemv 273 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 274 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 275 | 276 | for (i=0; i < prev_n; i += nb) 277 | { 278 | rows = min(nb,prev_n-i); 279 | if (DEBUG == 3) { 280 | printf(" doing recstrmv on the rectangular portion of size %lld %lld of T(%lld,%lld)\n", 281 | (long long) rows, (long long) mycol, (long long) i, (long long) j ); 282 | } 283 | 284 | if (rows > 0 && mycol > 0) 285 | { 286 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, i, j, batchCount, queue); 287 | my_magmablas_slarft_recstrmv_sm32x32_batched(rows, mycol, dW3_displ, dW2_displ, ldt, dW1_displ, ldtstep, batchCount, queue); 288 | } 289 | } 290 | } 291 | 292 | // the upper rectangular portion is updated, now if needed update the triangular portion 293 | if (stair_T == 0) { 294 | if (DEBUG == 3) { 295 | printf("doing strmv on the triangular portion of size %lld %lld of T(%lld,%lld)\n", 296 | 
(long long) mycol, (long long) mycol, (long long) j, (long long) j ); 297 | } 298 | 299 | if (mycol > 0) 300 | { 301 | magma_sdisplace_pointers(dW1_displ, dTstep_array, ldtstep, j, j, batchCount, queue); 302 | magma_sdisplace_pointers(dW3_displ, tau_array, 1, j, 0, batchCount, queue); 303 | magma_sdisplace_pointers(dW2_displ, T_array, ldt, j, j, batchCount, queue); 304 | magmablas_slarft_strmv_sm32x32_batched(mycol, mycol, dW3_displ, dW1_displ, ldtstep, dW2_displ, ldt, batchCount, queue); 305 | } 306 | } 307 | }// end of j 308 | 309 | magma_free(dW1_displ); 310 | magma_free(dW2_displ); 311 | magma_free(dW3_displ); 312 | magma_free(dTstep_array); 313 | 314 | return 0; 315 | } 316 | 317 | struct grad_workspace { 318 | magmaFloat_ptr *T_array; 319 | magmaFloat_ptr *Tau_array; 320 | magmaFloat_ptr *Twork_array; 321 | magmaFloat_ptr *V_array; 322 | magmaFloat_ptr T; 323 | magmaFloat_ptr tau; 324 | magmaFloat_ptr twork; 325 | magmaFloat_ptr dwork; 326 | magmaFloat_ptr dworkvt; 327 | magmaFloat_ptr Q_grad; 328 | magmaFloat_ptr UT; 329 | }; 330 | 331 | int GradSvdBlockProdGpuKernelLauncher(const float* H, const float* U, const float* G, float* H_grad, float* U_grad, const int n_h, const int batch, const int n_r, magma_queue_t queue, grad_workspace ws, const bool isForward) { 332 | magma_int_t stat; 333 | int batchCount = 1; 334 | // stat = magma_init(); 335 | // magma_queue_t queue; 336 | // magma_queue_create(0, &queue); 337 | // if( stat != MAGMA_SUCCESS){ printf("Error init magma!\n"); return EXIT_FAILURE;} 338 | 339 | // wait for all kernels in the queue 340 | // magma_queue_sync(queue); 341 | cudaDeviceSynchronize(); 342 | 343 | magmaFloat_ptr *T_array, *Tau_array, *Twork_array, *V_array; 344 | #if 0 345 | magma_malloc((void**)&T_array, batchCount * sizeof(*T_array)); 346 | magma_malloc((void**)&Tau_array, batchCount * sizeof(*Tau_array)); 347 | magma_malloc((void**)&Twork_array, batchCount * sizeof(*Twork_array)); 348 | magma_malloc((void**)&V_array, batchCount * sizeof(*V_array)); 349 | #else 350 | T_array = ws.T_array; 351 | Tau_array = ws.Tau_array; 352 | Twork_array = ws.Twork_array; 353 | V_array = ws.V_array; 354 | #endif 355 | // construct alpha and fill alpha with 2.0 (assume u_i are of unit norm) 356 | 357 | #if 0 358 | magmaFloat_ptr T, tau; 359 | if( magma_smalloc(&T, n_r*n_r) != MAGMA_SUCCESS){ 360 | printf("Error allocating T!\n"); 361 | return EXIT_FAILURE; 362 | } 363 | if( magma_smalloc(&tau, n_r) != MAGMA_SUCCESS){ 364 | printf("Error allocating tau!\n"); 365 | return EXIT_FAILURE; 366 | } 367 | // allocate workspace 368 | magmaFloat_ptr twork, dwork, dworkvt; 369 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 370 | if( magma_smalloc(&twork, n_r*n_r) != MAGMA_SUCCESS){ 371 | printf("Error allocating twork!\n"); 372 | return EXIT_FAILURE; 373 | } 374 | if( magma_smalloc(&dwork, ldwork*n_r) != MAGMA_SUCCESS){ 375 | printf("Error allocating dwork!\n"); 376 | return EXIT_FAILURE; 377 | } 378 | if( magma_smalloc(&dworkvt, ldworkvt*n_r) != MAGMA_SUCCESS){ 379 | printf("Error allocating dworkvt!\n"); 380 | return EXIT_FAILURE; 381 | } 382 | // calculating U_grad 383 | magmaFloat_ptr Q_grad, UT; 384 | if( magma_smalloc(&Q_grad, n_h*n_h) != MAGMA_SUCCESS){ 385 | printf("Error allocating Q_grad!\n"); 386 | return EXIT_FAILURE; 387 | } 388 | if( magma_smalloc(&UT, n_h*n_r) != MAGMA_SUCCESS){ 389 | printf("Error allocating UT!\n"); 390 | return EXIT_FAILURE; 391 | } 392 | 393 | #else 394 | magma_int_t ldwork = n_h, ldworkvt = max(n_h,batch); 395 | magmaFloat_ptr T = ws.T; 396 | 
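    /*
     * Outline of the backward pass below, reusing the compact WY form
     * Q = I - U * T * U' that the forward op builds with larft:
     *   1. H_grad = op(Q) * G              (larfb, opposite transpose of the forward op)
     *   2. Q_grad = G * H'
     *   3. UT     = U * op(T)              (trmm with tTrans1)
     *   4. U_grad = - Q_grad' * UT
     *   5. UT     = U * op(T)              (trmm with tTrans2)
     *   6. twork  = - UT' * U_grad, then made symmetric from its useTri half
     *   7. U_grad = - Q_grad * UT + U_grad, then U_grad += U * twork
     *   8. ZeroTriu zeroes the strictly upper triangular part of U_grad
     */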
magmaFloat_ptr tau = ws.tau; 397 | magmaFloat_ptr twork = ws.twork; 398 | magmaFloat_ptr dwork = ws.dwork; 399 | magmaFloat_ptr dworkvt = ws.dworkvt; 400 | magmaFloat_ptr Q_grad = ws.Q_grad; 401 | magmaFloat_ptr UT = ws.UT; 402 | #endif 403 | 404 | // copy G to H_grad 405 | magmablas_slacpy(MagmaFull, n_h, batch, G, n_h, H_grad, n_h, queue); 406 | // compute T = inv( striu(U'U) + 0.5 * diag(U'U)) 407 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, dwork, n_h, queue); 408 | // calculate tau[i] = 2.0/ dot(V[i],V[i]) 409 | CalculateTau<<< n_r, THREAD_SIZE>>>(tau, dwork, n_r, n_h, 0.0); // tau[i] = 2 / (u_i' u_i) 410 | //ConstSet<<< n_r, 1>>>(tau, 2.0, n_r); // tau = [u_i*u_i] 411 | ConstSet<<< n_r, n_r>>>(T, 0, n_r*n_r); // set T to zero 412 | 413 | SetAddress<<<batchCount, 1>>>(T_array, T); 414 | SetAddress<<<batchCount, 1>>>(Tau_array, tau); 415 | SetAddress<<<batchCount, 1>>>(V_array, dwork); 416 | SetAddress<<<batchCount, 1>>>(Twork_array, twork); 417 | 418 | stat = my_magma_slarft_batched( n_h, 419 | n_r, 420 | 0, // stair_T not sure what it does 421 | V_array, n_h, // 422 | Tau_array, // 423 | T_array, n_r, // 424 | Twork_array, n_r*n_r, 425 | 1, // batchCount 426 | queue); 427 | 428 | // compute H_grad = Q * G or Q^T * G (opposite transpose of the forward op) 429 | magma_trans_t isTrans = MagmaNoTrans, tTrans1 = MagmaTrans, tTrans2 = MagmaNoTrans; 430 | magma_uplo_t useTri = MagmaUpper; 431 | if(not isForward){ // opposite of Hprod 432 | isTrans = MagmaTrans; 433 | useTri = MagmaLower; 434 | tTrans1 = MagmaNoTrans; 435 | tTrans2 = MagmaTrans; 436 | } 437 | 438 | stat |= magma_slarfb_gpu_gemm( MagmaLeft, // side 439 | isTrans, // transpose 440 | MagmaForward, // Q = H(u_{n_r}) . . . H(u_2) H(u_1) (Backward) 441 | MagmaColumnwise,// elementary reflectors are stored columnwise 442 | n_h, // number of rows of H 443 | batch, // number of columns of H 444 | n_r, // number of Householder reflectors 445 | U, // U = (u_1, u_2,..., u_{n_r}) 446 | n_h, // The leading dimension of U 447 | T, // block Householder T 448 | n_r, // The leading dimension of T 449 | H_grad, // H_grad.shape = (n_h, batch) 450 | n_h, // leading dimension of H 451 | dwork, // workspace 452 | ldwork, // leading dimension of workspace 453 | dworkvt, // workspace 2 454 | ldworkvt, // leading dimension of workspace2 455 | queue 456 | ); 457 | 458 | 459 | 460 | // Q_grad = G * H^T 461 | magma_sgemm ( MagmaNoTrans, 462 | MagmaTrans, 463 | n_h, 464 | n_h, 465 | batch, 466 | 1.0, 467 | G, 468 | n_h, 469 | H, 470 | n_h, 471 | 0.0, 472 | Q_grad, 473 | n_h, 474 | queue 475 | ); 476 | // UT = U * T; where T is upper triangular matrix 477 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, UT, n_h, queue); 478 | magma_strmm ( MagmaRight, 479 | MagmaUpper, 480 | tTrans1, 481 | MagmaNonUnit, 482 | n_h, 483 | n_r, 484 | 1.0, 485 | T, 486 | n_r, 487 | UT, 488 | n_h, 489 | queue 490 | ); 491 | // U_grad = - Q_grad^T * UT + 0*U_grad 492 | magma_sgemm ( MagmaTrans, 493 | MagmaNoTrans, 494 | n_h, 495 | n_r, 496 | n_h, 497 | -1.0, 498 | Q_grad, 499 | n_h, 500 | UT, 501 | n_h, 502 | 0.0, 503 | U_grad, 504 | n_h, 505 | queue 506 | ); 507 | // UT = U * T^T; where T is upper triangular matrix 508 | magmablas_slacpy(MagmaFull, n_h, n_r, U, n_h, UT, n_h, queue); 509 | magma_strmm ( MagmaRight, 510 | MagmaUpper, 511 | tTrans2, 512 | MagmaNonUnit, 513 | n_h, 514 | n_r, 515 | 1.0, 516 | T, 517 | n_r, 518 | UT, 519 | n_h, 520 | queue 521 | ); 522 | // twork = T * U^T * Q_grad^T * U * T = - T * U^T * U_grad = - UT^T * U_grad 523 | magma_sgemm ( MagmaTrans, 524 | MagmaNoTrans, 525 | n_r, 526 | n_r, 527 | n_h, 528 | -1.0, 529 | UT, 530 | n_h, 531 | U_grad, 532 | n_h, 533 | 0.0, 534 | 
twork, 535 | n_r, 536 | queue 537 | ); 538 | // make twork (M) symmetric by mirroring its useTri triangle onto the other half 539 | magmablas_ssymmetrize ( useTri, 540 | n_r, 541 | twork, 542 | n_r, 543 | queue 544 | ); 545 | // U_grad = - Q_grad * U * T^T + U_grad = -Q_grad * UT + U_grad 546 | magma_sgemm ( MagmaNoTrans, 547 | MagmaNoTrans, 548 | n_h, 549 | n_r, 550 | n_h, 551 | -1.0, 552 | Q_grad, 553 | n_h, 554 | UT, 555 | n_h, 556 | 1.0, 557 | U_grad, 558 | n_h, 559 | queue 560 | ); 561 | // U_grad = U * twork + U_grad 562 | magma_sgemm ( MagmaNoTrans, 563 | MagmaNoTrans, 564 | n_h, 565 | n_r, 566 | n_r, 567 | 1.0, 568 | U, 569 | n_h, 570 | twork, 571 | n_r, 572 | 1.0, 573 | U_grad, 574 | n_h, 575 | queue 576 | ); 577 | 578 | // zero out the strictly upper triangular part of U_grad 579 | ZeroTriu<<<n_r, THREAD_SIZE>>>(U_grad, n_h, n_r); 580 | 581 | // wait for all kernels in the queue 582 | magma_queue_sync(queue); 583 | cudaDeviceSynchronize(); 584 | 585 | #if 0 586 | // free memory 587 | magma_free(T_array); 588 | magma_free(Tau_array); 589 | magma_free(Twork_array); 590 | magma_free(V_array); 591 | 592 | magma_free(Q_grad); 593 | magma_free(UT); 594 | magma_free(T); 595 | magma_free(tau); 596 | magma_free(twork); 597 | magma_free(dwork); 598 | magma_free(dworkvt); 599 | #endif 600 | assert(stat == MAGMA_SUCCESS); 601 | // magma_queue_destroy(queue); 602 | // magma_finalize(); 603 | return EXIT_SUCCESS; 604 | } 605 | --------------------------------------------------------------------------------
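For sanity-checking the fused MAGMA path on small inputs (in the spirit of magma_svd_ops/gpu_unit_test.py), the block product can be compared against a plain CPU routine that applies one Householder reflector at a time. The sketch below is illustrative only and is not part of the repository: the function name hprod_reference is made up here, the column-major layout and the zero strictly-upper-triangular part of U mirror the conventions read off SvdBlockProdGpuKernelLauncher, and tau_i = 2 / (u_i' u_i) matches CalculateTau.

// hprod_reference.cc -- hypothetical CPU cross-check, not part of the repo.
// Computes H_out = Q^T * H when isForward is true, or Q * H otherwise, where
// Q = House(u_1) * House(u_2) * ... * House(u_{n_r}) and House(u) = I - (2 / u'u) * u * u'.
// H is n_h x batch and U is n_h x n_r, both column-major; column j of U is reflector u_j,
// with its strictly upper triangular entries assumed to be zero.
#include <cstddef>
#include <vector>
#include <algorithm>

void hprod_reference(const float* H, const float* U, float* H_out,
                     int n_h, int batch, int n_r, bool isForward) {
    std::vector<float> work(H, H + static_cast<std::size_t>(n_h) * batch);
    for (int step = 0; step < n_r; ++step) {
        // Q^T * H applies u_1, ..., u_{n_r} in order; Q * H applies them in reverse.
        const int j = isForward ? step : (n_r - 1 - step);
        const float* u = U + static_cast<std::size_t>(j) * n_h;
        float unorm2 = 0.f;
        for (int i = 0; i < n_h; ++i) unorm2 += u[i] * u[i];
        const float tau = 2.f / unorm2;                       // same scaling as CalculateTau
        for (int b = 0; b < batch; ++b) {
            float* h = work.data() + static_cast<std::size_t>(b) * n_h;
            float dot = 0.f;
            for (int i = 0; i < n_h; ++i) dot += u[i] * h[i];
            for (int i = 0; i < n_h; ++i) h[i] -= tau * dot * u[i];   // h -= tau * (u'h) * u
        }
    }
    std::copy(work.begin(), work.end(), H_out);
}

Comparing this routine with the GPU output elementwise (up to a few multiples of float epsilon scaled by n_r) after any change to the workspace or queue handling gives a quick regression check that is independent of larft/larfb.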