├── Lab3.pdf ├── compile_c.sh ├── jacobi-cuda.pdf ├── compile_cu.sh ├── lab3_io.cu ├── testcases ├── gen_testcase.py └── iris_stndardized ├── main_cuda.cu ├── lab3_io.h ├── lab3_cuda.h ├── README.md ├── sequential_C_codes └── lab3_cuda.c └── lab3_cuda.cu /Lab3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arneish/CUDA-PCA-jacobi/HEAD/Lab3.pdf -------------------------------------------------------------------------------- /compile_c.sh: -------------------------------------------------------------------------------- 1 | g++ -lm -g lab3_cuda.c lab3_io.c main_cuda.c -o jacobi 2 | 3 | -------------------------------------------------------------------------------- /jacobi-cuda.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arneish/CUDA-PCA-jacobi/HEAD/jacobi-cuda.pdf -------------------------------------------------------------------------------- /compile_cu.sh: -------------------------------------------------------------------------------- 1 | #export CUDA_DEVICE_ORDER=PCI_BUS_ID 2 | #export CUDA_VISIBLE_DEVICES=3 3 | nvcc -Xcompiler="--std=c++0x" -lm -arch=sm_35 -std=c++11 main_cuda.cu lab3_cuda.cu lab3_io.cu -o pca 4 | 5 | #export CUDA_LAUNCH_BLOCKING=1 6 | -------------------------------------------------------------------------------- /lab3_io.cu: -------------------------------------------------------------------------------- 1 | #include "lab3_io.h" 2 | 3 | void read_matrix (const char* input_filename, int* M, int* N, double** D){ 4 | FILE *fin = fopen(input_filename, "r"); 5 | int i; 6 | 7 | fscanf(fin, "%d%d", M, N); 8 | 9 | int num_elements = (*M) * (*N); 10 | *D = (double*) malloc(sizeof(double)*(num_elements)); 11 | 12 | for (i = 0; i < num_elements; i++){ 13 | fscanf(fin, "%lf", (*D + i)); 14 | } 15 | fclose(fin); 16 | } 17 | 18 | void write_result (int M, 19 | int N, 20 | double* D, 21 | double* U, 22 | 
double* SIGMA, 23 | double* V_T, 24 | int SIGMAm, 25 | int SIGMAn, 26 | int K, 27 | double* D_HAT, 28 | double computation_time){ 29 | // Will contain output code 30 | } 31 | 32 | void format_checker (int M, 33 | int N, 34 | double* D, 35 | double* U, 36 | double* SIGMA, 37 | double* V_T, 38 | int SIGMAm, 39 | int SIGMAn, 40 | int K, 41 | double* D_HAT){ 42 | printf("checking format\n"); 43 | if (SIGMAm==M && SIGMAn==N) { 44 | printf("SVD of D:\n"); 45 | } 46 | else if (SIGMAm==N && SIGMAn==M) { 47 | printf("SVD of D_T:\n"); 48 | } 49 | 50 | printf("Matrix U:\n"); 51 | for (int i = 0; i < SIGMAm; i++) { 52 | for (int j = 0; j < SIGMAm; j++) { 53 | printf("%.2lf\t", U[i*SIGMAm+j]); 54 | } 55 | printf("\n"); 56 | } 57 | 58 | printf("Matrix SIGMA:\n"); 59 | for (int i = 0; i < SIGMAm; i++) { 60 | for( int j = 0; j < SIGMAn; j++) { 61 | if (i == j) 62 | printf("%.2lf ", SIGMA[i]); 63 | else printf("0\t"); 64 | } 65 | printf("\n"); 66 | } 67 | 68 | printf("Matrix V_T:\n"); 69 | for (int i = 0; i < SIGMAn; i++) { 70 | for (int j = 0; j < SIGMAn; j++) { 71 | printf("%lf\t", V_T[i*SIGMAn+j]); 72 | } 73 | printf("\n"); 74 | } 75 | 76 | printf("K = %d\n", K); 77 | 78 | printf("Matrix D_HAT:\n"); 79 | for (int i = 0; i < M; i++) { 80 | for (int j = 0; j < K; j++) { 81 | printf("%lf\t", D_HAT[i*K+j]); 82 | } 83 | printf("\n"); 84 | } 85 | } 86 | 87 | -------------------------------------------------------------------------------- /testcases/gen_testcase.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | ######################################################################### 4 | # Generate M x N matrix of real numbers and store # 5 | # the the matrix in file named 'testcase__' # 6 | # Parameters: # 7 | # M :no of rows (samples) in matrix # 8 | # N :no of coulmns (features) in matrix # 9 | # lrange, urange :range of matrix elements ie # 10 | # forall 0<=i 5 | #include 6 | 7 | /* 8 | Arguments: 9 | arg1: input 
filename (consist M, N and D) 10 | arg2: retention (percentage of information to be retained by PCA) 11 | */ 12 | 13 | int main(int argc, char const *argv[]) 14 | { 15 | if (argc < 3){ 16 | printf("\nLess Arguments\n"); 17 | return 0; 18 | } 19 | 20 | if (argc > 3){ 21 | printf("\nTOO many Arguments\n"); 22 | return 0; 23 | } 24 | 25 | //--------------------------------------------------------------------- 26 | int M; //no of rows (samples) in input matrix D (input) 27 | int N; //no of columns (features) in input matrix D (input) 28 | double* D; //1D array of M x N matrix to be reduced (input) 29 | double* U; //1D array of N x N (or M x M) matrix U (to be computed by SVD) 30 | double* SIGMA; //1D array of N x M (or M x N) diagonal matrix SIGMA (to be computed by SVD) 31 | //SIGMA consists only digonal elements, #elements = N. 32 | //So it is vector of N elements 33 | double* V_T; //1D array of M x M (or N x N) matrix V_T (to be computed by SVD) 34 | int SIGMAm; //#rows in SIGMA, read note in lab3_cuda.h (to be computed by SVD) 35 | int SIGMAn; //#columns in SIGMA, read note in lab3_cuda.h (to be computed by SVD) 36 | int K; //no of coulmns (features) in reduced matrix D_HAT (to be computed by PCA) 37 | double *D_HAT; //1D array of M x K reduced matrix (to be computed by PCA) 38 | int retention; //percentage of information to be retained by PCA (command line input) 39 | //--------------------------------------------------------------------- 40 | 41 | retention = atoi(argv[2]); //retention = 90 means 90% of information should be retained 42 | 43 | float computation_time; 44 | 45 | /* 46 | -- Pre-defined function -- 47 | reads matrix and its dimentions from input file and creats array D 48 | #elements in D is M * N 49 | format - 50 | -------------------------------------------------------------------------------------- 51 | | D[0][0] | D[0][1] | ... | D[0][N-1] | D[1][0] | ... | D[1][N-1] | ... 
| D[M-1][N-1] | 52 | -------------------------------------------------------------------------------------- 53 | */ 54 | read_matrix (argv[1], &M, &N, &D); 55 | 56 | cudaEvent_t start, stop; 57 | cudaEventCreate(&start); 58 | cudaEventCreate(&stop); 59 | 60 | cudaEventRecord(start); 61 | 62 | // /* 63 | // ***************************************************** 64 | // TODO -- You must implement this function 65 | // ***************************************************** 66 | // */ 67 | SVD_and_PCA(M, N, D, &U, &SIGMA, &V_T, &SIGMAm, &SIGMAn, &D_HAT, &K, retention); 68 | 69 | cudaEventRecord(stop); 70 | cudaEventSynchronize(stop); 71 | cudaEventElapsedTime(&computation_time, start, stop); 72 | 73 | /* 74 | --Pre-defined functions -- 75 | checks for correctness of results computed by SVD and PCA 76 | and outputs the results 77 | */ 78 | write_result(M, N, D, U, SIGMA, V_T, SIGMAm, SIGMAn, K, D_HAT, computation_time); 79 | 80 | return 0; 81 | } 82 | -------------------------------------------------------------------------------- /lab3_io.h: -------------------------------------------------------------------------------- 1 | #ifndef LAB3_IO_H 2 | #define LAB3_IO_H 3 | 4 | #include 5 | #include 6 | 7 | /* 8 | M : number of rows (samples) in input matrix D 9 | N : number of columns (features) in input matrix D 10 | D : 1D Array of M x N input matrix in row-major, 11 | #elements in D is (M * N) 12 | -------------------------------------------------------------------------------------- 13 | | D[0][0] | D[0][1] | ... | D[0][N-1] | D[1][0] | ... | D[1][N-1] | ... 
| D[M-1][N-1] | 14 | -------------------------------------------------------------------------------------- 15 | */ 16 | void read_matrix (const char* input_filename, int* M, int *N, double** D); 17 | 18 | /* 19 | check correctess of Singular Vector Decomposition 20 | Arguments: 21 | M : number of rows (samples) in input matrix D 22 | N : number of columns (features) in input matrix D 23 | D : 1D Array of M x N input matrix in row-major, 24 | #elements in D is (M * N) 25 | -------------------------------------------------------------------------------------- 26 | | D[0][0] | D[0][1] | ... | D[0][N-1] | D[1][0] | ... | D[1][N-1] | ... | D[M-1][N-1] | 27 | -------------------------------------------------------------------------------------- 28 | U : 1D array of N x n real matrix (computed by SVD) in row-major 29 | -------------------------------------------------------------------------------------- 30 | | U[0][0] | U[0][1] | ... | U[0][N-1] | U[1][0] | ... | U[1][N-1] | ... | U[N-1][N-1] | 31 | -------------------------------------------------------------------------------------- 32 | SIGMA : 1D array of N x M (or M x N) diagonal matrix of positive real numbers (computed by SVD), 33 | format: consists only digonal elements 34 | #elements in SIGMA is N (digonals will be N in both cases) 35 | ------------------------------------------------------------------- 36 | | SIGMA[0][0] | SIGMA[1][1] | SIGMA[2][2] | ... | SIGMA[N-1][N-1] | 37 | ------------------------------------------------------------------- 38 | V_T : 1D array of M x M real matrix (computed by SVD) in row-major 39 | -------------------------------------------------------------------------------- 40 | | V_T[0][0] | V_T[0][1] | ... | V_T[0][M-1] | V_T[1][0] | ... 
| V_T[M-1][M-1] | 41 | -------------------------------------------------------------------------------- 42 | SIGMAm: #rows in SIGMA, to be decided as per the dimentions of matrix used for SVD 43 | SIGMAn: #columns in SIGMA, to be decided as per the dimentions of matrix used for SVD 44 | K : number of coulmns (features) in reduced matrix D_HAT 45 | D_HAT : reduced matrix (computed by PCA) in row-major 46 | ------------------------------------------------------------------------------------- 47 | | D_HAT[0][0] | D_HAT[0][1] | ... | D_HAT[0][K-1] | D_HAT[1][0] | ... | D[M-1][K-1] | 48 | ------------------------------------------------------------------------------------- 49 | computation_time : Time elapsed in computing SVD and PCA 50 | */ 51 | 52 | void write_result (int M, 53 | int N, 54 | double* D, 55 | double* U, 56 | double* SIGMA, 57 | double* V_T, 58 | int SIGMAm, 59 | int SIGMAn, 60 | int K, 61 | double* D_HAT, 62 | double computation_time); 63 | 64 | /* 65 | Function to check the format of output code. 66 | You can call it from main to check if the dimensions of 67 | output matches with our expected dimensions. 68 | This is a dummy function. It is not the function that 69 | will be used for evaluation. 70 | */ 71 | void format_checker (int M, 72 | int N, 73 | double* D, 74 | double* U, 75 | double* SIGMA, 76 | double* V_T, 77 | int SIGMAm, 78 | int SIGMAn, 79 | int K, 80 | double* D_HAT); 81 | 82 | #endif -------------------------------------------------------------------------------- /lab3_cuda.h: -------------------------------------------------------------------------------- 1 | #ifndef LAB3_CUDA_H 2 | #define LAB3_CUDA_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | /* 10 | To be implemented 11 | Note: 12 | Since PCA of matrix D can be computed by taking SVD of D or D_T, we will allow you to 13 | make the choice. If you compute SVD of D then, U is MxM matrix, SIGMA is MxN matrix and 14 | V_T is NxN matrix. 
On the other hand if you compute SVD of D_T, U is NxN, SIGMA is NxM, 15 | and V_T is MxM matrices. Note that dimensions of SIGMA are same as that of matrix being 16 | decomposed. For correctness checking, we need to know the matrix you chose for SVD. We 17 | will use the dimensions of SIGMA for this purpose since the dimensions of SIGMA is same 18 | as the matrix being decomposed. Variable SIGMAm and SIGMAn (to be computed by you) are 19 | number of rows and columns in SIGMA as well as in the matrix used for SVD. We will check 20 | the correctness of SVD accordingly assuming the dimensions of U, SIGMA and V_T as per 21 | these variables. Since only N digonal elements in SIGMA are non-zero, it should be returned 22 | as 1D vector of N elements (no need to store zeros in SIGMA). 23 | 24 | Arguments: 25 | M : number of rows (samples) in input matrix D (input) 26 | N : number of columns (features) in input matrix D (input) 27 | D : 1D Array of M x N input matrix in row-major, (input) 28 | #elements in D is (M * N) 29 | -------------------------------------------------------------------------------------- 30 | | D[0][0] | D[0][1] | ... | D[0][N-1] | D[1][0] | ... | D[1][N-1] | ... | D[M-1][N-1] | 31 | -------------------------------------------------------------------------------------- 32 | U : 1D array of N x N (or M x M) real matrix in row-major (to be computed) 33 | -------------------------------------------------------------------------------------- 34 | | U[0][0] | U[0][1] | ... | U[0][N-1] | U[1][0] | ... | U[1][N-1] | ... | U[N-1][N-1] | 35 | -------------------------------------------------------------------------------------- 36 | SIGMA : 1D array of N x M (or M x N) diagonal matrix of positive real numbers (to be computed) 37 | format: consists only digonal elements 38 | #elements in SIGMA is N (digonals will be N in both cases) 39 | ------------------------------------------------------------------- 40 | | SIGMA[0][0] | SIGMA[1][1] | SIGMA[2][2] | ... 
| SIGMA[N-1][N-1] | 41 | ------------------------------------------------------------------- 42 | V_T : 1D array of M x M (or N x N) real matrix in row-major (to be computed) 43 | ------------------------------------------------------------------------------- 44 | | V_T[0][0] | V_T[0][1] | ... | V_T[0][M-1] | V_T[1][0] | ... | V_T[M-1][M-1] | 45 | ------------------------------------------------------------------------------- 46 | SIGMAm: #rows in SIGMA, to be decided as per the dimentions of matrix used for SVD 47 | (to be computed) 48 | SIGMAn: #columns in SIGMA, to be decided as per the dimentions of matrix used for SVD 49 | (to be computed) 50 | D_HAT : 1D array of reduced M x K real matrix in row-major (to be computed) 51 | ----------------------------------------------------------------------------------------- 52 | | D_HAT[0][0] | D_HAT[0][1] | ... | D_HAT[0][K-1] | D_HAT[1][0] | ... | D_HAT[M-1][K-1] | 53 | ----------------------------------------------------------------------------------------- 54 | K : number of columns (features) in reduced matrix (to be computed) 55 | retention : percentage of inpdormation to be retained by PCA 56 | retention = 90 means 90% of information should be retained 57 | */ 58 | void SVD_and_PCA ( 59 | int M, 60 | int N, 61 | double* D, 62 | double** U, 63 | double** SIGMA, 64 | double** V_T, 65 | int *SIGMAm, 66 | int *SIGMAn, 67 | double** D_HAT, 68 | int *K, 69 | int retention); 70 | 71 | #endif 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | - **Contains: *A highly optimised parallel implementation of the Jacobi eigenvalue algorithm in CUDA C and a serial implementation of the same algorithm in C for speedup computations*** 3 | - **Input Data: Works on Input matrices of dimensions M (#samples) x N (#features) with N not exceeding 1024 (assuming GPU architecture supports BLOCK 
SIZE of 1024)** 4 | - CUDA C file: *lab3_cuda.cu* 5 | - I/O: I/O format can be understood from the included header files and sample testcase files. 6 | - **Primary Reference: *Novel GPU Implementation of Jacobi Algorithm for Karhunen-Loeve Transform of Dense Matrices (Mustafa U. Tamn, Onur Yilmaz, and Ali N. Akansu) [IEEE 2012]*** 7 | - Wiki reference: https://en.wikipedia.org/wiki/Jacobi_eigenvalue_algorithm 8 | - Assignment attempted as a part of coursework requirements in *COL380: Introduction to Parallel Programming and Distributed Computing (Sem-II, 2018-19)* (Instructor: Prof. Subodh V. Sharma) at Indian Institute of Technology (IIT), Delhi. 9 | - The problem statement is included. The following sections describe the assignment submission requirements and how to use the starter codes. 10 | 11 | ## Problem Statement & Starter Codes: col380_lab3_suite 12 | - Problem Statement: Implement Principal Component Analysis with Singular Vector Decomposition in CUDA 13 | - Cloned from: https://github.com/dvynjli/col380_lab3_suite/ 14 | 15 | ## Directories and files 16 | `testcase/`: contains python script `gen_testcase.py` for sample testcase generation 17 | `lab3_io.h` and `lab3_io.cu`: functions to read matrix from file and check the correctness of the result 18 | `main_cuda.cu`: function `main()` 19 | `lab3_cuda.h`: header file for the functions to be implemented 20 | `lab3_cuda.cu`: implement the function in this file 21 | Refer to respective files for furthur details. 22 | **Do not change the directory structure and prototype of functions.** 23 | 24 | ## Building and Executing 25 | ``` 26 | nvcc -lm main_cuda.cu lab3_cuda.cu lab3_io.cu -o pca 27 | ``` 28 | #### Command Line Arguments 29 | The program takes two command line arguments: 30 | - arg1: input filename (consist M, N and D) 31 | - arg2: retention (percentage of information to be retained by PCA) 32 | 33 | Note that the retention percentage is integer. Please refer to `main_cuda.cu` for more details. 
34 | To run the program: 35 | ``` 36 | ./pca 37 | ``` 38 | Example: 39 | ``` 40 | ./pca testcase/testcase_1000_1000 90 41 | ``` 42 | 43 | ## Generating testcases 44 | Script `gen_testcase.py` generates testcases as per the parameters and output the generated testcase in file `testcase__` in the desired format. You might need to change the values of variables `M` and `N` in the script. Read the comments in the script for more information. 45 | ``` 46 | python3 gen_testcase.py 47 | ``` 48 | 49 | ## Input-Output Specifications 50 | #### Input dataset specifications 51 | - M : number of rows (samples) in input matrix D 52 | - N : number of columns (features) in input matrix D 53 | - D : input matrix, #elements in D is (M * N) 54 | 55 | The first line of the input file contains `M` followed by `N`. The second line contains elements of matrix `D`. All the values in one line are space separated. 56 | 57 | #### Output Specification 58 | Your program should perform SVD and PCA on the given input and store the results in the variables given in the program. We will check the correctness by calling the functions from the program. You should compute following matrices and values: 59 | - U : N x N real matrix (to be computed by SVD) 60 | - SIGMA : N x M diagonal matrix of positive real numbers ( to be computed by SVD) 61 | - V_T : M x M real matrix (to be computed by SVD) 62 | - K : number of columns (features) in reduced matrix D_HAT 63 | - D_HAT : reduced matrix (to be computed by PCA) 64 | 65 | Refer to `lab3_cuda.h` for more details. **Your program should not output anything on `stdout`.** 66 | 67 | ## Submission Instructions 68 | - You are supposed to submit only one file named `lab3_cuda.cu`. Please make sure all the functions you have used are in this file. 69 | - Do not submit other files 70 | - Your code should build and execute as per the instructions given above. Please make sure that your code doesn't need any Makefile. 
71 | - Your program should not output anything in `stdout`. 72 | 73 | We will not consider the submissions that don't comply with these guidelines. 74 | -------------------------------------------------------------------------------- /testcases/iris_stndardized: -------------------------------------------------------------------------------- 1 | 150 4 2 | -0.9006812 1.0320572 -1.3412724 -1.3129767 -1.1430169 -0.1249576 -1.3412724 -1.3129767 -1.3853527 0.3378483 -1.3981381 -1.3129767 -1.5065205 0.1064454 -1.2844067 -1.3129767 -1.0218490 1.2634602 -1.3412724 -1.3129767 -0.5371776 1.9576691 -1.1706753 -1.0500308 -1.5065205 0.8006543 -1.3412724 -1.1815038 -1.0218490 0.8006543 -1.2844067 -1.3129767 -1.7488563 -0.3563606 -1.3412724 -1.3129767 -1.1430169 0.1064454 -1.2844067 -1.4444497 -0.5371776 1.4948632 -1.2844067 -1.3129767 -1.2641848 0.8006543 -1.2275410 -1.3129767 -1.2641848 -0.1249576 -1.3412724 -1.4444497 -1.8700241 -0.1249576 -1.5118695 -1.4444497 -0.0525061 2.1890721 -1.4550038 -1.3129767 -0.1736739 3.1146839 -1.2844067 -1.0500308 -0.5371776 1.9576691 -1.3981381 -1.0500308 -0.9006812 1.0320572 -1.3412724 -1.1815038 -0.1736739 1.7262661 -1.1706753 -1.1815038 -0.9006812 1.7262661 -1.2844067 -1.1815038 -0.5371776 0.8006543 -1.1706753 -1.3129767 -0.9006812 1.4948632 -1.2844067 -1.0500308 -1.5065205 1.2634602 -1.5687352 -1.3129767 -0.9006812 0.5692513 -1.1706753 -0.9185578 -1.2641848 0.8006543 -1.0569439 -1.3129767 -1.0218490 -0.1249576 -1.2275410 -1.3129767 -1.0218490 0.8006543 -1.2275410 -1.0500308 -0.7795133 1.0320572 -1.2844067 -1.3129767 -0.7795133 0.8006543 -1.3412724 -1.3129767 -1.3853527 0.3378483 -1.2275410 -1.3129767 -1.2641848 0.1064454 -1.2275410 -1.3129767 -0.5371776 0.8006543 -1.2844067 -1.0500308 -0.7795133 2.4204750 -1.2844067 -1.4444497 -0.4160097 2.6518780 -1.3412724 -1.3129767 -1.1430169 0.1064454 -1.2844067 -1.4444497 -1.0218490 0.3378483 -1.4550038 -1.3129767 -0.4160097 1.0320572 -1.3981381 -1.3129767 -1.1430169 0.1064454 -1.2844067 -1.4444497 
-1.7488563 -0.1249576 -1.3981381 -1.3129767 -0.9006812 0.8006543 -1.2844067 -1.3129767 -1.0218490 1.0320572 -1.3981381 -1.1815038 -1.6276884 -1.7447784 -1.3981381 -1.1815038 -1.7488563 0.3378483 -1.3981381 -1.3129767 -1.0218490 1.0320572 -1.2275410 -0.7870848 -0.9006812 1.7262661 -1.0569439 -1.0500308 -1.2641848 -0.1249576 -1.3412724 -1.1815038 -0.9006812 1.7262661 -1.2275410 -1.3129767 -1.5065205 0.3378483 -1.3412724 -1.3129767 -0.6583454 1.4948632 -1.2844067 -1.3129767 -1.0218490 0.5692513 -1.3412724 -1.3129767 1.4015084 0.3378483 0.5352958 0.2646989 0.6745011 0.3378483 0.4215644 0.3961719 1.2803405 0.1064454 0.6490272 0.3961719 -0.4160097 -1.7447784 0.1372359 0.1332259 0.7956690 -0.5877635 0.4784301 0.3961719 -0.1736739 -0.5877635 0.4215644 0.1332259 0.5533333 0.5692513 0.5352958 0.5276449 -1.1430169 -1.5133754 -0.2608240 -0.2611930 0.9168369 -0.3563606 0.4784301 0.1332259 -0.7795133 -0.8191665 0.0803702 0.2646989 -1.0218490 -2.4389873 -0.1470926 -0.2611930 0.0686618 -0.1249576 0.2509673 0.3961719 0.1898297 -1.9761813 0.1372359 -0.2611930 0.3109975 -0.3563606 0.5352958 0.2646989 -0.2948418 -0.3563606 -0.0902269 0.1332259 1.0380048 0.1064454 0.3646987 0.2646989 -0.2948418 -0.1249576 0.4215644 0.3961719 -0.0525061 -0.8191665 0.1941016 -0.2611930 0.4321654 -1.9761813 0.4215644 0.3961719 -0.2948418 -1.2819724 0.0803702 -0.1297200 0.0686618 0.3378483 0.5921615 0.7905908 0.3109975 -0.5877635 0.1372359 0.1332259 0.5533333 -1.2819724 0.6490272 0.3961719 0.3109975 -0.5877635 0.5352958 0.0017530 0.6745011 -0.3563606 0.3078330 0.1332259 0.9168369 -0.1249576 0.3646987 0.2646989 1.1591726 -0.5877635 0.5921615 0.2646989 1.0380048 -0.1249576 0.7058929 0.6591178 0.1898297 -0.3563606 0.4215644 0.3961719 -0.1736739 -1.0505695 -0.1470926 -0.2611930 -0.4160097 -1.5133754 0.0235045 -0.1297200 -0.4160097 -1.5133754 -0.0333612 -0.2611930 -0.0525061 -0.8191665 0.0803702 0.0017530 0.1898297 -0.8191665 0.7627586 0.5276449 -0.5371776 -0.1249576 0.4215644 0.3961719 0.1898297 0.8006543 
0.4215644 0.5276449 1.0380048 0.1064454 0.5352958 0.3961719 0.5533333 -1.7447784 0.3646987 0.1332259 -0.2948418 -0.1249576 0.1941016 0.1332259 -0.4160097 -1.2819724 0.1372359 0.1332259 -0.4160097 -1.0505695 0.3646987 0.0017530 0.3109975 -0.1249576 0.4784301 0.2646989 -0.0525061 -1.0505695 0.1372359 0.0017530 -1.0218490 -1.7447784 -0.2608240 -0.2611930 -0.2948418 -0.8191665 0.2509673 0.1332259 -0.1736739 -0.1249576 0.2509673 0.0017530 -0.1736739 -0.3563606 0.2509673 0.1332259 0.4321654 -0.3563606 0.3078330 0.1332259 -0.9006812 -1.2819724 -0.4314211 -0.1297200 -0.1736739 -0.5877635 0.1941016 0.1332259 0.5533333 0.5692513 1.2745500 1.7109016 -0.0525061 -0.8191665 0.7627586 0.9220638 1.5226762 -0.1249576 1.2176843 1.1850097 0.5533333 -0.3563606 1.0470872 0.7905908 0.7956690 -0.1249576 1.1608186 1.3164827 2.1285156 -0.1249576 1.6157442 1.1850097 -1.1430169 -1.2819724 0.4215644 0.6591178 1.7650120 -0.3563606 1.4451471 0.7905908 1.0380048 -1.2819724 1.1608186 0.7905908 1.6438441 1.2634602 1.3314157 1.7109016 0.7956690 0.3378483 0.7627586 1.0535367 0.6745011 -0.8191665 0.8764901 0.9220638 1.1591726 -0.1249576 0.9902215 1.1850097 -0.1736739 -1.2819724 0.7058929 1.0535367 -0.0525061 -0.5877635 0.7627586 1.5794286 0.6745011 0.3378483 0.8764901 1.4479556 0.7956690 -0.1249576 0.9902215 0.7905908 2.2496835 1.7262661 1.6726099 1.3164827 2.2496835 -1.0505695 1.7863413 1.4479556 0.1898297 -1.9761813 0.7058929 0.3961719 1.2803405 0.3378483 1.1039529 1.4479556 -0.2948418 -0.5877635 0.6490272 1.0535367 2.2496835 -0.5877635 1.6726099 1.0535367 0.5533333 -0.8191665 0.6490272 0.7905908 1.0380048 0.5692513 1.1039529 1.1850097 1.6438441 0.3378483 1.2745500 0.7905908 0.4321654 -0.5877635 0.5921615 0.7905908 0.3109975 -0.1249576 0.6490272 0.7905908 0.6745011 -0.5877635 1.0470872 1.1850097 1.6438441 -0.1249576 1.1608186 0.5276449 1.8861798 -0.5877635 1.3314157 0.9220638 2.4920192 1.7262661 1.5020128 1.0535367 0.6745011 -0.5877635 1.0470872 1.3164827 0.5533333 -0.5877635 0.7627586 0.3961719 
0.3109975 -1.0505695 1.0470872 0.2646989 2.2496835 -0.1249576 1.3314157 1.4479556 0.5533333 0.8006543 1.0470872 1.5794286 0.6745011 0.1064454 0.9902215 0.7905908 0.1898297 -0.1249576 0.5921615 0.7905908 1.2803405 0.1064454 0.9333558 1.1850097 1.0380048 0.1064454 1.0470872 1.5794286 1.2803405 0.1064454 0.7627586 1.4479556 -0.0525061 -0.8191665 0.7627586 0.9220638 1.1591726 0.3378483 1.2176843 1.4479556 1.0380048 0.5692513 1.1039529 1.7109016 1.0380048 -0.1249576 0.8196243 1.4479556 0.5533333 -1.2819724 0.7058929 0.9220638 0.7956690 -0.1249576 0.8196243 1.0535367 0.4321654 0.8006543 0.9333558 1.4479556 0.0686618 -0.1249576 0.7627586 0.7905908 -------------------------------------------------------------------------------- /sequential_C_codes/lab3_cuda.c: -------------------------------------------------------------------------------- 1 | #include "lab3_cuda.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #define MAX_ITER 1000000 7 | // /* 8 | // ***************************************************** 9 | // TODO -- You must implement this function 10 | // ***************************************************** 11 | // */ 12 | double *s_initialize_identity(int size) 13 | { 14 | double *I = (double *)calloc(size * size, sizeof(double)); 15 | for (int i = 0; i < size; i++) 16 | I[i * size + i] = 1.0; 17 | return I; 18 | } 19 | 20 | void s_transpose(double *M, int m, int n, double *M_T) 21 | { 22 | int i, j; 23 | for (i = 0; i < m; i++) 24 | { 25 | for (j = 0; j < n; j++) 26 | { 27 | M_T[j * m + i] = M[i * n + j]; 28 | } 29 | } 30 | } 31 | 32 | void s_multiply(double *M_1, int m1, int n1, double *M_2, int m2, int n2, double *result) 33 | { 34 | double sum = 0.0; 35 | //compute M_2_T: 36 | double *M_2_T = (double *)malloc(sizeof(double) * n2 * m2); 37 | s_transpose(M_2, m2, n2, M_2_T); 38 | int i, j, k, temp1, temp2; 39 | for (i = 0; i < m1; i++) 40 | { 41 | temp1 = i * n1; 42 | for (j = 0; j < n2; j++) 43 | { 44 | sum = 0.0; 45 | temp2 = j * m2; 46 | for (k = 0; k < n1; 
k++) 47 | { 48 | sum += M_1[temp1 + k] * M_2_T[temp2 + k]; 49 | } 50 | result[i * n2 + j] = sum; 51 | } 52 | } 53 | free(M_2_T); 54 | } 55 | 56 | double s_maxind(double *A, int size, int k) 57 | { 58 | int m = k + 1; 59 | for (int i = k + 2; i < size; i++) 60 | { 61 | if (fabs(A[k * size + i]) > fabs(A[k * size + m])) 62 | { 63 | m = i; 64 | } 65 | } 66 | return m; 67 | } 68 | 69 | void s_update(int k, double t, double *e, bool *changed, int *state) 70 | { 71 | double y = e[k]; e[k] = y + t; 72 | if (changed[k] && (y==e[k])) 73 | { 74 | changed[k] = false; 75 | (*state)--; 76 | } 77 | else if (!changed[k] && (y!=e[k])) 78 | { 79 | changed[k]=true; 80 | (*state)++; 81 | } 82 | } 83 | 84 | void s_rotate(int k, int l, int i, int j, double *A, int P, double c, double s) 85 | { 86 | double k_l=c*A[k*P+l]-s*A[i*P+j]; 87 | double i_j=s*A[k*P+l]+c*A[i*P+j]; 88 | A[k*P+l]=k_l; 89 | A[i*P+j]=i_j; 90 | } 91 | 92 | double l2_matrix_diff_norm(double *E_, double *E, int M, int N) 93 | { 94 | double sum = 0.0; 95 | for (int i=0; i fabs(A[m * P + ind[m]])) 232 | { 233 | m = i; 234 | } 235 | } 236 | k = m; l = ind[k]; p = A[k*P+l]; 237 | y = 0.5*(e[l]-e[k]); d = fabs(y)+sqrt(p*p+y*y); 238 | r = sqrt(p*p+d*d); c = d/r; s = p/r; t = p*p/d; 239 | if (y<0) {s = -s; t=-t;} 240 | A[k*P+l]=0.0; s_update(k, -t, e, changed, &state); s_update(l, t, e, changed, &state); 241 | 242 | //rotate rows and cols k and l: 243 | for (int i=0; ie[m]) 289 | { 290 | m=j; 291 | } 292 | } 293 | if (m!=i) 294 | { 295 | double temp = e[m]; 296 | e[m] = e[i]; 297 | e[i] = temp; 298 | temp = indices[m]; 299 | indices[m] = indices[i]; 300 | indices[i] = temp; 301 | } 302 | } 303 | printf("Indices arr:\n"); 304 | for(int i=0; i 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | using namespace std::chrono; 9 | 10 | #define EPSILON 1e-4 11 | #define THRESHOLD 1e-4 12 | #define MAX_BLOCK_SIZE 1024 13 | #define MAX_SWEEPS 30 14 | #define MAX_ITER 10000000 15 | #define MULTIPLY_BLOCK_SIZE 64 16 | 17 
#pragma GCC optimize("Ofast")
#pragma GCC target("sse,sse2,sse3,ssse3,sse4,popcnt,abm,mmx,avx,tune=native")

// Wrap every CUDA runtime call: reports file:line on failure and exits.
#define gpuErrchk(ans)                        \
    {                                         \
        gpuAssert((ans), __FILE__, __LINE__); \
    }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort)
            exit(code);
    }
}

// Fill the size x size row-major matrix I with the identity.
void s_initialize_identity(double *I, int size)
{
    memset(I, 0, sizeof(double) * size * size);
    for (int i = 0; i < size; i++)
        I[i * size + i] = 1.0;
}

// M_T (n x m) = transpose of the m x n row-major matrix M.
// NOTE(review): the body of this function was truncated in the dump; it has
// been reconstructed to match its call sites (e.g. s_transpose(D, N, P, D_T)).
void s_transpose(double *M, int m, int n, double *M_T)
{
    for (int j = 0; j < n; j++)
    {
        for (int i = 0; i < m; i++)
        {
            M_T[j * m + i] = M[i * n + j];
        }
    }
}

// C (rA x cB) = A (rA x cA) * B (rB x cB), all row-major; requires cA == rB.
// NOTE(review): this function was entirely truncated in the dump; it has been
// reconstructed as the standard triple loop to match call sites such as
// s_multiply(INV_SIGMA, N, P, U_T, P, P, product).
void s_multiply(double *A, int rA, int cA, double *B, int rB, int cB, double *C)
{
    assert(cA == rB);
    for (int i = 0; i < rA; i++)
    {
        for (int j = 0; j < cB; j++)
        {
            double sum = 0.0;
            for (int k = 0; k < cA; k++)
                sum += A[i * cA + k] * B[k * cB + j];
            C[i * cB + j] = sum;
        }
    }
}

// Return the column index m > k of the largest |A[k][m]| in row k
// (classic Jacobi pivot search). Undefined for the last row.
// NOTE(review): the header was truncated in the dump; reconstructed to match
// the visible loop body and the serial-Jacobi caller.
int s_maxind(double *A, int size, int k)
{
    int m = k + 1;
    for (int i = k + 2; i < size; i++)
    {
        if (fabs(A[k * size + i]) > fabs(A[k * size + m]))
        {
            m = i;
        }
    }
    return m;
}

// Add t to eigenvalue estimate e[k]; maintain the changed[] flags and the
// count of still-changing eigenvalues (*state) for the convergence test.
void s_update(int k, double t, double *e, bool *changed, int *state)
{
    double y = e[k];
    e[k] = y + t;
    if (changed[k] && (y == e[k]))
    {
        // Value stopped moving: one fewer active eigenvalue.
        changed[k] = false;
        (*state)--;
    }
    else if (!changed[k] && (y != e[k]))
    {
        changed[k] = true;
        (*state)++;
    }
}

// Apply a Givens rotation (cos=c, sin=s) to the element pair
// (A[k][l], A[i][j]) of the P x P row-major matrix A, in place.
void s_rotate(int k, int l, int i, int j, double *A, int P, double c, double s)
{
    double k_l = c * A[k * P + l] - s * A[i * P + j];
    double i_j = s * A[k * P + l] + c * A[i * P + j];
    A[k * P + l] = k_l;
    A[i * P + j] = i_j;
}

// Merge step of s_mergesort: merge e[left..mid] and e[mid+1..right], ordered
// by descending |value|, carrying the permutation indices_e along.
void s_merge(double *e, int *indices_e, int left_index, int mid, int right_index)
{
    int i = left_index, j = mid + 1, k = 0;
    double *sorted = (double *)malloc(sizeof(double) * (right_index - left_index + 1));
    int *sorted_indices = (int *)malloc(sizeof(int) * (right_index - left_index + 1));
    // Fix: the original asserted only sorted_indices; check both allocations.
    assert(sorted != NULL);
    assert(sorted_indices != NULL);
    while (i <= mid && j <= right_index)
    {
        if (fabs(e[i]) >= fabs(e[j]))
        {
            sorted_indices[k] = indices_e[i];
            sorted[k++] = e[i++];
        }
        else
        {
            sorted_indices[k] = indices_e[j];
            sorted[k++] = e[j++];
        }
    }
    while (i <= mid)
    {
        sorted_indices[k] = indices_e[i];
        sorted[k++] = e[i++];
    }
    while (j <= right_index)
    {
        sorted_indices[k] = indices_e[j];
        sorted[k++] = e[j++];
    }
    assert(k == right_index - left_index + 1);
    memcpy(e + left_index, sorted, sizeof(double) * (right_index - left_index + 1));
    memcpy(indices_e + left_index, sorted_indices, sizeof(int) * (right_index - left_index + 1));
    free(sorted);
    free(sorted_indices);
}

// Sort e[left..right] in descending order of |value| (merge sort),
// rearranging indices_e identically so it records the permutation.
void s_mergesort(double *e, int e_len, int *indices_e, int left_index, int right_index)
{
    assert(left_index <= right_index);
    if (left_index < right_index)
    {
        int mid = (left_index + right_index) / 2;
        s_mergesort(e, e_len, indices_e, left_index, mid);
        s_mergesort(e, e_len, indices_e, mid + 1, right_index);
        s_merge(e, indices_e, left_index, mid, right_index);
    }
}

// Frobenius norm of (E_ - E) for M x N row-major matrices.
// Fix: the original indexed with i*M+j, which is wrong for M != N
// (it is only used with square matrices here, so behavior is unchanged).
double l2_matrix_diff_norm(double *E_, double *E, int M, int N)
{
    double sum = 0.0;
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            double diff = E_[i * N + j] - E[i * N + j];
            sum += diff * diff;
        }
    }
    return sqrt(sum);
}

// Euclidean distance between two length-len vectors.
double l2_diff_norm(double *e_, double *e, int len)
{
    double sum = 0.0;
    for (int i = 0; i < len; i++)
    {
        sum += (e_[i] - e[i]) * (e_[i] - e[i]);
    }
    return sqrt(sum);
}

// Debug helper: print an M x N matrix to stdout (console=true) or stderr.
// Deliberately disabled by the early return; remove it to re-enable output.
void print_matrix(double *A, int M, int N, bool console)
{
    return; // printing disabled (debug aid only)
    for (int i = 0; i < M; i++)
    {
        for (int j = 0; j < N; j++)
        {
            if (!console)
                fprintf(stderr, "%f ", A[i * N + j]);
            else
                printf("%f ", A[i * N + j]);
        }
        if (!console)
            fprintf(stderr, "\n");
        else
            printf("\n");
    }
}

// Serial reference: V_T (N x N) = SIGMA^-1 (N x P) * U^T (P x P) * D_T (P x N).
void s_compute_V(double **SIGMA, double *D_T, double **U, double **V_T, int N, int P)
{
    double *INV_SIGMA = (double *)calloc(N * P, sizeof(double)); //|=NXP
    for (int i = 0; i < P; i++)
    {
        // Diagonal of the pseudo-inverse: reciprocal singular values.
        INV_SIGMA[i * P + i] = 1.0 / ((*SIGMA)[i]);
    }
    double *U_T = (double *)malloc(sizeof(double) * P * P);
    s_transpose(*U, P, P, U_T);
    // First, INV_SIGMA x U_T |=(NXP)
    double *product = (double *)malloc(sizeof(double) * N * P);
    s_multiply(INV_SIGMA, N, P, U_T, P, P, product);
    // Then, product x D_T |=(NXN)
    s_multiply(product, N, P, D_T, P, N, *V_T);
    free(INV_SIGMA);
    free(U_T);
    free(product);
}

// L2 distance between |M_1| and |M_2| elementwise (sign-insensitive compare;
// useful because eigenvector columns are determined only up to sign).
double s_matrix_similarity_fabs(double *M_1, int m, int n, double *M_2)
{
    double l2_diff = 0.0;
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            double d = fabs(M_1[i * n + j]) - fabs(M_2[i * n + j]);
            l2_diff += d * d;
        }
    }
    return sqrt(l2_diff);
}

// Plain L2 distance between two m x n matrices.
double s_matrix_similarity(double *M_1, int m, int n, double *M_2)
{
    double l2_diff = 0.0;
    for (int i = 0; i < m; i++)
    {
        for (int j = 0; j < n; j++)
        {
            double d = M_1[i * n + j] - M_2[i * n + j];
            l2_diff += d * d;
        }
    }
    return sqrt(l2_diff);
}

// sqrt of the sum of squares of the strict upper triangle of A (P x P):
// the classic Jacobi off-diagonal convergence measure.
double s_upper_triangular_sum(double *A, int P)
{
    double sum = 0.0;
    for (int i = 0; i < P; i++)
    {
        for (int j = i + 1; j < P; j++)
        {
            sum += A[i * P + j] * A[i * P + j];
        }
    }
    return sqrt(sum);
}

// Pack the symmetric matrix A (P x P) into a: first the P diagonal elements,
// then the strict upper triangle in row-major order.
void s_set_array(double *A, int P, double *a)
{
    for (int i = 0; i < P; i++)
    {
        a[i] = A[i * P + i];
    }
    int index = P;
    for (int i = 0; i < P; i++)
    {
        for (int j = i + 1; j < P; j++)
        {
            a[index++] = A[i * P + j];
        }
    }
}

__device__ int device_iter;

// Tiled shared-memory matrix multiply: C (rA x cB) = A (rA x cA) * B (rB x cB),
// all row-major. Launch with BLOCK_SIZE x BLOCK_SIZE thread blocks and a grid
// of ceil(cB/BLOCK_SIZE) x ceil(rA/BLOCK_SIZE) blocks (see GPU_multiply).
// NOTE(review): the template parameter list was eaten by the dump; restored as
// the integer tile size the body clearly uses.
template <int BLOCK_SIZE>
__global__ void kernel_MatMul(double *A, int rA, int cA,
                              double *B, int rB, int cB, double *C)
{
    assert(cA == rB);
    int bIDx = blockIdx.x, bIDy = blockIdx.y, tIDx = threadIdx.x, tIDy = threadIdx.y;
    int row_ = bIDy * BLOCK_SIZE + tIDy;
    int col_ = bIDx * BLOCK_SIZE + tIDx;
    __shared__ double A_sub[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ double B_sub[BLOCK_SIZE][BLOCK_SIZE];
    double C_sub = 0.0;
    // Sweep tiles along the shared dimension; zero-pad out-of-range elements
    // so the unrolled inner product needs no bounds checks.
    for (int m = 0; m < (BLOCK_SIZE + cA - 1) / BLOCK_SIZE; m++)
    {
        if (m * BLOCK_SIZE + tIDx < cA && row_ < rA)
        {
            A_sub[tIDy][tIDx] = A[row_ * cA + m * BLOCK_SIZE + tIDx];
        }
        else
        {
            A_sub[tIDy][tIDx] = 0.0;
        }
        if (m * BLOCK_SIZE + tIDy < rB && col_ < cB)
        {
            B_sub[tIDy][tIDx] = B[(m * BLOCK_SIZE + tIDy) * cB + col_];
        }
        else
        {
            B_sub[tIDy][tIDx] = 0.0;
        }
        __syncthreads(); // tiles fully loaded before use
#pragma unroll
        for (int k = 0; k < BLOCK_SIZE; k++)
            C_sub += A_sub[tIDy][k] * B_sub[k][tIDx];
        __syncthreads(); // all reads done before next load overwrites tiles
    }
    if (row_ < rA && col_ < cB)
    {
        // Simplified from the original cB*BLOCK_SIZE*bIDy + BLOCK_SIZE*bIDx +
        // cB*tIDy + tIDx, which is algebraically identical to row_*cB + col_.
        C[row_ * cB + col_] = C_sub;
    }
}

// Round-robin ("chess tournament") pairing: for sweep-iteration iter, give the
// calling thread its (row, col) pivot pair, row < col. row_pair is thread-local.
__device__ void chess_tourney_params(int P, int *row_pair, int iter)
{
    int localID = threadIdx.x;
    int index1, index2;
    index1 = (localID + iter) % (P - 1);
    if (localID != 0)
    {
        index2 = (P - localID + iter - 1) % (P - 1);
    }
    else
    {
        // Player P-1 stays fixed in the round-robin schedule.
        index2 = P - 1;
    }
    row_pair[0] = min(index1, index2);
    row_pair[1] = max(index1, index2);
}

__global__ void
kernel_compute_all_chess_params(int P, int *device_IterBlockToElem)
{
    // Launch: P-1 blocks of P/2 threads. Block b precomputes the pivot pairs
    // for sweep-iteration b; thread t stores its pair (p,q) at row b, slot t.
    int blockID = blockIdx.x;
    int index = blockID * P + threadIdx.x * 2;
    assert(threadIdx.x < P / 2);
    // Fix: the original malloc'd/free'd 2 ints from the device heap per
    // thread — unchecked allocation and pure overhead; a local array suffices.
    int row_pair[2];
    chess_tourney_params(P, row_pair, blockID);
    device_IterBlockToElem[index] = row_pair[0]; //|=(P-1)X(P/2*2)
    device_IterBlockToElem[index + 1] = row_pair[1];
}

// 1 block, P/2 threads: thread t computes the Jacobi rotation (sin, cos) for
// its allotted pivot pair (k,l) of iteration `iter`, writing them into the
// (k,l) slots of device_sine / device_cosine.
__global__ void kernel_compute_params(double *device_A, int P, int iter, double *device_sine, double *device_cosine, int *device_IterBlockToElem)
{
    int localID = threadIdx.x;
    assert(localID < P / 2);
    int k, l;
    double elem, y, d, r, c, s;
    k = device_IterBlockToElem[iter * P + localID * 2];     //row
    l = device_IterBlockToElem[iter * P + localID * 2 + 1]; //col
    elem = device_A[k * P + l];
    y = (device_A[l * P + l] - device_A[k * P + k]) * 0.5;
    d = fabs(y) + sqrt(elem * elem + y * y);
    r = sqrt(elem * elem + d * d);
    if (r < EPSILON)
    {
        // Pivot already (numerically) zero: identity rotation.
        c = 1.0;
        s = 0.0;
    }
    else
    {
        c = d / r;
        // Fix: the original computed y / fabs(y), which is NaN when y == 0;
        // take the sign explicitly, treating y == 0 as positive.
        double sign_y = (y >= 0.0) ? 1.0 : -1.0;
        s = sign_y * elem / r;
    }
    device_cosine[k * P + l] = c;
    device_sine[k * P + l] = s;
}

// P/2 blocks of P threads: block b applies the rotation of its pair (k,l) to
// rows k and l of A, writing the result into X in column-major order (i.e. as
// X^T), so the companion column-update kernel can read it row-wise.
__global__ void kernel_row_update(int iter, double *device_A, double *device_X, int P, double *device_sine, double *device_cosine, int *device_IterBlockToElem)
{
    int localID = threadIdx.x;
    int blockID = blockIdx.x;

    __shared__ int row_pair[2];
    __shared__ double params[2]; //[sin_, cos_]

    // Single thread loads the pair and its rotation parameters into shared
    // memory (fewer global reads at the cost of momentary divergence).
    if (localID == 0)
    {
        row_pair[0] = device_IterBlockToElem[iter * P + blockID * 2];
        row_pair[1] = device_IterBlockToElem[iter * P + blockID * 2 + 1];
        params[0] = device_sine[row_pair[0] * P + row_pair[1]];
        params[1] = device_cosine[row_pair[0] * P + row_pair[1]];
    }
    __syncthreads(); // all P threads now see row_pair(k,l) and params

    //CHECKPOINT: Can you reduce shared-memory bank conflicts here?
    int k = row_pair[0], l = row_pair[1];
    double sin_ = params[0], cos_ = params[1];
    double elem_k = device_A[k * P + localID], elem_l = device_A[l * P + localID];

    // Each block handles one (k,l) pair; each thread one column of that pair.
    // X is col-major, i.e. we write X-transpose.
    device_X[localID * P + k] = elem_k * cos_ - elem_l * sin_;
    device_X[localID * P + l] = elem_k * sin_ + elem_l * cos_;
}

// P/2 blocks of P threads: block b applies the rotation of its pair (k,l) to
// columns k and l of X (stored transposed, so the access is row-wise and
// coalesced), writing the result back into A, and applies the same rotation
// to the accumulated eigenvector matrix (also stored transposed).
__global__ void kernel_col_update(int iter, double *device_A, double *device_X, int P,
                                  double *device_eigenvectors, double *device_sine, double *device_cosine, int *device_IterBlockToElem)
{
    int localID = threadIdx.x;
    int blockID = blockIdx.x;

    __shared__ int col_pair[2];
    __shared__ double params[2]; //[sin_, cos_]
    // Single thread loads the pair and parameters (see kernel_row_update).
    if (localID == 0)
    {
        col_pair[0] = device_IterBlockToElem[iter * P + blockID * 2];
        col_pair[1] = device_IterBlockToElem[iter * P + blockID * 2 + 1];
        params[0] = device_sine[col_pair[0] * P + col_pair[1]];
        params[1] = device_cosine[col_pair[0] * P + col_pair[1]];
    }
    __syncthreads(); // all P threads now see col_pair(k,l) and params

    //CHECKPOINT: Can you reduce shared-memory bank conflicts here?
    int k = col_pair[0], l = col_pair[1];
    double sin_ = params[0], cos_ = params[1];
    double new_eigen_k, new_eigen_l;

    // Row-wise (coalesced) access: both X and the eigenvector matrix are kept
    // transposed, so updating "columns" k,l means updating rows k,l here.
    // (A column-major variant existed but was strided and slower.)
    int kp = k * P + localID, lp = l * P + localID;
    device_A[kp] = device_X[kp] * cos_ - device_X[lp] * sin_;
    device_A[lp] = device_X[kp] * sin_ + device_X[lp] * cos_;
    new_eigen_k = device_eigenvectors[kp] * cos_ - device_eigenvectors[lp] * sin_;
    new_eigen_l = device_eigenvectors[kp] * sin_ + device_eigenvectors[lp] * cos_;
    device_eigenvectors[kp] = new_eigen_k;
    device_eigenvectors[lp] = new_eigen_l;
}

// Sum of |off-diagonal| elements of the strict upper triangle of A (P x P):
// the per-sweep convergence measure checked against THRESHOLD by the driver.
double compute_offset(double *A, int P)
{
    double sum = 0.0;
    for (int i = 0; i < P; i++)
    {
        for (int j = i + 1; j < P; j++)
        {
            sum += fabs(A[i * P + j]);
        }
    }
    return sum;
}

// Largest |element| in the strict upper triangle of A (P x P).
double findmaxUT(double *A, int P)
{
    double temp = -1;
    for (int i = 0; i < P; i++)
    {
        for (int j = i + 1; j < P; j++)
        {
            temp = max(temp, fabs(A[i * P + j]));
        }
    }
485 | return temp; 486 | } 487 | 488 | void GPU_multiply(double *d_A, const int rA, const int cA, double *d_B, const int rB, const int cB, double *d_C, int block_size) 489 | { 490 | dim3 threads(block_size, block_size); 491 | int gridX, gridY; 492 | if (cB % threads.x==0) 493 | gridX = cB/threads.x; 494 | else 495 | gridX = ceil(cB*1.0/threads.x); 496 | if (rA % threads.y==0) 497 | gridY = rA/threads.y; 498 | else 499 | gridY = ceil(rA*1.0/threads.y); 500 | 501 | dim3 grid(gridX, gridY); 502 | if (block_size == 32) 503 | { 504 | kernel_MatMul<32><<>>(d_A, rA, cA, d_B, rB, cB, d_C); 505 | } 506 | else 507 | { 508 | kernel_MatMul<16><<>>(d_A, rA, cA, d_B, rB, cB, d_C); 509 | } 510 | gpuErrchk(cudaPeekAtLastError()); 511 | cudaDeviceSynchronize(); 512 | } 513 | 514 | void GPU_compute_V(double **SIGMA, double *d_D_T, double **U, double **V_T, int N, int P, cudaStream_t *stream1, cudaStream_t *stream2) 515 | { 516 | //V_T = INV-SIGMA * U_T * M 517 | //using pinned memory allocation for INV_SIGMA, U_T 518 | double *INV_SIGMA, *U_T; 519 | size_t double_NP = sizeof(double)*N*P, double_PP = sizeof(double)*P*P, double_NN = sizeof(double)*N*N; 520 | gpuErrchk(cudaMallocHost((void**)&INV_SIGMA, double_NP)); 521 | //INV_SIGMA = (double *)calloc(N*P, sizeof(double)); //|=NXP 522 | memset(INV_SIGMA, 0, double_NP); 523 | for (int i=0; i time_span, time_span2; 589 | double *device_D_T; 590 | gpuErrchk(cudaMalloc((void **)&device_D_T, double_NP)); 591 | 592 | int *device_IterBlockToElem; //to store mapping of P/2 "blocks" to element at (p,q), computed in the first kernel call 593 | gpuErrchk(cudaMalloc((void **)&device_IterBlockToElem, sizeof(int) *(P-1)*P / 2 * 2)); 594 | 595 | /********STREAM PARALLELIZATION********/ 596 | kernel_compute_all_chess_params<<>>(P, device_IterBlockToElem); 597 | s_transpose(D, N, P, D_T); 598 | cudaMemcpyAsync(device_D_T, D_T, double_NP, cudaMemcpyHostToDevice, stream2); 599 | 600 | /*********Implicit Stream Barrier**********/ 601 | cudaFreeHost(D_T); 
//D_T is not required to be stored on the host once copied to the device (~10GB) 602 | gpuErrchk(cudaMallocHost((void**)&A, double_PP)); 603 | eigenvectors = (double *)malloc(double_PP); 604 | gpuErrchk(cudaMallocHost((void**)&eigenvectors_T, double_PP)); 605 | double *device_D, *device_A, *device_X; 606 | gpuErrchk(cudaMalloc((void **)&device_D, double_NP)); 607 | gpuErrchk(cudaMalloc((void **)&device_A, double_PP)); 608 | gpuErrchk(cudaMalloc((void **)&device_X, double_PP)); 609 | gpuErrchk(cudaMemcpy(device_D, D, double_NP, cudaMemcpyHostToDevice)); 610 | 611 | //printf("starting multiplication of D_T*D=A:\n"); 612 | 613 | /* Parallelized matrix multiplication (D_T*D=A) */ 614 | GPU_multiply(device_D_T, P, N, device_D, N, P, device_A, 32); 615 | 616 | s_initialize_identity(eigenvectors_T, P); 617 | double *device_eigenvectors_T; 618 | gpuErrchk(cudaMalloc((void **)&device_eigenvectors_T, double_PP)); 619 | cudaMemcpy(device_eigenvectors_T, eigenvectors_T, double_PP, cudaMemcpyHostToDevice); 620 | 621 | double *device_sine, *device_cosine; 622 | gpuErrchk(cudaMalloc((void **)&device_sine, double_PP)); 623 | gpuErrchk(cudaMalloc((void **)&device_cosine, double_PP)); 624 | cudaMemset(device_sine, 0, double_PP); 625 | cudaMemset(device_cosine, 0, double_PP); 626 | 627 | int grid_size, block_size=P, iter = 0, counter = 0; 628 | double offset_ = THRESHOLD + 1; 629 | if (P%2==0) 630 | grid_size = P / 2; 631 | else 632 | grid_size = P/2+1; 633 | 634 | while (counter < MAX_SWEEPS && offset_ > THRESHOLD) //sweeps 635 | { 636 | iter = 0; 637 | while (iter < P - 1) 638 | { 639 | //Compute rotation parameters for all (p,q): q>p 640 | kernel_compute_params<<<1, grid_size>>>(device_A, P, iter, device_sine, device_cosine, device_IterBlockToElem); 641 | cudaDeviceSynchronize(); 642 | 643 | //row-update kernel 644 | kernel_row_update<<>>(iter, device_A, device_X, P, device_sine, device_cosine, device_IterBlockToElem); 645 | cudaDeviceSynchronize(); 646 | 647 | //col-update & 
eigen-vector update kernel 648 | kernel_col_update<<>>(iter, device_A, device_X, P, device_eigenvectors_T, device_sine, device_cosine, device_IterBlockToElem); 649 | cudaDeviceSynchronize(); 650 | iter++; 651 | } 652 | cudaMemcpy(A, device_A, double_PP, cudaMemcpyDeviceToHost); 653 | 654 | offset_ = compute_offset(A, P); 655 | printf("Sweep:%d, offset:%f\n", counter, offset_); 656 | counter++; 657 | } 658 | 659 | cudaMemcpy(eigenvectors_T, device_eigenvectors_T, double_PP, cudaMemcpyDeviceToHost); 660 | s_transpose(eigenvectors_T, P, P, eigenvectors); 661 | 662 | double *eigenvalues = (double *)malloc(sizeof(double) * P); 663 | int *e_indices = (int *)malloc(sizeof(int) * P); 664 | for (int i = 0; i < P; i++) 665 | { 666 | eigenvalues[i] = A[i * P + i]; 667 | e_indices[i] = i; 668 | } 669 | 670 | //sort eigenvalues in desc: 671 | s_mergesort(eigenvalues, P, e_indices, 0, P - 1); 672 | double temp_ = eigenvalues[0]; 673 | for (int i = 0; i < P; i++) 674 | { 675 | // printf("%f,", eigenvalues[i]); 676 | assert(temp_>=eigenvalues[i]); 677 | temp_=eigenvalues[i]; 678 | } 679 | 680 | cudaError_t err = cudaGetLastError(); 681 | if (err != cudaSuccess) 682 | { 683 | printf("Error: %s\n", cudaGetErrorString(err)); 684 | } 685 | 686 | //computing SIGMA: 687 | double sum_variance = 0.0; 688 | for (int i = 0; i < P; i++) 689 | { 690 | sum_variance+=eigenvalues[i]; 691 | (*SIGMA)[i] = sqrt(eigenvalues[i]); 692 | } 693 | 694 | //computing U: 695 | int index; 696 | for (int row = 0; row < P; row++) 697 | { 698 | index = row*P; 699 | for (int col = 0; col < P; col++) 700 | { 701 | (*U)[index + col] = eigenvectors[index + e_indices[col]]; 702 | } 703 | } 704 | 705 | //compute V_T: 706 | GPU_compute_V(SIGMA, device_D_T, U, V_T, N, P, &stream1, &stream2); 707 | 708 | //Parallelized (PCA): 709 | int K_ = 0; 710 | double retention_ = 0.0; 711 | int count_ = 0; 712 | while((retention_>(t_end - t_begin); 745 | printf("TOTAL TIME:%f\n", time_span.count()); 746 | return; 747 | 748 | 
/****************SERIAL JACOBI EIGENVALUE ALGORITHM (can be used for Speedup Computation):****************/ 749 | // t_begin = high_resolution_clock::now(); 750 | // //begin Jacobi eigenvalue algorithm: 751 | // int state = P, num_iter = 0, m, k, l; //m: pivot row identifier 752 | // double p, y, d, r, c, s, t; //p: pivot element, c: cos, s: sin 753 | // double *E = (double *)malloc(sizeof(double)*P*P); 754 | // s_initialize_identity(E, P); //P*P 755 | // double *E_ = (double *)malloc(sizeof(double) * P * P); 756 | // double *e = (double *)malloc(sizeof(double) * P); //init eigen-values array 757 | // double *e_ = (double *)malloc(sizeof(double) * P); 758 | // int *ind = (int *)malloc(sizeof(int) * P); //init maxindex array 759 | // bool *changed = (bool *)malloc(sizeof(bool) * P); //change in eigen_value[k] 760 | // double *A_s = (double *)calloc(P * P, sizeof(double)); 761 | // D_T = (double *)malloc(sizeof(double) * P * N); 762 | // s_transpose(D, N, P, D_T); 763 | // s_multiply(D_T, P, N, D, N, P, A_s); 764 | 765 | // printf("printing A_s:\n"); 766 | // //print_matrix(A_s, P, P, 1); 767 | // for (int i = 0; i < P; i++) 768 | // { 769 | // ind[i] = s_maxind(A_s, P, i); //NOTE: undefined for last row 770 | // e[i] = A_s[i * P + i]; 771 | // changed[i] = true; 772 | // //printf("%d, %d\n", i, ind[i]); 773 | // } 774 | // while (state && num_iter < MAX_ITER) 775 | // { 776 | // memcpy(E_, E, sizeof(double) * P * P); 777 | // memcpy(e_, e, sizeof(double) * P); 778 | // //find index (k,l) of pivot p 779 | // m = 0; 780 | // for (int i = 1; i < P - 1; i++) 781 | // { 782 | // //printf("i:%d, %d, %f\n", i, ind[i], A[i*P+ind[i]]); 783 | // if (fabs(A_s[i * P + ind[i]]) > fabs(A_s[m * P + ind[m]])) 784 | // { 785 | // m = i; 786 | // } 787 | // } 788 | // k = m; 789 | // l = ind[k]; 790 | // p = A_s[k * P + l]; 791 | // y = 0.5 * (e[l] - e[k]); 792 | // d = fabs(y) + sqrt(p * p + y * y); 793 | // r = sqrt(p * p + d * d); 794 | // c = d / r; 795 | // s = p / r; 796 | // t 
= p * p / d; 797 | // if (y < 0) 798 | // { 799 | // s = -s; 800 | // t = -t; 801 | // } 802 | // A_s[k * P + l] = 0.0; 803 | // s_update(k, -t, e, changed, &state); 804 | // s_update(l, t, e, changed, &state); 805 | 806 | // //rotate rows and cols k and l: 807 | // for (int i = 0; i < k; i++) 808 | // { 809 | // s_rotate(i, k, i, l, A_s, P, c, s); 810 | // } 811 | // for (int i = k + 1; i < l; i++) 812 | // { 813 | // s_rotate(k, i, i, l, A_s, P, c, s); 814 | // } 815 | // for (int i = l + 1; i < P; i++) 816 | // { 817 | // s_rotate(k, i, l, i, A_s, P, c, s); 818 | // } 819 | // //rotate eigenvectors: 820 | // for (int i = 0; i < P; i++) 821 | // { 822 | // double e_ik = c * E[i * P + k] - s * E[i * P + l]; 823 | // double e_il = s * E[i * P + k] + c * E[i * P + l]; 824 | // E[i * P + k] = e_ik; 825 | // E[i * P + l] = e_il; 826 | // } 827 | // ind[k] = s_maxind(A_s, P, k); 828 | // ind[l] = s_maxind(A_s, P, l); 829 | // double diff = l2_diff_norm(e_, e, P); 830 | // double diff_2 = l2_matrix_diff_norm(E_, E, P, P); 831 | // double upper_triangular_sum = s_upper_triangular_sum(A_s, P); 832 | // printf("\rITER:%d, state:%d, diff:%.10f up-sum:%f", num_iter, state, diff + diff_2, upper_triangular_sum); 833 | // fflush(stdout); 834 | // num_iter++; 835 | // } 836 | // //sort eigenvalues in desc: 837 | // int *indices = (int *)malloc(sizeof(int) * P); 838 | // for (int i = 0; i < P; i++) 839 | // { 840 | // indices[i] = i; 841 | // } 842 | // s_mergesort(e, P, indices, 0, P - 1); 843 | // printf("Indices arr:\n"); 844 | // for (int i = 0; i < P; i++) 845 | // { 846 | // printf("%d,", indices[i]); 847 | // } 848 | // printf("\n"); 849 | // printf("e arr:\n"); 850 | // for (int i = 0; i < P; i++) 851 | // { 852 | // printf("%f,", e[i]); 853 | // } 854 | // printf("\n"); 855 | 856 | // // //computing SIGMA: 857 | // // printf("printing sigma:\n"); 858 | // double sum_eigenvalues_s=0.0; 859 | // for (int i = 0; i < P; i++) 860 | // { 861 | // (*SIGMA)[i] = sqrt(e[i]); 862 
| // sum_eigenvalues_s+=e[i]; 863 | // //printf("%f,", (*SIGMA)[i]); 864 | // } 865 | // printf("sum evals_s:%f\n", sum_eigenvalues_s); 866 | // printf("\n"); 867 | // //computing SIGMA_MATRIX: 868 | // double *temp_sigma = (double *)calloc(P * N, sizeof(double)); 869 | // for (int i = 0; i < P; i++) 870 | // { 871 | // //assert(e[i]>=0); 872 | // temp_sigma[i * N + i] = sqrt(e[i]); 873 | // } 874 | 875 | // //eigenvectors matrix (U for D_T*D): 876 | // printf("printing E:\n"); 877 | 878 | // //L2 879 | // double sum_temp=0.0; 880 | // for (int x=0; x>(t_end - t_begin); 945 | // printf("SEQUENTIAL TOTAL TIME:%f\n print matrix", time_span.count()); 946 | // print_matrix(D_T, P, N, 1); 947 | // return; 948 | } 949 | --------------------------------------------------------------------------------