├── LICENSE ├── README.md ├── lib_eigsolve ├── Makefile ├── dsyevd_gpu.F90 ├── dsygst_gpu.F90 ├── dsygvdx_gpu.F90 ├── dsymv_gpu.F90 ├── dsytd2_gpu.F90 ├── dsytrd_gpu.F90 ├── eigsolve_vars.F90 ├── toolbox.F90 ├── zheevd_gpu.F90 ├── zhegst_gpu.F90 ├── zhegvdx_gpu.F90 ├── zhemv_gpu.F90 ├── zhetd2_gpu.F90 └── zhetrd_gpu.F90 └── test_driver ├── Makefile ├── test_dsygvdx.F90 ├── test_zhegvdx.F90 ├── toolbox.F90 └── wallclock.c /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 NVIDIA Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Eigensolver_gpu 2 | GPU Eigensolver for the Quantum ESPRESSO package 3 | 4 | ### 5 | This library implements a generalized eigensolver for symmetric/Hermitian-definite eigenproblems with functionality similar to 6 | the DSYGVD/X or ZHEGVD/X functions available within LAPACK/MAGMA. This solver has fewer dependencies on CPU computation 7 | than comparable implementations within MAGMA, which may benefit systems with limited CPU resources or 8 | users without access to high-performance CPU LAPACK libraries. 9 | 10 | ### 11 | This implementation can be considered a "proof of concept" and has been written to target the Quantum ESPRESSO 12 | code. As such, this implementation is built only to handle one problem configuration of DSYGVD/X or ZHEGVD/X. Specifically, this 13 | solver computes eigenvalues and associated eigenvectors over a specified integer range for a 14 | symmetric/Hermitian-definite eigenproblem of the following form: 15 | 16 | A * x = lambda * B * x 17 | 18 | where `A` and `B` are symmetric/Hermitian matrices and `B` is positive definite. The solver expects the upper-triangular parts of the 19 | input `A` and `B` arguments to be populated. This configuration corresponds to calling DSYGVX/ZHEGVX within LAPACK with the configuration 20 | arguments `ITYPE = 1`, `JOBZ = 'V'`, `RANGE = 'I'`, and `UPLO = 'U'`. 21 | 22 | See comments within `dsygvdx_gpu.F90` or `zhegvdx_gpu.F90` for specific details on usage. 
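For example, a minimal calling sketch in CUDA Fortran, based on the interface documented in `dsygvdx_gpu.F90` (the problem size `N`, range `[il, iu]`, and leading dimensions here are illustrative; see the `test_driver` subdirectory for a complete program):

```fortran
program example
  use cudafor
  use dsygvdx_gpu
  implicit none
  integer, parameter :: N = 1024, il = 1, iu = 32
  integer :: lwork, lwork_h, liwork_h, info
  real(8), allocatable, device :: A(:,:), B(:,:), Z(:,:), w(:), work(:)
  real(8), allocatable, pinned :: Z_h(:,:), w_h(:), work_h(:)
  integer, allocatable, pinned :: iwork_h(:)

  ! Workspace sizes per the requirements documented in dsygvdx_gpu.F90
  lwork    = 2*64*64 + 66*N
  lwork_h  = 1 + 6*N + 2*N*N
  liwork_h = 3 + 5*N
  allocate(A(N,N), B(N,N), Z(N,N), w(N), work(lwork))
  allocate(Z_h(N,N), w_h(N), work_h(lwork_h), iwork_h(liwork_h))

  ! ... populate the upper-triangular parts of A and B on the device,
  !     with B positive definite ...

  ! Computes eigenvalues/eigenvectors il..iu; results land in w/Z on the
  ! device and w_h/Z_h on the host
  call dsygvdx_gpu(N, A, N, B, N, Z, N, il, iu, w, work, lwork, &
                   work_h, lwork_h, iwork_h, liwork_h, Z_h, N, w_h, info)
  if (info /= 0) print *, "dsygvdx_gpu returned info = ", info
end program example
```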
23 | 24 | For additional information about the solver with some performance results, see the presentation at the following link: (will be added 25 | once available publicly on the GTC On-Demand website) 26 | 27 | ### Building 28 | * Compilation of this library requires the PGI compiler version 18.10 or higher. 29 | * Using the provided `Makefile` will generate a static library `lib_eigsolve.a` which can be included in your 30 | target application. 31 | * The library requires linking to cuBLAS and cuSOLVER. Use the `-Mcuda=cublas,cusolver` flag when linking your application to do this. 32 | * This library also requires linking to a CPU LAPACK library with implementations of the `dstedc` and `zstedc` functions. 33 | * If NVTX is enabled with the `-DUSE_NVTX` flag, you must also link to NVTX. Use the `-L${CUDAROOT}/lib64 -lnvToolsExt` flags when linking your application to do this, 34 | where `${CUDAROOT}` is the root directory of your CUDA installation. 35 | 36 | An example of using this solver in a program can be found in the `test_driver` subdirectory. This program performs some performance testing 37 | and validation against existing functionality in a linked CPU LAPACK library, cuSOLVER, and MAGMA (if available). 38 | 39 | ### License 40 | This code is released under the MIT license, which can be found in `LICENSE`. 41 | -------------------------------------------------------------------------------- /lib_eigsolve/Makefile: -------------------------------------------------------------------------------- 1 | # Flags for GPUs with Volta architecture. Change cc70 to cc60 if compiling for Pascal. 2 | FLAGS = -O3 -mp -pgf90libs -Mcuda=cc70,cuda10.1,ptxinfo -Mlarge_arrays 3 | FLAGS2 = -O3 -mp -pgf90libs -Mcuda=cc70,cuda10.1,ptxinfo,maxregcount:64 -Mlarge_arrays 4 | 5 | # For performance reasons, some files are compiled with cc60, even on Volta. Do not update the cc value in FLAGS3! 6 | FLAGS3 = -O3 -mp -pgf90libs -Mcuda=cc60,cuda10.1,ptxinfo,nordc,maxregcount:255 -Mlarge_arrays 7 | 8 | # Uncomment to enable NVTX markers 9 | #OPTFLAGS = -DUSE_NVTX 10 | 11 | all: lib_eigsolve.a 12 | 13 | OBJS = eigsolve_vars.o toolbox.o zhegst_gpu.o zhemv_gpu.o zhetd2_gpu.o zhetrd_gpu.o zheevd_gpu.o zhegvdx_gpu.o \ 14 | dsygst_gpu.o dsymv_gpu.o dsytd2_gpu.o dsytrd_gpu.o dsyevd_gpu.o dsygvdx_gpu.o 15 | 16 | zhetd2_gpu.o : zhetd2_gpu.F90 17 | pgf90 -c ${FLAGS2} ${OPTFLAGS} $*.F90 -o $*.o 18 | zhemv_gpu.o : zhemv_gpu.F90 19 | pgf90 -c ${FLAGS3} ${OPTFLAGS} $*.F90 -o $*.o 20 | dsytd2_gpu.o : dsytd2_gpu.F90 21 | pgf90 -c ${FLAGS2} ${OPTFLAGS} $*.F90 -o $*.o 22 | dsymv_gpu.o : dsymv_gpu.F90 23 | pgf90 -c ${FLAGS3} ${OPTFLAGS} $*.F90 -o $*.o 24 | %.o: %.cuf 25 | pgf90 -c ${FLAGS} ${OPTFLAGS} $*.cuf -o $*.o 26 | %.o: %.F90 27 | pgf90 -c ${FLAGS} ${OPTFLAGS} $*.F90 -o $*.o 28 | 29 | lib_eigsolve.a: $(OBJS) 30 | ar -cr lib_eigsolve.a $(OBJS) 31 | 32 | clean: 33 | rm -f lib_eigsolve.a *.mod *.o 34 | -------------------------------------------------------------------------------- /lib_eigsolve/dsyevd_gpu.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! 
and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module dsyevd_gpu 25 | use cudafor 26 | use cublas 27 | implicit none 28 | 29 | contains 30 | 31 | ! Custom dsyevd routine 32 | subroutine dsyevd_gpu(jobz, uplo, il, iu, N, A, lda, Z, ldz, w, work, lwork, & 33 | work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info) 34 | use dsytrd_gpu 35 | use eigsolve_vars 36 | use nvtx_inters 37 | implicit none 38 | character :: uplo, jobz 39 | integer :: N, NZ, lda, lwork, istat, info 40 | integer :: lwork_h, liwork_h, ldz_h 41 | integer :: i, j, k, nb1, nb2, ib, mi, ldt, ldz, il, iu 42 | real(8), dimension(1:lwork), device :: work 43 | real(8), dimension(1:lwork_h) :: work_h 44 | integer, dimension(1:liwork_h) :: iwork_h 45 | 46 | real(8), dimension(1:lda, 1:N), device :: A 47 | real(8), dimension(1:ldz, 1:N), device :: Z 48 | real(8), dimension(1:ldz_h, 1:N), pinned :: Z_h 49 | real(8), dimension(1:N), device :: w 50 | real(8), dimension(1:N), pinned :: w_h 51 | 52 | integer :: inde, indtau, indwrk, llwork, llwork_h, indwk2, indwk3, llwrk2 53 | real(8), parameter :: one = 1.0_8 54 | 55 | type(dim3) :: blocks, threads 56 | 57 | 58 | if (uplo .ne. 'U' .or. jobz .ne. 'V') then 59 | print*, "Provided jobz/uplo not supported!" 60 | return 61 | endif 62 | 63 | nb1 = 32 ! Blocksize for tridiagonalization 64 | nb2 = min(64, N) ! Blocksize for rotation procedure, capped at 64 65 | ldt = nb2 66 | NZ = iu - il + 1 67 | 68 | inde = 1 69 | indtau = inde + n 70 | indwrk = indtau + n 71 | llwork = lwork - indwrk + 1 72 | llwork_h = lwork_h - indwrk + 1 73 | indwk2 = indwrk + (nb2)*(nb2) 74 | indwk3 = indwk2 + (nb2)*(nb2) 75 | 76 | !JR Note: ADD SCALING HERE IF DESIRED. Not scaling for now. 77 | 78 | ! Call DSYTRD to reduce A to tridiagonal form 79 | call nvtxStartRange("dsytrd", 0) 80 | call dsytrd_gpu('U', N, A, lda, w, work(inde), work(indtau), work(indwrk), llwork, nb1) 81 | call nvtxEndRange 82 | 83 | ! Copy diagonal and superdiagonal to CPU 84 | w_h(1:N) = w(1:N) 85 | work_h(inde:inde+N-1) = work(inde:inde+N-1) 86 | 87 | ! Restore lower triangular part of A (works if called from dsygvd only!) 88 | !$cuf kernel do(2) <<<*,*>>> 89 | do j = 1,N 90 | do i = 1,N 91 | if (i > j) then 92 | A(i,j) = Z(i,j) 93 | endif 94 | end do 95 | end do 96 | 97 | ! Call DSTEDC to get eigenvalues/vectors of tridiagonal A on CPU 98 | call nvtxStartRange("dstedc", 1) 99 | call dstedc('I', N, w_h, work_h(inde), Z_h, ldz_h, work_h(indwrk), llwork_h, iwork_h, liwork_h, istat) 100 | if (istat /= 0) then 101 | write(*,*) "dsyevd_gpu error: dstedc failed!" 102 | info = -1 103 | return 104 | endif 105 | call nvtxEndRange 106 | 107 | ! Copy eigenvectors and eigenvalues to GPU 108 | istat = cudaMemcpy2D(Z(1, 1), ldz, Z_h(1, il), ldz_h, N, NZ) 109 | w(1:N) = w_h(1:N) 110 | 111 | !! 
Call DORMTR to rotate eigenvectors to obtain result for original A matrix 112 | !! JR Note: Eventual function calls from DORMTR called directly here with associated indexing changes 113 | call nvtxStartRange("dormtr", 2) 114 | 115 | istat = cudaEventRecord(event2, stream2) 116 | 117 | k = N-1 118 | 119 | do i = 1, k, nb2 120 | ib = min(nb2, k-i+1) 121 | 122 | ! Form block reflector T in stream 1 123 | call dlarft_gpu(i+ib-1, ib, A(1, 2+i-1), lda, work(indtau + i -1), work(indwrk), ldt, work(indwk2), ldt) 124 | 125 | mi = i + ib - 1 126 | ! Apply reflector to eigenvectors in stream 2 127 | call dlarfb_gpu(mi, NZ, ib, A(1,2+i-1), lda, work(indwrk), ldt, Z, ldz, work(indwk3), N, work(indwk2), ldt) 128 | end do 129 | 130 | call nvtxEndRange 131 | 132 | end subroutine dsyevd_gpu 133 | 134 | subroutine dlarft_gpu(N, K, V, ldv, tau, T, ldt, W, ldw) 135 | use cublas 136 | use eigsolve_vars 137 | implicit none 138 | integer :: N, K, ldv, ldt, ldw 139 | real(8), dimension(ldv, K), device :: V 140 | real(8), dimension(K), device :: tau 141 | real(8), dimension(ldt, K), device :: T 142 | real(8), dimension(ldw, K), device :: W 143 | 144 | integer :: i, j, istat 145 | type(dim3) :: threads 146 | 147 | istat = cublasSetStream(cuHandle, stream1) 148 | 149 | ! Prepare lower triangular part of block column for dsyrk call. 150 | ! Requires zeros in lower triangular portion and ones on diagonal. 151 | ! Store existing entries (excluding diagonal) in W 152 | !$cuf kernel do(2) <<<*, *, 0, stream1>>> 153 | do j = 1, K 154 | do i = N-K + 1, N 155 | if (i-N+K == j) then 156 | V(i, j) = 1.0d0 157 | else if (i-N+k > j) then 158 | W(i-N+k,j) = V(i,j) 159 | V(i,j) = 0.0d0 160 | endif 161 | end do 162 | end do 163 | 164 | istat = cudaEventRecord(event1, stream1) 165 | istat = cudaStreamWaitEvent(stream1, event2, 0) 166 | 167 | ! Form preliminary T matrix 168 | istat = cublasdsyrk_v2(cuHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, K, N, 1.0_8, V, ldv, 0.0_8, T, ldt) 169 | 170 | ! Finish forming T 171 | threads = dim3(64, 16, 1) 172 | call finish_T_block_kernel<<<1, threads, 0, stream1>>>(K, T, ldt, tau) 173 | 174 | end subroutine dlarft_gpu 175 | 176 | subroutine dlarfb_gpu(M, N, K, V, ldv, T, ldt, C, ldc, work, ldwork, W, ldw) 177 | use cublas 178 | use eigsolve_vars 179 | implicit none 180 | integer :: M, N, K, ldv, ldt, ldc, ldw, ldwork, istat 181 | integer :: i, j 182 | real(8), dimension(ldv, K), device :: V 183 | real(8), dimension(ldt, K), device :: T 184 | real(8), dimension(ldw, K), device :: W 185 | real(8), dimension(ldc, N), device :: C 186 | real(8), dimension(ldwork, K), device :: work 187 | 188 | istat = cublasSetStream(cuHandle, stream2) 189 | 190 | istat = cudaStreamWaitEvent(stream2, event1, 0) 191 | istat = cublasdgemm_v2(cuHandle, CUBLAS_OP_T, CUBLAS_OP_N, N, K, M, 1.0d0, C, ldc, v, ldv, 0.0d0, work, ldwork) 192 | istat = cudaStreamSynchronize(stream1) 193 | 194 | istat = cublasdtrmm_v2(cuHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, N, K, & 195 | 1.0d0, T, ldt, work, ldwork, work, ldwork) 196 | 197 | istat = cudaEventRecord(event2, stream2) 198 | istat = cublasdgemm_v2(cuHandle, CUBLAS_OP_N, CUBLAS_OP_T, M, N, K, -1.0d0, V, ldv, work, ldwork, 1.0d0, c, ldc) 199 | 200 | ! 
Restore clobbered section of block column (except diagonal) 201 | !$cuf kernel do(2) <<<*, *>>> 202 | do j = 1, K 203 | do i = M-K + 1, M 204 | if (i-M+k > j) then 205 | V(i,j) = W(i-M+k,j) 206 | endif 207 | end do 208 | end do 209 | 210 | end subroutine dlarfb_gpu 211 | 212 | attributes(global) subroutine finish_T_block_kernel(N, T, ldt, tau) 213 | implicit none 214 | integer, value :: N, ldt 215 | real(8), dimension(ldt, N), device :: T 216 | real(8), dimension(N), device :: tau 217 | ! T_s contains only lower triangular elements of T in linear array, by row 218 | real(8), dimension(2080), shared :: T_s 219 | ! (i,j) --> ((i-1)*i/2 + j) 220 | #define IJ2TRI(i,j) (ISHFT((i-1)*i,-1) + j) 221 | 222 | 223 | integer :: tid, tx, ty, i, j, k, diag 224 | real(8) :: cv 225 | 226 | tx = threadIdx%x 227 | ty = threadIdx%y 228 | tid = (threadIdx%y - 1) * blockDim%x + tx ! Linear thread id 229 | 230 | ! Load T into shared memory 231 | if (tx <= N) then 232 | do j = ty, N, blockDim%y 233 | cv = tau(j) 234 | if (tx > j) then 235 | T_s(IJ2TRI(tx,j)) = -cv*T(tx,j) 236 | else if (tx == j) then 237 | T_s(IJ2TRI(tx,j)) = cv 238 | endif 239 | end do 240 | end if 241 | 242 | call syncthreads() 243 | 244 | ! Perform column by column update by first thread column 245 | do i = N-1, 1, -1 246 | if (ty == 1) then 247 | if (tx > i .and. tx <= N) then 248 | cv = 0.0d0 249 | do j = i+1, tx 250 | cv = cv + T_s(IJ2TRI(j, i)) * T_s(IJ2TRI(tx, j)) 251 | end do 252 | endif 253 | 254 | endif 255 | 256 | call syncthreads() 257 | if (ty == 1 .and. tx > i .and. tx <= N) then 258 | T_s(IJ2TRI(tx, i)) = cv 259 | endif 260 | call syncthreads() 261 | 262 | end do 263 | 264 | call syncthreads() 265 | 266 | 267 | ! Write T_s to global 268 | if (tx <= N) then 269 | do j = ty, N, blockDim%y 270 | if (tx >= j) then 271 | T(tx,j) = T_s(IJ2TRI(tx,j)) 272 | endif 273 | end do 274 | end if 275 | 276 | end subroutine finish_T_block_kernel 277 | 278 | end module dsyevd_gpu 279 | -------------------------------------------------------------------------------- /lib_eigsolve/dsygst_gpu.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module dsygst_gpu 25 | use cudafor 26 | use cublas 27 | 28 | contains 29 | 30 | ! 
dsygst completed in blocks, using 2 ztrsms to solve subblock problem on GPU 31 | subroutine dsygst_gpu(itype, uplo, N, A, lda, B, ldb, nb) 32 | use eigsolve_vars 33 | implicit none 34 | integer, intent(in) :: itype, N, lda, ldb, nb 35 | character, intent(in) :: uplo 36 | real(8), device, dimension(1:ldb, 1:N), intent(in) :: B 37 | real(8), device, dimension(1:lda, 1:N) :: A 38 | real(8), parameter :: one = 1.d0, half = 0.5d0 39 | 40 | integer :: i, j 41 | integer :: k, kb, istat 42 | 43 | if (itype .ne. 1 .or. uplo .ne. 'U') then 44 | print*, "Provided itype/uplo not supported!" 45 | return 46 | endif 47 | 48 | istat = cudaEventRecord(event2, stream2) 49 | 50 | do k = 1, N, nb 51 | kb = min(N-k+1, nb) 52 | 53 | istat = cublasSetStream(cuHandle, stream1) 54 | 55 | istat = cudaStreamWaitEvent(stream1, event2, 0) 56 | ! Populate subblock with complete symmetric entries (needed for DTRSM calls) 57 | !$cuf kernel do(2) <<<*,*, 0, stream1>>> 58 | do j = k,k+kb-1 59 | do i = k,k+kb-1 60 | if (j < i) then 61 | A(i,j) = A(j,i) 62 | endif 63 | end do 64 | end do 65 | 66 | ! Solve subblock problem (this version results in fully populated A subblock) 67 | istat = cublasdtrsm_v2(cuHandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_OP_N, kb, kb, & 68 | one, B(k,k), ldb, A(k,k), lda) 69 | istat = cublasdtrsm_v2(cuHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_OP_N, kb, kb, & 70 | one, B(k,k), ldb, A(k,k), lda) 71 | 72 | istat = cudaEventRecord(event1, stream1) 73 | 74 | if (k + kb .le. N) then 75 | istat = cublasSetStream(cuHandle, stream2) 76 | istat = cublasdtrsm_v2(cuHandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, CUBLAS_OP_N, kb, N-k-kb+1, one, & 77 | B(k, k), ldb, A(k, k+kb), lda) 78 | 79 | istat = cudaStreamWaitEvent(stream2, event1, 0) 80 | 81 | ! Since the A subblock is fully populated, use gemm instead of hemm here 82 | istat = cublasdgemm_v2(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, kb, N-k-kb+1, kb, -half, A(k,k), & 83 | lda, B(k, k+kb), ldb, one, A(k, k+kb), lda) 84 | istat = cublasdsyr2k_v2(cuHandle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_T, N-k-kb+1, kb, -one, A(k, k+kb), lda, & 85 | B(k, k+kb), ldb, one, A(k+kb, k+kb), lda) 86 | 87 | istat = cudaEventRecord(event2, stream2) 88 | 89 | istat = cublasdgemm_v2(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, kb, N-k-kb+1, kb, -half, A(k,k), & 90 | lda, B(k, k+kb), ldb, one, A(k, k+kb), lda) 91 | 92 | istat = cublasdtrsm_v2(cuHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_OP_N, kb, N-k-kb+1, one, & 93 | B(k+kb, k+kb), ldb, A(k, k+kb), lda) 94 | end if 95 | 96 | end do 97 | 98 | end subroutine dsygst_gpu 99 | 100 | end module dsygst_gpu 101 | -------------------------------------------------------------------------------- /lib_eigsolve/dsygvdx_gpu.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! 
all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module dsygvdx_gpu 25 | use cudafor 26 | use cublas 27 | implicit none 28 | 29 | contains 30 | 31 | ! dsygvdx_gpu 32 | ! This solver computes eigenvalues and associated eigenvectors over a specified integer range for a 33 | ! symmetric-definite eigenproblem in the following form: 34 | ! A * x = lambda * B * x 35 | ! where A and B are symmetric matrices and B is positive definite. The solver expects the upper-triangular parts of the 36 | ! input A and B arguments to be populated. This configuration corresponds to calling DSYGVX within LAPACK with the configuration 37 | ! arguments 'ITYPE = 1', 'JOBZ = 'V'', 'RANGE = 'I'', and 'UPLO = 'U''. 38 | ! 39 | ! Input: 40 | ! On device: 41 | ! - A(lda, N), B(ldb, N) are real(8) matrices on device with upper triangular portion populated 42 | ! - il, iu are integers specifying range of eigenvalues/vectors to compute. Range is [il, iu] 43 | ! - work is a real(8) array for real workspace of length lwork. 44 | ! - lwork is an integer specifying length of work. lwork >= 2*64*64 + 66*N 45 | ! 46 | ! On host: 47 | ! - work_h is a real(8) array for real workspace of length lwork_h. 48 | ! - lwork_h is an integer specifying length of work_h. lwork_h >= 1 + 6*N + 2*N*N 49 | ! - iwork_h is an integer array for integer workspace of length liwork_h. 50 | ! - liwork_h is an integer specifying length of iwork_h. liwork_h >= 3 + 5*N 51 | ! - (optional) _skip_host_copy is an optional logical argument. If .TRUE., memcopy of final updated eigenvectors from 52 | ! device to host will be skipped. 53 | ! 54 | ! Output: 55 | ! On device: 56 | ! - A(lda, N), B(ldb, N) are modified on exit. The upper triangular part of A, including the diagonal, is destroyed. 57 | ! B is overwritten by the triangular Cholesky factor U corresponding to B = U**T * U 58 | ! - Z(ldz, N) is a real(8) matrix on the device. On exit, the first iu - il + 1 columns of Z 59 | ! contain normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. 60 | ! - w(N) is a real(8) array on the device. On exit, the first iu - il + 1 values of w contain the computed 61 | ! eigenvalues 62 | ! 63 | ! On host: 64 | ! - Z_h(ldz_h, N) is a real(8) matrix on the host. On exit, the first iu - il + 1 columns of Z_h 65 | ! contain normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. This is a copy of the Z 66 | ! matrix on the device. This is only true if optional argument _skip_host_copy is not provided or is set to .FALSE. 67 | ! - w_h(N) is a real(8) array on the host. On exit, the first iu - il + 1 values of w_h contain the computed 68 | ! eigenvalues. This is a copy of the w array on the device. 69 | ! - info is an integer. info will equal zero if the function completes successfully. Otherwise, there was an error. 70 | ! 
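! For reference, an illustrative correspondence (not part of the original library):
! with host copies A_h/B_h of the device matrices, the closest CPU LAPACK call
! would be DSYGVX with the configuration arguments noted above, e.g.:
!
!   call dsygvx(1, 'V', 'I', 'U', N, A_h, lda, B_h, ldb, 0.d0, 0.d0, il, iu, &
!               abstol, m, w_h, Z_h, ldz_h, work, lwork, iwork, ifail, info)
!
! where abstol, m, iwork, and ifail follow the standard LAPACK DSYGVX interface.
!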
71 | subroutine dsygvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, & 72 | work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info, _skip_host_copy) 73 | use eigsolve_vars 74 | use nvtx_inters 75 | use dsygst_gpu 76 | use dsyevd_gpu 77 | implicit none 78 | integer :: N, m, lda, ldb, ldz, il, iu, ldz_h, info, nb 79 | integer :: lwork_h, liwork_h, lwork, istat 80 | real(8), dimension(1:lwork), device :: work 81 | real(8), dimension(1:lwork_h), pinned :: work_h 82 | integer, dimension(1:liwork_h), pinned :: iwork_h 83 | logical, optional :: _skip_host_copy 84 | 85 | real(8), dimension(1:lda, 1:N), device :: A 86 | real(8), dimension(1:ldb, 1:N), device :: B 87 | real(8), dimension(1:ldz, 1:N), device :: Z 88 | real(8), dimension(1:ldz_h, 1:N), pinned :: Z_h 89 | real(8), dimension(1:N), device :: w 90 | real(8), dimension(1:N), pinned :: w_h 91 | 92 | real(8), parameter :: one = 1.d0 93 | integer :: i, j 94 | logical :: skip_host_copy 95 | 96 | info = 0 97 | skip_host_copy = .FALSE. 98 | if(present(_skip_host_copy)) skip_host_copy = _skip_host_copy 99 | 100 | ! Check workspace sizes 101 | if (lwork < 2*64*64 + 66*N) then 102 | print*, "dsygvdx_gpu error: lwork must be at least 2*64*64 + 66*N" 103 | info = -1 104 | return 105 | else if (lwork_h < 1 + 6*N + 2*N*N) then 106 | print*, "dsygvdx_gpu error: lwork_h must be at least 1 + 6*N + 2*N*N" 107 | info = -1 108 | return 109 | else if (liwork_h < 3 + 5*N) then 110 | print*, "dsygvdx_gpu error: liwork_h must be at least 3 + 5*N" 111 | info = -1 112 | return 113 | endif 114 | 115 | m = iu - il + 1 ! Number of eigenvalues/vectors to compute 116 | 117 | if(initialized == 0) call init_eigsolve_gpu 118 | 119 | ! Compute Cholesky factorization of B 120 | call nvtxStartRange("cusolverdnDpotrf", 0) 121 | istat = cusolverDnDpotrf(cusolverHandle, CUBLAS_FILL_MODE_UPPER, N, B, ldb, work, lwork, devInfo_d) 122 | istat = devInfo_d 123 | call nvtxEndRange 124 | if (istat .ne. 0) then 125 | print*, "dsygvdx_gpu error: cusolverDnDpotrf failed!" 126 | info = -1 127 | return 128 | endif 129 | 130 | ! Store lower triangular part of A in Z 131 | !$cuf kernel do(2) <<<*,*, 0, stream1>>> 132 | do j = 1,N 133 | do i = 1,N 134 | if (i > j) then 135 | Z(i,j) = A(i,j) 136 | endif 137 | end do 138 | end do 139 | 140 | 141 | ! Reduce to standard eigenproblem 142 | nb = 448 143 | call nvtxStartRange("dsygst_gpu", 1) 144 | call dsygst_gpu(1, 'U', N, A, lda, B, ldb, nb) 145 | call nvtxEndRange 146 | 147 | ! Tridiagonalize and compute eigenvalues/vectors 148 | call nvtxStartRange("dsyevd_gpu", 2) 149 | call dsyevd_gpu('V', 'U', il, iu, N, A, lda, Z, ldz, w, work, lwork, & 150 | work_h, lwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info) 151 | call nvtxEndRange 152 | 153 | ! Triangle solve to get eigenvectors for original general eigenproblem 154 | call nvtxStartRange("cublasDtrsm", 3) 155 | call cublasDtrsm('L', 'U', 'N', 'N', N, (iu - il + 1), one, B, ldb, Z, ldz) 156 | call nvtxEndRange 157 | 158 | ! Copy final eigenvectors to host 159 | if (.not. skip_host_copy) then 160 | istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m) 161 | if (istat .ne. 0) then 162 | print*, "dsygvdx_gpu error: cudaMemcpy2D failed!" 163 | info = -1 164 | return 165 | endif 166 | endif 167 | 168 | end subroutine dsygvdx_gpu 169 | 170 | end module dsygvdx_gpu 171 | -------------------------------------------------------------------------------- /lib_eigsolve/dsymv_gpu.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! 
Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module dsymv_gpu 25 | use cudafor 26 | 27 | contains 28 | 29 | #define BX 32 30 | #define BY 8 31 | #define NTILES 4 32 | 33 | attributes(global) subroutine dsymv_gpu(N, A, lda, x, y) 34 | use cudafor 35 | implicit none 36 | 37 | integer, value :: N, lda 38 | real(8), dimension(lda, N), device, intent(in) :: A 39 | real(8), dimension(N), device, intent(in) :: x 40 | real(8), dimension(N), device :: y 41 | 42 | real(8), dimension(BX+1, BX), shared :: Ar_s 43 | real(8), dimension(BX), shared :: r_s 44 | 45 | integer :: tx, ty, ii, jj, i, j, k, istat 46 | real(8) :: rv1, rv2, mysum 47 | real(8) :: Ar, xl 48 | 49 | ! ii,jj is index of top left corner of block 50 | ii = (blockIdx%y-1) * blockDim%x + 1 51 | 52 | mysum = 0.0_8 53 | 54 | tx = threadIdx%x 55 | ty = threadIdx%y 56 | 57 | if (ii + (blockIdx%x-1)*blockDim%x > N) return 58 | 59 | i = ii + tx - 1 60 | if (i <= N) then 61 | xl = x(i) ! read part of x for lower triangular multiply 62 | endif 63 | 64 | ! Loop over columns (skip all lower triangular blocks) 65 | do jj = ii + (blockIdx%x-1)*blockDim%x, N, gridDim%x*blockDim%x 66 | j = jj + ty - 1 67 | 68 | ! Load block into shared memory 69 | ! CASE 1: Diagonal block 70 | if (ii == jj) then 71 | 72 | ! Load full block into shared memory 73 | do k = 0,NTILES-1 74 | if (i <= N .and. j + k * blockDim%y <= N) then 75 | Ar_s(tx, ty + k * blockDim%y) = A(i,j + k * blockDim%y) 76 | endif 77 | end do 78 | 79 | call syncthreads() 80 | 81 | ! Reflect to populate lower triangular part with true values of A 82 | do k = 0,NTILES-1 83 | if (tx > ty + k * blockDim%y) then 84 | Ar_s(tx, ty + k * blockDim%y) = Ar_s(ty + k * blockDim%y, tx) 85 | endif 86 | end do 87 | 88 | call syncthreads() 89 | 90 | do k = 0,NTILES-1 91 | if (i <= N .and. j + k * blockDim%y <= N ) then 92 | mysum = mysum + Ar_s(tx, ty + k * blockDim%y) * x(j + k*blockDim%y) 93 | endif 94 | end do 95 | 96 | !call syncthreads() 97 | 98 | ! CASE 2: Upper triangular block 99 | else if (ii < jj) then 100 | do k = 0,NTILES-1 101 | if (j + k * blockDim%y <= N) then 102 | Ar = A(i, j + k * blockDim%y) 103 | endif 104 | 105 | if (i <= N .and. j + k * blockDim%y <= N ) then 106 | mysum = mysum + Ar * x(j + k*blockDim%y) 107 | endif 108 | 109 | ! Perform product for symmetric lower block here 110 | if (i <= N .and. 
j + k*blockDim%y <= N) then 111 | rv1 = Ar * xl 112 | else 113 | rv1 = 0.0_8 114 | endif 115 | 116 | !Partial sum within warps using shuffle 117 | rv2 = __shfl_down(rv1,1) 118 | rv1 = rv1 + rv2 119 | rv2 = __shfl_down(rv1,2) 120 | rv1 = rv1 + rv2 121 | rv2 = __shfl_down(rv1,4) 122 | rv1 = rv1 + rv2 123 | rv2 = __shfl_down(rv1,8) 124 | rv1 = rv1 + rv2 125 | rv2 = __shfl_down(rv1,16) 126 | rv1 = rv1 + rv2 127 | 128 | if (tx == 1) then 129 | r_s(ty + k*blockDim%y) = rv1 130 | endif 131 | enddo 132 | 133 | call syncthreads() 134 | 135 | if (ty == 1 .and. jj+tx-1 <= N) then 136 | istat = atomicadd(y(jj + tx -1), r_s(tx)) 137 | endif 138 | !call syncthreads() 139 | 140 | endif 141 | 142 | call syncthreads() 143 | 144 | end do 145 | 146 | if (i <= N) then 147 | istat = atomicadd(y(i), mysum) 148 | endif 149 | 150 | end subroutine dsymv_gpu 151 | 152 | end module dsymv_gpu 153 | 154 | -------------------------------------------------------------------------------- /lib_eigsolve/dsytd2_gpu.F90: -------------------------------------------------------------------------------- 1 | module dsytd2_gpu 2 | contains 3 | attributes(global) subroutine dsytd2_gpu(n,a,lda,d,e,tau) 4 | use cudafor 5 | implicit none 6 | integer, value :: lda 7 | real(8),device :: a(lda,32),tau(32) 8 | real(8),device :: d(32),e(32) 9 | real(8),shared :: a_s(32,32) 10 | real(8),shared :: alpha 11 | real(8),shared :: taui 12 | real(8) :: beta 13 | real(8) :: alphar 14 | real(8) :: xnorm,x,y,z,w 15 | real(8) :: wc 16 | integer, value :: n 17 | integer :: tx,ty,tl,i,j,ii 18 | 19 | tx=threadIdx%x 20 | ty=threadIdx%y 21 | ! Linear id of the thread (tx,ty) 22 | tl=tx+ blockDim%x*(ty-1) 23 | 24 | ! Load a_d in shared memory 25 | if (tx <= N .and. ty <= N) then 26 | a_s(tx ,ty )=a(tx ,ty) 27 | endif 28 | 29 | call syncthreads() 30 | ! Symmetric matrix from upper triangular 31 | if (tx >ty) then 32 | a_s(tx,ty)=a_s(ty,tx) 33 | end if 34 | 35 | 36 | call syncthreads() 37 | 38 | ! For each column working backward 39 | do i=n-1,1,-1 40 | ! Generate elementary reflector 41 | ! Sum the vectors above the diagonal, only one warp active 42 | ! Reduce in a warp 43 | if (tl <=32) then 44 | if (tl nb) then 51 | write(*,*), "Provided work array must be sized (nb+2)*N or greater!" 52 | return 53 | endif 54 | 55 | ldwork = N 56 | 57 | istat = cublasSetStream(cuHandle, stream1) 58 | 59 | kk = N-((N-32) / nb) * nb 60 | k = N+1 61 | do i = N-nb+1, kk+1, -nb 62 | ! Reduce columns i:i+nb-1 to tridiagonal form 63 | call dlatrd_gpu(uplo, i+nb-1, nb, A, lda, e, tau, work, ldwork) 64 | 65 | ! Update trailing submatrix 66 | call cublasdsyr2k(uplo, 'N', i-1, nb, -one, A(1, i), lda, work, ldwork, one, a, lda) 67 | 68 | k = k - nb 69 | 70 | end do 71 | 72 | ! Finish any remaining columns to get final 32x32 block 73 | nb = k - 32 - 1 74 | i = k - nb 75 | 76 | if (nb > 0) then 77 | ! Reduce columns i:i+nb-1 to tridiagonal form 78 | call dlatrd_gpu(uplo, i+nb-1, nb, A, lda, e, tau, work, ldwork) 79 | 80 | ! Update trailing submatrix 81 | call cublasdsyr2k(uplo, 'N', i-1, nb, -one, A(1, i), lda, work, ldwork, one, a, lda) 82 | endif 83 | 84 | ! Final block 85 | threads = dim3(32, 32, 1) 86 | call dsytd2_gpu<<<1, threads>>>(min(32, N), A, lda, d, e, tau) 87 | 88 | ! Copy superdiagonal back into A, store diagonal in d 89 | !$cuf kernel do(1) <<<*,*>>> 90 | do j = 33, N 91 | !A(j-1, j) = e(j-1) ! 
JR Not strictly needed so skipping this copy 92 | d(j) = A(j,j) 93 | end do 94 | 95 | end subroutine dsytrd_gpu 96 | 97 | 98 | subroutine dlatrd_gpu(uplo, N, nb, A, lda, e, tau, W, ldw) 99 | use eigsolve_vars 100 | use dsymv_gpu 101 | implicit none 102 | character :: uplo 103 | integer :: N, nb, lda, ldw, istat 104 | integer :: i, j, k, iw 105 | integer :: blocks, threads 106 | real(8), dimension(1:lda, 1:N), device :: A 107 | real(8), dimension(1:ldw, 1:nb), device :: W 108 | real(8), dimension(N-1), device :: tau 109 | real(8), dimension(N-1), device :: e 110 | real(8), parameter :: one = 1.0d0, zero = 0.0d0, half = 0.5d0 111 | 112 | type(dim3) :: threads2D, blocks2D 113 | 114 | if (uplo .ne. 'U') then 115 | print*, "Provided uplo type not supported!" 116 | return 117 | endif 118 | 119 | threads2D = dim3(32,8,1) 120 | threads = 256 121 | 122 | if (N <= 0) return 123 | 124 | ! Complete first iteration outside loop 125 | if (N > 1) then 126 | iw = nb 127 | ! Generate elementary reflector H(i) to annihilate A(1:i-2, i) 128 | call dlarfg_kernel<<<1, threads>>>(N-1, e(N-1), A(1, N), tau(N-1)) 129 | 130 | !$cuf kernel do(1) <<<*,*>>> 131 | do k = 1, N-1 132 | W(k,iw) = 0.d0 133 | end do 134 | 135 | blocks2D = dim3(10, ceiling(real(N-1)/32), 1) !JR TODO: What is optimal number of columns for our problem size? 136 | call dsymv_gpu<<>>(N-1, A, lda, A(1, N), W(1, iw)) 137 | 138 | call finish_W_col_kernel<<<1, threads>>>(N-1, tau(N-1), A(1, N), W(1, iw)) 139 | endif 140 | 141 | do i = N-1, N-nb+1, -1 142 | iw = i-N+nb 143 | 144 | blocks2D = dim3(ceiling(real(max(i, N-i))/32), ceiling(real(N-i)/8), 1) 145 | !call dsyr2_mv_kernel<<>>(i, N-i, A(1, i+1), lda, W(1, iw+1), ldw, A(1, i), W(1, iw), ldw) 146 | call dsyr2_mv_dlarfg_kernel<<>>(i, N-i, A(1, i+1), lda, W(1, iw+1), ldw, A(1, i), W(1, iw), ldw, e(i-1), tau(i-1), finished(1)) 147 | 148 | if (i > 1) then 149 | ! Generate elementary reflector H(i) to annihilate A(1:i-2, i) 150 | !call dlarfg_kernel<<<1, threads>>>(i-1, e(i-1), A(1, i), tau(i-1)) 151 | 152 | blocks2D = dim3(10, ceiling(real(i-1)/32), 1) !JR TODO: What is optimal number of columns for our problem size? 153 | call dsymv_gpu<<>>(i-1, A, lda, A(1, i), W(1, iw)) 154 | 155 | blocks2D = dim3(ceiling(real(i-1)/32), ceiling(real(2*(n-i))/8), 1) 156 | call stacked_dgemv_T<<>>(n-i, i-1, A(1,i+1), lda, W(1, iw+1), ldw, A(1,i), W(i+1, iw), W(i+1, iw+1)) 157 | !call stacked_dgemv_N<<>>(i-1, n-i, A(1,i+1), lda, W(1, iw+1), ldw, W(i+1,iw), W(i+1, iw+1), W(1, iw)) 158 | call stacked_dgemv_N_finish_W<<>>(i-1, n-i, A(1,i+1), lda, W(1, iw+1), ldw, W(i+1,iw), W(i+1, iw+1), W(1, iw), tau(i-1), A(1, i), finished(1)) 159 | 160 | !call finish_W_col_kernel<<<1, threads>>>(i-1, tau(i-1), A(1, i), W(1, iw)) 161 | 162 | end if 163 | end do 164 | end subroutine dlatrd_gpu 165 | 166 | attributes(global) subroutine dsyr2_mv_kernel(N, M, V, ldv, W, ldw, x, W2, ldw2) 167 | implicit none 168 | integer, value :: N, M, ldv, ldw, ldw2 169 | real(8), dimension(1:ldv, 1:M), device, intent(in) :: V 170 | real(8), dimension(1:ldw, 1:M), device, intent(in) :: W 171 | real(8), dimension(1:ldw2, 2), device :: W2 172 | real(8), dimension(1:N), device :: x 173 | 174 | integer :: i, j, istat 175 | real(8) :: rv 176 | 177 | i = (blockIdx%x - 1) * blockDim%x + threadIdx%x 178 | j = (blockIdx%y - 1) * blockDim%y + threadIdx%y 179 | 180 | if (i <= N .and. j <= M) then 181 | 182 | rv = -W(N, j) * V(i,j) - V(N, j) * W(i,j) 183 | 184 | ! Update x 185 | istat = atomicadd(x(i), rv) 186 | endif 187 | 188 | if (threadIdx%y == 1) then 189 | ! 
Zero out column for zhemv call 190 | if (i <= N) W2(i, 1) = 0 191 | ! Zero out workspace for intermediate zgemv results 192 | if (i <= M) then 193 | W2(N + i, 1) = 0 194 | W2(N + i, 2) = 0 195 | endif 196 | endif 197 | 198 | end subroutine dsyr2_mv_kernel 199 | 200 | attributes(global) subroutine dlarfg_kernel(N, e, x, tau) 201 | implicit none 202 | integer, value :: N 203 | real(8), device :: tau 204 | real(8), device :: e 205 | real(8), dimension(N), device :: x 206 | 207 | integer :: tid, i, j, nb, istat, laneID 208 | real(8) :: rv1, rv2, rv3, scal, scal2, alphar, beta, rsum 209 | real(8), shared :: xnorm 210 | real(8), shared :: alpha_s 211 | 212 | tid = threadIdx%x 213 | laneID = iand(tid, 31) 214 | 215 | if (tid == 1) then 216 | alpha_s = x(N) 217 | xnorm = 0.0_8 218 | endif 219 | 220 | call syncthreads() 221 | 222 | alphar = alpha_s 223 | rsum = 0.0_8 224 | 225 | nb = ceiling(real(N)/blockDim%x) ! number of blocks down column 226 | 227 | i = tid 228 | do j = 1, nb 229 | 230 | ! All threads perform their product, zero if out of bounds 231 | if (i <= N-1) then 232 | rv1 = x(i) 233 | rv1 = rv1 * rv1 234 | else 235 | rv1 = 0.0_8 236 | endif 237 | 238 | rsum = rsum + rv1 239 | 240 | i = i + blockDim%x 241 | end do 242 | 243 | ! Partial sum within warps using shuffle 244 | rv1 = rsum 245 | rv2 = __shfl_down(rv1,1) 246 | rv1 = rv1 + rv2 247 | rv2 = __shfl_down(rv1,2) 248 | rv1 = rv1 + rv2 249 | rv2 = __shfl_down(rv1,4) 250 | rv1 = rv1 + rv2 251 | rv2 = __shfl_down(rv1,8) 252 | rv1 = rv1 + rv2 253 | rv2 = __shfl_down(rv1,16) 254 | rv1 = rv1 + rv2 255 | 256 | if (laneID == 1) then 257 | istat = atomicadd(xnorm, rv1) 258 | endif 259 | 260 | call syncthreads() 261 | 262 | if (xnorm == 0.0_8) then 263 | if (tid == 1) then 264 | tau = 0.0_8 265 | endif 266 | else 267 | if (tid == 1) then 268 | xnorm = sqrt(xnorm) 269 | rv1 = abs(alphar) 270 | 271 | ! not taking abs of xnorm 272 | scal = max(rv1, xnorm) 273 | scal2 = min(rv1, xnorm) 274 | 275 | if (scal2 .eq. 0.0d0) then 276 | beta = -sign(scal, alphar) 277 | else 278 | beta = -sign(scal * sqrt(1.0d0 + (scal2/scal)**2), alphar) 279 | endif 280 | 281 | tau = (beta - alphar)/beta 282 | 283 | e = beta ! store beta in e vector 284 | alpha_s = 1.d0/(alphar - beta) !scaling factor for dscal 285 | endif 286 | 287 | call syncthreads() 288 | 289 | do i = tid, N, blockDim%x 290 | 291 | if (i <= N-1) then 292 | x(i) = alpha_s * x(i) 293 | elseif (i == N) then 294 | x(i) = 1.0_8 295 | endif 296 | 297 | end do 298 | 299 | endif 300 | 301 | end subroutine dlarfg_kernel 302 | 303 | attributes(global) subroutine dsyr2_mv_dlarfg_kernel(N, M, V, ldv, W, ldw, x, W2, ldw2, e, tau, finished) 304 | implicit none 305 | integer, value :: N, M, ldv, ldw, ldw2 306 | real(8), dimension(1:ldv, 1:M), device, intent(in) :: V 307 | real(8), dimension(1:ldw, 1:M), device, intent(in) :: W 308 | real(8), dimension(1:ldw2, 2), device :: W2 309 | real(8), dimension(1:N), device :: x 310 | real(8), device :: tau 311 | real(8), device :: e 312 | 313 | integer :: i, j, tx, ty, tid, nb, laneid, istat, nBlocks 314 | integer, device :: finished 315 | integer, shared :: nFinished 316 | real(8) :: rv 317 | real(8) :: rv1, rv2, rv3, scal, scal2, alphar, beta, rsum 318 | real(8), shared :: xnorm 319 | real(8), shared :: alpha_s 320 | 321 | tx = threadIdx%x 322 | ty = threadIdx%y 323 | i = (blockIdx%x - 1) * blockDim%x + tx 324 | j = (blockIdx%y - 1) * blockDim%y + ty 325 | 326 | nBlocks = gridDim%x * gridDim%y 327 | 328 | if (i <= N .and. 
j <= M) then 329 | 330 | rv = -W(N, j) * V(i,j) - V(N, j) * W(i,j) 331 | 332 | ! Update x 333 | istat = atomicadd(x(i), rv) 334 | endif 335 | 336 | if (ty == 1) then 337 | ! Zero out column for dgemv call 338 | if (i <= N) W2(i, 1) = 0 339 | ! Zero out workspace for intermediate dgemv results 340 | if (i <= M) then 341 | W2(N + i, 1) = 0 342 | W2(N + i, 2) = 0 343 | endif 344 | endif 345 | 346 | call threadfence() 347 | 348 | nFinished = 0 349 | call syncthreads() 350 | if (tx + ty == 2) nFinished = atomicinc(finished, nBlocks-1) 351 | call syncthreads() 352 | 353 | if ( nFinished < nBlocks - 1) return 354 | 355 | ! Begin dlarfg work with last block 356 | if (N == 1) return 357 | 358 | tid = tx + (ty - 1) * blockDim%x 359 | laneID = iand(tid, 31) 360 | 361 | if (tid == 1) then 362 | alpha_s = x(N-1) 363 | xnorm = 0.0_8 364 | endif 365 | 366 | call syncthreads() 367 | 368 | alphar = alpha_s 369 | rsum = 0.0_8 370 | 371 | nb = ceiling(real(N-1)/blockDim%x*blockDim%y) ! number of blocks down column 372 | 373 | i = tid 374 | do j = 1, nb 375 | 376 | ! All threads perform their product, zero if out of bounds 377 | if (i <= N-2) then 378 | rv1 = x(i) 379 | rv1 = rv1 * rv1 380 | else 381 | rv1 = 0.0_8 382 | endif 383 | 384 | rsum = rsum + rv1 385 | 386 | i = i + blockDim%x*blockDim%y 387 | end do 388 | 389 | ! Partial sum within warps using shuffle 390 | rv1 = rsum 391 | rv2 = __shfl_down(rv1,1) 392 | rv1 = rv1 + rv2 393 | rv2 = __shfl_down(rv1,2) 394 | rv1 = rv1 + rv2 395 | rv2 = __shfl_down(rv1,4) 396 | rv1 = rv1 + rv2 397 | rv2 = __shfl_down(rv1,8) 398 | rv1 = rv1 + rv2 399 | rv2 = __shfl_down(rv1,16) 400 | rv1 = rv1 + rv2 401 | 402 | if (laneID == 1) then 403 | istat = atomicadd(xnorm, rv1) 404 | endif 405 | 406 | call syncthreads() 407 | 408 | if (xnorm == 0.0_8) then 409 | if (tid == 1) then 410 | tau = 0.0_8 411 | endif 412 | else 413 | if (tid == 1) then 414 | xnorm = sqrt(xnorm) 415 | rv1 = abs(alphar) 416 | 417 | ! not taking abs of xnorm 418 | scal = max(rv1, xnorm) 419 | scal2 = min(rv1, xnorm) 420 | 421 | if (scal2 .eq. 0.0d0) then 422 | beta = -sign(scal, alphar) 423 | else 424 | beta = -sign(scal * sqrt(1.0d0 + (scal2/scal)**2), alphar) 425 | endif 426 | 427 | tau = (beta - alphar)/beta 428 | 429 | e = beta ! store beta in e vector 430 | alpha_s = 1.d0/(alphar - beta) !scaling factor for dscal 431 | endif 432 | 433 | call syncthreads() 434 | 435 | do i = tid, N-1, blockDim%x*blockDim%y 436 | 437 | if (i <= N-2) then 438 | x(i) = alpha_s * x(i) 439 | elseif (i == N-1) then 440 | x(i) = 1.0_8 441 | endif 442 | 443 | end do 444 | 445 | endif 446 | 447 | end subroutine dsyr2_mv_dlarfg_kernel 448 | 449 | attributes(global) subroutine stacked_dgemv_T(M, N, V, ldv, W, ldw, x, z1, z2) 450 | use cudafor 451 | implicit none 452 | integer, value :: M, N, ldv, ldw 453 | real(8), dimension(ldv, M), device, intent(in) :: V 454 | real(8), dimension(ldw, M), device, intent(in) :: W 455 | real(8), dimension(N), device, intent(in) :: x 456 | real(8), dimension(M), device :: z1, z2 457 | !complex(8), dimension(M), device, intent(in) :: z1, z2 458 | 459 | !real(8), dimension(32), shared :: r_s 460 | !real(8), dimension(32), shared :: i_s 461 | 462 | integer :: i, j, tx, ty, istat 463 | real(8) :: rv1, rv2, xr 464 | 465 | tx = threadIdx%x 466 | ty = threadIdx%y 467 | 468 | i = (blockIdx%y - 1) * blockDim%y + ty 469 | j = (blockIdx%x - 1) * blockDim%x + tx 470 | 471 | !if (i > 2*M .or. 
j > N) return 472 | if (i > 2*M) return 473 | 474 | xr = x(j) 475 | 476 | if (j > N) then 477 | rv1 = 0.d0 478 | else 479 | if (i > M) then 480 | rv2 = W(j, i-M) 481 | else 482 | rv2 = V(j, i) 483 | endif 484 | 485 | rv1 = rv2 * xr 486 | endif 487 | 488 | !Partial sum within warps using shuffle 489 | rv2 = __shfl_down(rv1,1) 490 | rv1 = rv1 + rv2 491 | rv2 = __shfl_down(rv1,2) 492 | rv1 = rv1 + rv2 493 | rv2 = __shfl_down(rv1,4) 494 | rv1 = rv1 + rv2 495 | rv2 = __shfl_down(rv1,8) 496 | rv1 = rv1 + rv2 497 | rv2 = __shfl_down(rv1,16) 498 | rv1 = rv1 + rv2 499 | 500 | if (tx == 1) then 501 | if (i > M) then 502 | istat = atomicadd(z2(i-M), rv1) 503 | else 504 | istat = atomicadd(z1(i), rv1) 505 | endif 506 | endif 507 | 508 | return 509 | end subroutine stacked_dgemv_T 510 | 511 | attributes(global) subroutine stacked_dgemv_N(M, N, V, ldv, W, ldw, z1, z2, y) 512 | use cudafor 513 | implicit none 514 | integer, value :: M, N, ldv, ldw 515 | real(8), dimension(ldv, N), device, intent(in) :: V 516 | real(8), dimension(ldw, N), device, intent(in) :: W 517 | real(8), dimension(N), device, intent(in) :: z1, z2 518 | real(8), dimension(M), device :: y 519 | 520 | integer :: i, j, tx, ty, istat 521 | real(8) :: rv1, rv2, xr 522 | 523 | tx = threadIdx%x 524 | ty = threadIdx%y 525 | 526 | i = (blockIdx%x - 1) * blockDim%x + tx 527 | j = (blockIdx%y - 1) * blockDim%y + ty 528 | 529 | if (i > M .or. j > 2*N) return 530 | 531 | if (j > N) then 532 | xr = z2(j-N) 533 | rv2 = V(i, j-N) 534 | else 535 | xr = z1(j) 536 | rv2 = W(i, j) 537 | endif 538 | 539 | rv1 = -rv2 * xr 540 | 541 | istat = atomicadd(y(i), rv1) 542 | 543 | return 544 | 545 | end subroutine stacked_dgemv_N 546 | 547 | attributes(global) subroutine finish_W_col_kernel(N, tau, x, y) 548 | implicit none 549 | integer, value :: N 550 | real(8), device :: tau 551 | real(8), dimension(N), device, intent(in) :: x 552 | real(8), dimension(N), device :: y 553 | 554 | integer :: tid, i, j, k, nb, istat, laneID 555 | real(8) :: rv1, rv2, rsum, mytau 556 | 557 | real(8), shared :: alphar 558 | !real(8), shared :: alpha 559 | real(8) :: alpha 560 | 561 | tid = threadIdx%x 562 | laneID = iand(tid, 31) 563 | 564 | if (tid == 1) then 565 | alphar = 0.0_8 566 | endif 567 | 568 | call syncthreads() 569 | 570 | rsum = 0.0_8 571 | mytau = tau 572 | 573 | nb = ceiling(real(N)/blockDim%x) ! number of blocks down column 574 | 575 | i = tid 576 | do j = 1, nb 577 | 578 | ! All threads perform their product, zero if out of bounds 579 | if (i <= N) then 580 | rv1 = mytau * y(i) * x(i) 581 | else 582 | rv1 = 0.0d0 583 | endif 584 | 585 | rsum = rsum + rv1 586 | 587 | i = i + blockDim%x 588 | 589 | end do 590 | 591 | ! 
Partial sum within warps using shuffle 592 | rv1 = rsum 593 | rv2 = __shfl_down(rv1,1) 594 | rv1 = rv1 + rv2 595 | rv2 = __shfl_down(rv1,2) 596 | rv1 = rv1 + rv2 597 | rv2 = __shfl_down(rv1,4) 598 | rv1 = rv1 + rv2 599 | rv2 = __shfl_down(rv1,8) 600 | rv1 = rv1 + rv2 601 | rv2 = __shfl_down(rv1,16) 602 | rv1 = rv1 + rv2 603 | 604 | if (laneID == 1) then 605 | istat = atomicadd(alphar, rv1) 606 | endif 607 | 608 | call syncthreads() 609 | 610 | alpha = -0.5d0* mytau * alphar 611 | 612 | do i = tid, N, blockDim%x 613 | y(i) = mytau*y(i) + alpha * x(i) !daxpy 614 | end do 615 | 616 | end subroutine finish_W_col_kernel 617 | 618 | attributes(global) subroutine stacked_dgemv_N_finish_W(M, N, V, ldv, W, ldw, z1, z2, y, tau, x, finished) 619 | use cudafor 620 | implicit none 621 | integer, value :: M, N, ldv, ldw 622 | real(8), dimension(ldv, N), device, intent(in) :: V 623 | real(8), dimension(ldw, N), device, intent(in) :: W 624 | real(8), dimension(N), device, intent(in) :: z1, z2 625 | real(8), dimension(M), device :: y 626 | real(8), device :: tau 627 | real(8), dimension(M), device, intent(in) :: x 628 | integer, device :: finished 629 | 630 | integer :: i, j, tx, ty, istat, nBlocks, tid, laneID, nb 631 | integer, shared :: nFinished 632 | real(8) :: rv1, rv2, rsum, xr, mytau 633 | real(8), shared :: alphar 634 | !real(8), shared :: alpha 635 | real(8) :: alpha 636 | 637 | tx = threadIdx%x 638 | ty = threadIdx%y 639 | 640 | i = (blockIdx%x - 1) * blockDim%x + tx 641 | j = (blockIdx%y - 1) * blockDim%y + ty 642 | 643 | nBlocks = gridDim%x * gridDim%y 644 | 645 | if (i <= M .and. j <= 2*N) then 646 | 647 | if (j > N) then 648 | xr = z2(j-N) 649 | rv2 = V(i, j-N) 650 | else 651 | xr = z1(j) 652 | rv2 = W(i, j) 653 | endif 654 | 655 | rv1 = -rv2 * xr 656 | 657 | istat = atomicadd(y(i), rv1) 658 | endif 659 | 660 | call threadfence() 661 | 662 | nFinished = 0 663 | call syncthreads() 664 | if (tx + ty == 2) nFinished = atomicinc(finished, nBlocks-1) 665 | call syncthreads() 666 | 667 | if ( nFinished < nBlocks - 1) return 668 | 669 | ! Begin finish_W_col work with last block 670 | tid = threadIdx%x + (threadIdx%y - 1) * blockDim%x 671 | laneID = iand(tid, 31) 672 | 673 | if (tid == 1) then 674 | alphar = 0.0_8 675 | endif 676 | 677 | call syncthreads() 678 | 679 | rsum = 0.0_8 680 | mytau = tau 681 | 682 | nb = ceiling(real(M)/(blockDim%x*blockDim%y)) ! number of blocks down column 683 | 684 | i = tid 685 | do j = 1, nb 686 | 687 | ! All threads perform their product, zero if out of bounds 688 | if (i <= M) then 689 | rv1 = mytau * y(i) * x(i) 690 | else 691 | rv1 = 0.0d0 692 | endif 693 | 694 | rsum = rsum + rv1 695 | 696 | i = i + blockDim%x*blockDim%y 697 | 698 | end do 699 | 700 | ! 
Partial sum within warps using shuffle 701 | rv1 = rsum 702 | rv2 = __shfl_down(rv1,1) 703 | rv1 = rv1 + rv2 704 | rv2 = __shfl_down(rv1,2) 705 | rv1 = rv1 + rv2 706 | rv2 = __shfl_down(rv1,4) 707 | rv1 = rv1 + rv2 708 | rv2 = __shfl_down(rv1,8) 709 | rv1 = rv1 + rv2 710 | rv2 = __shfl_down(rv1,16) 711 | rv1 = rv1 + rv2 712 | 713 | if (laneID == 1) then 714 | istat = atomicadd(alphar, rv1) 715 | endif 716 | 717 | call syncthreads() 718 | 719 | alpha = -0.5d0* mytau * alphar 720 | 721 | do i = tid, M, blockDim%x * blockDim%y 722 | y(i) = mytau*y(i) + alpha * x(i) !daxpy 723 | end do 724 | 725 | end subroutine stacked_dgemv_N_finish_W 726 | 727 | end module dsytrd_gpu 728 | 729 | -------------------------------------------------------------------------------- /lib_eigsolve/eigsolve_vars.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | ! Module containing various handles used for GPU eigensolver 25 | module eigsolve_vars 26 | use cudafor 27 | use cublas 28 | use cusolverDn 29 | integer :: initialized = 0 30 | type(cublasHandle) :: cuHandle 31 | type(cusolverDnHandle) :: cusolverHandle 32 | type(cudaEvent) :: event1, event2, event3 33 | integer(kind=cuda_stream_kind) :: stream1, stream2, stream3 34 | integer, device :: devInfo_d 35 | integer, device, allocatable :: finished(:) 36 | 37 | contains 38 | 39 | subroutine init_eigsolve_gpu() 40 | use cudafor 41 | use cublas 42 | implicit none 43 | integer istat 44 | if( initialized == 0 ) then 45 | ! Configure shared memory to use 8 byte banks 46 | istat = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) 47 | 48 | istat = cublasCreate(cuHandle) 49 | istat = cusolverDnCreate(cusolverHandle) 50 | istat = cudaStreamCreate(stream1) 51 | istat = cudaStreamCreate(stream2) 52 | istat = cudaStreamCreate(stream3) 53 | istat = cudaEventCreate(event1) 54 | istat = cudaEventCreate(event2) 55 | initialized = 1 56 | allocate(finished(1)) 57 | finished(1) = 0 58 | endif 59 | end subroutine init_eigsolve_gpu 60 | 61 | end module eigsolve_vars 62 | -------------------------------------------------------------------------------- /lib_eigsolve/toolbox.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 
3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | ! Interface for NVTX markers 25 | module nvtx_inters 26 | use iso_c_binding 27 | use cudafor 28 | implicit none 29 | 30 | integer,private :: col(7) = [ Z'0000ff00', Z'000000ff', Z'00ffff00',Z'00ff00ff',Z'0000ffff', & 31 | Z'00ff0000', Z'00ffffff'] 32 | character(len=256),private :: tempName 33 | 34 | type, bind(C):: nvtxEventAttributes 35 | integer(C_INT16_T):: version=1 36 | integer(C_INT16_T):: size=48 ! 37 | integer(C_INT):: category=0 38 | integer(C_INT):: colorType=1 ! NVTX_COLOR_ARGB = 1 39 | integer(C_INT):: color 40 | integer(C_INT):: payloadType=0 ! NVTX_PAYLOAD_UNKNOWN = 0 41 | integer(C_INT):: reserved0 42 | integer(C_INT64_T):: payload ! union uint,int,double 43 | integer(C_INT):: messageType=1 ! NVTX_MESSAGE_TYPE_ASCII = 1 44 | type(C_PTR):: message ! ascii char 45 | end type nvtxEventAttributes 46 | 47 | #ifdef USE_NVTX 48 | interface nvtxRangePush 49 | ! push range with custom label and standard color 50 | subroutine nvtxRangePushA(name) bind(C, name='nvtxRangePushA') 51 | use iso_c_binding 52 | character(kind=C_CHAR,len=*) :: name 53 | end subroutine nvtxRangePushA 54 | 55 | ! push range with custom label and custom color 56 | subroutine nvtxRangePushEx(event) bind(C, name='nvtxRangePushEx') 57 | use iso_c_binding 58 | import:: nvtxEventAttributes 59 | type(nvtxEventAttributes):: event 60 | end subroutine nvtxRangePushEx 61 | end interface nvtxRangePush 62 | 63 | interface nvtxRangePop 64 | subroutine nvtxRangePop() bind(C, name='nvtxRangePop') 65 | end subroutine nvtxRangePop 66 | end interface nvtxRangePop 67 | #endif 68 | 69 | contains 70 | 71 | subroutine nvtxStartRange(name,id) 72 | character(kind=c_char,len=*) :: name 73 | integer, optional:: id 74 | #ifdef USE_NVTX 75 | type(nvtxEventAttributes):: event 76 | integer :: istat 77 | istat = cudaDeviceSynchronize() 78 | 79 | tempName=trim(name)//c_null_char 80 | 81 | if ( .not. 
present(id)) then 82 | call nvtxRangePush(tempName) 83 | else 84 | event%color=col(mod(id,7)+1) 85 | event%message=c_loc(tempName) 86 | call nvtxRangePushEx(event) 87 | end if 88 | #endif 89 | end subroutine nvtxStartRange 90 | 91 | subroutine nvtxEndRange 92 | #ifdef USE_NVTX 93 | integer :: istat 94 | istat = cudaDeviceSynchronize() 95 | call nvtxRangePop 96 | #endif 97 | end subroutine nvtxEndRange 98 | 99 | end module nvtx_inters 100 | -------------------------------------------------------------------------------- /lib_eigsolve/zheevd_gpu.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module zheevd_gpu 25 | use cudafor 26 | use cublas 27 | implicit none 28 | 29 | contains 30 | 31 | ! Custom zheevd/x routine 32 | subroutine zheevd_gpu(jobz, uplo, il, iu, N, A, lda, Z, ldz, w, work, lwork, rwork, lrwork, & 33 | work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info) 34 | use zhetrd_gpu 35 | use eigsolve_vars 36 | use nvtx_inters 37 | implicit none 38 | character :: uplo, jobz 39 | integer :: N, NZ, lda, lwork, lrwork, liwork, istat, info 40 | integer :: lwork_h, lrwork_h, liwork_h, ldz_h 41 | integer :: i, j, k, nb1, nb2, ib, mi, ldt, ldz, il, iu 42 | real(8), dimension(1:lrwork), device :: rwork 43 | real(8), dimension(1:lrwork_h), pinned :: rwork_h 44 | complex(8), dimension(1:lwork), device :: work 45 | complex(8), dimension(1:lwork_h), pinned :: work_h 46 | integer, dimension(1:liwork_h), pinned :: iwork_h 47 | 48 | complex(8), dimension(1:lda, 1:N), device :: A 49 | complex(8), dimension(1:ldz, 1:N), device :: Z 50 | complex(8), dimension(1:ldz_h, 1:N), pinned :: Z_h 51 | real(8), dimension(1:N), device :: w 52 | real(8), dimension(1:N), pinned :: w_h 53 | 54 | integer :: inde, indtau, indwrk, indrwk, indwk2, indwk3, llwork, llrwk 55 | complex(8), parameter :: cone = cmplx(1,0,8) 56 | real(8), parameter :: one = 1.0_8 57 | 58 | if (uplo .ne. 'U' .or. jobz .ne. 'V') then 59 | print*, "Provided itype/uplo not supported!" 60 | return 61 | endif 62 | 63 | nb1 = 32 ! Blocksize for tridiagonalization 64 | nb2 = min(64, N) ! 
Blocksize for rotation procedure, at most 64
65 |   ldt = nb2
66 |   NZ = iu - il + 1
67 | 
68 |   inde = 1
69 |   indtau = 1
70 |   indwrk = indtau + n
71 |   indrwk = inde + n
72 |   indwk2 = indwrk + (nb2)*(nb2)
73 |   indwk3 = indwk2 + (nb2)*(nb2)
74 |   llwork = lwork - indwrk + 1
75 |   llrwk = lrwork_h - indrwk + 1
76 | 
77 |   !JR Note: ADD SCALING HERE IF DESIRED. Not scaling for now.
78 | 
79 |   ! Call ZHETRD to reduce A to tridiagonal form
80 |   call nvtxStartRange("zhetrd", 0)
81 |   call zhetrd_gpu('U', N, A, lda, w, rwork(inde), work(indtau), work(indwrk), llwork, nb1)
82 |   call nvtxEndRange
83 | 
84 |   ! Copy diagonal and superdiagonal to CPU
85 |   w_h(1:N) = w(1:N)
86 |   rwork_h(inde:inde+N-1) = rwork(inde:inde+N-1)
87 | 
88 |   ! Restore lower triangular of A (works if called from zhegvd only!)
89 |   !$cuf kernel do(2) <<<*,*>>>
90 |   do j = 1,N
91 |     do i = 1,N
92 |       if (i > j) then
93 |         A(i,j) = Z(i,j)
94 |       endif
95 |     end do
96 |   end do
97 | 
98 | 
99 |   ! Call ZSTEDC to get eigenvalues/vectors of tridiagonal A on CPU
100 |   call nvtxStartRange("zstedc", 1)
101 |   call zstedc('I', N, w_h, rwork_h(inde), Z_h, ldz_h, work_h, lwork_h, rwork_h(indrwk), llrwk, iwork_h, liwork_h, istat)
102 |   if (istat /= 0) then
103 |     write(*,*) "zheevd_gpu error: zstedc failed!"
104 |     info = -1
105 |     return
106 |   endif
107 |   call nvtxEndRange
108 | 
109 |   ! Copy eigenvectors and eigenvalues to GPU
110 |   istat = cudaMemcpy2D(Z(1, 1), ldz, Z_h(1, il), ldz_h, N, NZ)
111 |   w(1:N) = w_h(1:N)
112 | 
113 |   ! Call ZUNMTR to rotate eigenvectors to obtain result for original A matrix
114 |   ! JR Note: Eventual function calls from ZUNMTR called directly here with associated indexing changes
115 |   call nvtxStartRange("zunmtr", 2)
116 | 
117 |   istat = cudaEventRecord(event2, stream2)
118 | 
119 |   k = N-1
120 | 
121 |   do i = 1, k, nb2
122 |     ib = min(nb2, k-i+1)
123 | 
124 |     ! Form block reflector T in stream 1
125 |     call zlarft_gpu(i+ib-1, ib, A(1, 2+i-1), lda, work(indtau + i -1), work(indwrk), ldt, work(indwk2), ldt)
126 | 
127 |     mi = i + ib - 1
128 |     ! Apply reflector to eigenvectors in stream 2
129 |     call zlarfb_gpu(mi, NZ, ib, A(1,2+i-1), lda, work(indwrk), ldt, Z, ldz, work(indwk3), N, work(indwk2), ldt)
130 |   end do
131 | 
132 |   call nvtxEndRange
133 | 
134 | end subroutine zheevd_gpu
135 | 
136 | subroutine zlarft_gpu(N, K, V, ldv, tau, T, ldt, W, ldw)
137 |   use cublas
138 |   use eigsolve_vars
139 |   implicit none
140 |   integer :: N, K, ldv, ldt, ldw
141 |   complex(8), dimension(ldv, K), device :: V
142 |   complex(8), dimension(K), device :: tau
143 |   complex(8), dimension(ldt, K), device :: T
144 |   complex(8), dimension(ldw, K), device :: W
145 | 
146 |   integer :: i, j, istat
147 |   type(dim3) :: threads
148 | 
149 |   istat = cublasSetStream(cuHandle, stream1)
150 | 
151 |   ! Prepare lower triangular part of block column for zherk call.
152 |   ! Requires zeros in lower triangular portion and ones on diagonal.
153 |   ! Store existing entries (excluding diagonal) in W
154 |   !$cuf kernel do(2) <<<*, *, 0, stream1>>>
155 |   do j = 1, K
156 |     do i = N-K + 1, N
157 |       if (i-N+K == j) then
158 |         V(i, j) = dcmplx(1, 0)
159 |       else if (i-N+k > j) then
160 |         W(i-N+k,j) = V(i,j)
161 |         V(i,j) = dcmplx(0, 0)
162 |       endif
163 |     end do
164 |   end do
165 | 
166 |   istat = cudaEventRecord(event1, stream1)
167 |   istat = cudaStreamWaitEvent(stream1, event2, 0)
168 | 
169 |   ! Form preliminary T matrix
170 |   istat = cublaszherk_v2(cuHandle, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, K, N, 1.0_8, V, ldv, 0.0_8, T, ldt)
171 | 
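! Note: after the ZHERK above, T(1:K,1:K) holds (in its lower triangle) the
! Gram matrix V**H * V of the block of Householder vectors. The kernel
! launched below finishes the factor entirely on-device within one thread
! block: it scales each column j by -tau(j), places tau(j) on the diagonal,
! and applies the column-by-column triangular recurrence, leaving T as the
! block-reflector factor of the compact WY representation -- the role ZLARFT
! plays in LAPACK.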
172 |   ! Finish forming T
173 |   threads = dim3(64, 16, 1)
174 |   call finish_T_block_kernel<<<1, threads, 0, stream1>>>(K, T, ldt, tau)
175 | 
176 | end subroutine zlarft_gpu
177 | 
178 | subroutine zlarfb_gpu(M, N, K, V, ldv, T, ldt, C, ldc, work, ldwork, W, ldw)
179 |   use cublas
180 |   use eigsolve_vars
181 |   implicit none
182 |   integer :: M, N, K, ldv, ldt, ldc, ldw, ldwork, istat
183 |   integer :: i, j
184 |   complex(8), dimension(ldv, K), device :: V
185 |   complex(8), dimension(ldt, K), device :: T
186 |   complex(8), dimension(ldw, K), device :: W
187 |   complex(8), dimension(ldc, N), device :: C
188 |   complex(8), dimension(ldwork, K), device :: work
189 | 
190 |   istat = cublasSetStream(cuHandle, stream2)
191 | 
192 |   istat = cudaStreamWaitEvent(stream2, event1, 0)
193 |   istat = cublaszgemm_v2(cuHandle, CUBLAS_OP_C, CUBLAS_OP_N, N, K, M, dcmplx(1,0), C, ldc, v, ldv, dcmplx(0,0), work, ldwork)
194 |   istat = cudaStreamSynchronize(stream1)
195 | 
196 | 
197 |   istat = cublasztrmm_v2(cuHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_C, CUBLAS_DIAG_NON_UNIT, N, K, &
198 |                          dcmplx(1,0), T, ldt, work, ldwork, work, ldwork)
199 | 
200 |   istat = cudaEventRecord(event2, stream2)
201 |   istat = cublaszgemm_v2(cuHandle, CUBLAS_OP_N, CUBLAS_OP_C, M, N, K, dcmplx(-1,0), V, ldv, work, ldwork, dcmplx(1,0), C, ldc)
202 | 
203 |   ! Restore clobbered section of block column (except diagonal)
204 |   !$cuf kernel do(2) <<<*, *>>>
205 |   do j = 1, K
206 |     do i = M-K + 1, M
207 |       if (i-M+k > j) then
208 |         V(i,j) = W(i-M+k,j)
209 |       endif
210 |     end do
211 |   end do
212 | 
213 | end subroutine zlarfb_gpu
214 | 
215 | attributes(global) subroutine finish_T_block_kernel(N, T, ldt, tau)
216 |   implicit none
217 |   integer, value :: N, ldt
218 |   complex(8), dimension(ldt, N), device :: T
219 |   complex(8), dimension(N), device :: tau
220 |   ! T_s contains only lower triangular elements of T in linear array, by row
221 |   complex(8), dimension(2080), shared :: T_s
222 |   ! (i,j) --> ((i-1)*i/2 + j)
223 | #define IJ2TRI(i,j) (ISHFT((i-1)*i,-1) + j)
224 | 
225 | 
226 |   integer :: tid, tx, ty, i, j, k, diag
227 |   complex(8) :: cv
228 | 
229 |   tx = threadIdx%x
230 |   ty = threadIdx%y
231 |   tid = (threadIdx%y - 1) * blockDim%x + tx ! Linear thread id
232 | 
233 |   ! Load T into shared memory
234 |   if (tx <= N) then
235 |     do j = ty, N, blockDim%y
236 |       cv = tau(j)
237 |       if (tx > j) then
238 |         T_s(IJ2TRI(tx,j)) = -cv*T(tx,j)
239 |       else if (tx == j) then
240 |         T_s(IJ2TRI(tx,j)) = cv
241 |       endif
242 |     end do
243 |   end if
244 | 
245 |   call syncthreads()
246 | 
247 |   ! Perform column by column update by first thread column
248 |   do i = N-1, 1, -1
249 |     if (ty == 1) then
250 |       if (tx > i .and. tx <= N) then
251 |         cv = cmplx(0,0)
252 |         do j = i+1, tx
253 |           cv = cv + T_s(IJ2TRI(j, i)) * T_s(IJ2TRI(tx, j))
254 |         end do
255 |       endif
256 | 
257 |     endif
258 | 
259 |     call syncthreads()
260 |     if (ty == 1 .and. tx > i .and. tx <= N) then
261 |       T_s(IJ2TRI(tx, i)) = cv
262 |     endif
263 |     call syncthreads()
264 | 
265 |   end do
266 | 
267 |   call syncthreads()
268 | 
269 | 
270 |   ! Write T_s to global
271 |   if (tx <= N) then
272 |     do j = ty, N, blockDim%y
273 |       if (tx >= j) then
274 |         T(tx,j) = T_s(IJ2TRI(tx,j))
275 |       endif
276 |     end do
277 |   end if
278 | 
279 | end subroutine finish_T_block_kernel
280 | 
281 | end module zheevd_gpu
282 | 
--------------------------------------------------------------------------------
/lib_eigsolve/zhegst_gpu.F90:
--------------------------------------------------------------------------------
1 | !
2 | ! Copyright (c) 2016, NVIDIA CORPORATION.
All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module zhegst_gpu 25 | use cudafor 26 | use cublas 27 | 28 | contains 29 | 30 | ! zhegst completed in blocks, using 2 ztrsms to solve subblock problem on GPU 31 | subroutine zhegst_gpu(itype, uplo, N, A, lda, B, ldb, nb) 32 | use eigsolve_vars 33 | implicit none 34 | integer, intent(in) :: itype, N, lda, ldb, nb 35 | character, intent(in) :: uplo 36 | complex(8), device, dimension(1:ldb, 1:N), intent(in) :: B 37 | complex(8), device, dimension(1:lda, 1:N) :: A 38 | complex(8), parameter :: cone = cmplx(1.,0,8), chalf = cmplx(0.5,0,8) 39 | real(8), parameter :: one = 1.0_8 40 | 41 | integer :: i, j 42 | integer :: k, kb, istat 43 | 44 | if (itype .ne. 1 .or. uplo .ne. 'U') then 45 | print*, "Provided itype/uplo not supported!" 46 | return 47 | endif 48 | 49 | istat = cudaEventRecord(event2, stream2) 50 | 51 | do k = 1, N, nb 52 | kb = min(N-k+1, nb) 53 | 54 | istat = cublasSetStream(cuHandle, stream1) 55 | 56 | istat = cudaStreamWaitEvent(stream1, event2, 0) 57 | ! Populate subblock with complete hermitian entries (needed for ZTRSM calls) 58 | !$cuf kernel do(2) <<<*,*, 0, stream1>>> 59 | do j = k,k+kb-1 60 | do i = k,k+kb-1 61 | if (j < i) then 62 | A(i,j) = conjg(A(j,i)) 63 | endif 64 | end do 65 | end do 66 | 67 | ! Solve subblock problem (this version results in fully populated A subblock) 68 | istat = cublasztrsm_v2(cuHandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, CUBLAS_OP_N, kb, kb, & 69 | cone, B(k,k), ldb, A(k,k), lda) 70 | istat = cublasztrsm_v2(cuHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_OP_N, kb, kb, & 71 | cone, B(k,k), ldb, A(k,k), lda) 72 | 73 | ! For accuracy, force diagonal to remain real 74 | !$cuf kernel do(2) <<<*,*, 0, stream1>>> 75 | do j = k,k+kb-1 76 | do i = k,k+kb-1 77 | if (i==j) then 78 | A(i,j) = dble(A(i,j)) 79 | endif 80 | end do 81 | end do 82 | 83 | istat = cudaEventRecord(event1, stream1) 84 | 85 | if (k + kb .le. N) then 86 | istat = cublasSetStream(cuHandle, stream2) 87 | istat = cublasztrsm_v2(cuHandle, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, CUBLAS_OP_N, kb, N-k-kb+1, cone, & 88 | B(k, k), ldb, A(k, k+kb), lda) 89 | 90 | istat = cudaStreamWaitEvent(stream2, event1, 0) 91 | 92 | ! 
Since the A subblock is fully populated, use gemm instead of hemm here
93 |         istat = cublaszgemm_v2(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, kb, N-k-kb+1, kb, -chalf, A(k,k), &
94 |                                lda, B(k, k+kb), ldb, cone, A(k, k+kb), lda)
95 |         istat = cublaszher2k_v2(cuHandle, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_C, N-k-kb+1, kb, -cone, A(k, k+kb), lda, &
96 |                                 B(k, k+kb), ldb, one, A(k+kb, k+kb), lda)
97 | 
98 |         istat = cudaEventRecord(event2, stream2)
99 | 
100 |         istat = cublaszgemm_v2(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, kb, N-k-kb+1, kb, -chalf, A(k,k), &
101 |                                lda, B(k, k+kb), ldb, cone, A(k, k+kb), lda)
102 | 
103 |         istat = cublasztrsm_v2(cuHandle, CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, CUBLAS_OP_N, kb, N-k-kb+1, cone, &
104 |                                B(k+kb, k+kb), ldb, A(k, k+kb), lda)
105 |       end if
106 | 
107 |   end do
108 | 
109 | end subroutine zhegst_gpu
110 | 
111 | end module zhegst_gpu
112 | 
--------------------------------------------------------------------------------
/lib_eigsolve/zhegvdx_gpu.F90:
--------------------------------------------------------------------------------
1 | !
2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 | !
4 | !
5 | ! Permission is hereby granted, free of charge, to any person obtaining a
6 | ! copy of this software and associated documentation files (the "Software"),
7 | ! to deal in the Software without restriction, including without limitation
8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 | ! and/or sell copies of the Software, and to permit persons to whom the
10 | ! Software is furnished to do so, subject to the following conditions:
11 | !
12 | ! The above copyright notice and this permission notice shall be included in
13 | ! all copies or substantial portions of the Software.
14 | !
15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | ! DEALINGS IN THE SOFTWARE.
22 | !
23 | 
24 | module zhegvdx_gpu
25 |   use cudafor
26 |   use cublas
27 |   implicit none
28 | 
29 | contains
30 | 
31 | ! zhegvdx_gpu
32 | ! This solver computes eigenvalues and associated eigenvectors over a specified integer range for a
33 | ! hermitian-definite eigenproblem in the following form:
34 | !     A * x = lambda * B * x
35 | ! where A and B are hermitian matrices and B is positive definite. The solver expects the upper-triangular parts of the
36 | ! input A and B arguments to be populated. This configuration corresponds to calling ZHEGVX within LAPACK with the configuration
37 | ! arguments 'ITYPE = 1', 'JOBZ = 'V'', 'RANGE = 'I'', and 'UPLO = 'U''.
38 | !
39 | ! Input:
40 | ! On device:
41 | ! - A(lda, N), B(ldb, N) are double-complex matrices on device with upper triangular portion populated
42 | ! - il, iu are integers specifying range of eigenvalues/vectors to compute. Range is [il, iu]
43 | ! - work is a double-complex array for complex workspace of length lwork.
44 | ! - lwork is an integer specifying length of work. lwork >= 2*64*64 + 65*N
45 | ! - rwork is a real(8) array for real workspace of length lrwork.
46 | ! - lrwork is an integer specifying length of rwork. lrwork >= N
47 | !
48 | ! On host:
49 | ! - work_h is a double-complex array for complex workspace of length lwork_h.
50 | ! - lwork_h is an integer specifying length of work_h. lwork_h >= N
51 | ! - rwork_h is a real(8) array for real workspace of length lrwork_h.
52 | ! - lrwork_h is an integer specifying length of rwork_h. lrwork_h >= 1 + 5*N + 2*N*N
53 | ! - iwork_h is an integer array for integer workspace of length liwork_h.
54 | ! - liwork_h is an integer specifying length of iwork_h. liwork_h >= 3 + 5*N
55 | ! - (optional) _skip_host_copy is an optional logical argument. If .TRUE., memcopy of final updated eigenvectors from
56 | !   device to host will be skipped.
57 | !
58 | ! Output:
59 | ! On device:
60 | ! - A(lda, N), B(ldb, N) are modified on exit. The upper triangular part of A, including the diagonal, is destroyed.
61 | !   B is overwritten by the triangular Cholesky factor U corresponding to B = U**H * U
62 | ! - Z(ldz, N) is a double-complex matrix on the device. On exit, the first iu - il + 1 columns of Z
63 | !   contain normalized eigenvectors corresponding to eigenvalues in the range [il, iu].
64 | ! - w(N) is a real(8) array on the device. On exit, the first iu - il + 1 values of w contain the computed
65 | !   eigenvalues
66 | !
67 | ! On host:
68 | ! - Z_h(ldz_h, N) is a double-complex matrix on the host. On exit, the first iu - il + 1 columns of Z
69 | !   contain normalized eigenvectors corresponding to eigenvalues in the range [il, iu]. This is a copy of the Z
70 | !   matrix on the device. This is only true if optional argument _skip_host_copy is not provided or is set to .FALSE.
71 | ! - w_h(N) is a real(8) array on the host. On exit, the first iu - il + 1 values of w contain the computed
72 | !   eigenvalues. This is a copy of the w array on the host.
73 | ! - info is an integer. info will equal zero if the function completes successfully. Otherwise, there was an error.
74 | !
75 | subroutine zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, lrwork, &
76 |                        work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info, _skip_host_copy)
77 |   use eigsolve_vars
78 |   use nvtx_inters
79 |   use zhegst_gpu
80 |   use zheevd_gpu
81 |   implicit none
82 |   integer :: N, m, lda, ldb, ldz, il, iu, ldz_h, info, nb
83 |   integer :: lwork_h, lrwork_h, liwork_h, lwork, lrwork, liwork, istat
84 |   real(8), dimension(1:lrwork), device :: rwork
85 |   real(8), dimension(1:lrwork_h), pinned :: rwork_h
86 |   complex(8), dimension(1:lwork), device :: work
87 |   complex(8), dimension(1:lwork_h), pinned :: work_h
88 |   integer, dimension(1:liwork_h), pinned :: iwork_h
89 |   logical, optional :: _skip_host_copy
90 | 
91 |   complex(8), dimension(1:lda, 1:N), device :: A
92 |   complex(8), dimension(1:ldb, 1:N), device :: B
93 |   complex(8), dimension(1:ldz, 1:N), device :: Z
94 |   complex(8), dimension(1:ldz_h, 1:N), pinned :: Z_h
95 |   real(8), dimension(1:N), device :: w
96 |   real(8), dimension(1:N), pinned :: w_h
97 | 
98 |   complex(8), parameter :: cone = cmplx(1,0,8)
99 |   integer :: i, j
100 |   logical :: skip_host_copy
101 | 
102 |   info = 0
103 |   skip_host_copy = .FALSE.
104 |   if(present(_skip_host_copy)) skip_host_copy = _skip_host_copy
105 | 
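! Illustrative sketch of a call site (hypothetical caller code, not taken
! from this library; names follow the dummy arguments above, and the sizes
! are the minimums enforced by the checks that follow):
!
!   lwork    = 2*64*64 + 65*N        ! device complex workspace
!   lrwork   = N                     ! device real workspace
!   lwork_h  = N                     ! pinned host complex workspace
!   lrwork_h = 1 + 5*N + 2*N*N       ! pinned host real workspace
!   liwork_h = 3 + 5*N               ! pinned host integer workspace
!   allocate(work(lwork), rwork(lrwork))                              ! device arrays
!   allocate(work_h(lwork_h), rwork_h(lrwork_h), iwork_h(liwork_h))   ! pinned arrays
!   call zhegvdx_gpu(N, A, lda, B, ldb, Z, ldz, il, iu, w, work, lwork, rwork, lrwork, &
!                    work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info)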
106 |   ! Check workspace sizes
107 |   if (lwork < 2*64*64 + 65*N) then
108 |     print*, "zhegvdx_gpu error: lwork must be at least 2*64*64 + 65*N"
109 |     info = -1
110 |     return
111 |   else if (lrwork < N) then
112 |     print*, "zhegvdx_gpu error: lrwork must be at least N"
113 |     info = -1
114 |     return
115 |   else if (lwork_h < N) then
116 |     print*, "zhegvdx_gpu error: lwork_h must be at least N"
117 |     info = -1
118 |     return
119 |   else if (lrwork_h < 1 + 5*N + 2*N*N) then
120 |     print*, "zhegvdx_gpu error: lrwork_h must be at least 1 + 5*N + 2*N*N"
121 |     info = -1
122 |     return
123 |   else if (liwork_h < 3 + 5*N) then
124 |     print*, "zhegvdx_gpu error: liwork_h must be at least 3 + 5*N"
125 |     info = -1
126 |     return
127 |   endif
128 | 
129 |   m = iu - il + 1 ! Number of eigenvalues/vectors to compute
130 | 
131 |   if(initialized == 0) call init_eigsolve_gpu
132 | 
133 |   ! Compute cholesky factorization of B
134 |   call nvtxStartRange("cusolverdnZpotrf", 0)
135 |   istat = cusolverDnZpotrf(cusolverHandle, CUBLAS_FILL_MODE_UPPER, N, B, ldb, work, lwork, devInfo_d)
136 |   istat = devInfo_d
137 |   call nvtxEndRange
138 |   if (istat .ne. 0) then
139 |     print*, "zhegvdx_gpu error: cusolverDnZpotrf failed!"
140 |     info = -1
141 |     return
142 |   endif
143 | 
144 |   ! Store lower triangular part of A in Z
145 |   !$cuf kernel do(2) <<<*,*, 0, stream1>>>
146 |   do j = 1,N
147 |     do i = 1,N
148 |       if (i > j) then
149 |         Z(i,j) = A(i,j)
150 |       endif
151 |     end do
152 |   end do
153 | 
154 | 
155 |   ! Reduce to standard eigenproblem
156 |   nb = 448
157 |   call nvtxStartRange("zhegst_gpu", 1)
158 |   call zhegst_gpu(1, 'U', N, A, lda, B, ldb, nb)
159 |   call nvtxEndRange
160 | 
161 |   ! Tridiagonalize and compute eigenvalues/vectors
162 |   call nvtxStartRange("zheevd_gpu", 2)
163 |   call zheevd_gpu('V', 'U', il, iu, N, A, lda, Z, ldz, w, work, lwork, rwork, lrwork, &
164 |                   work_h, lwork_h, rwork_h, lrwork_h, iwork_h, liwork_h, Z_h, ldz_h, w_h, info)
165 |   call nvtxEndRange
166 | 
167 |   ! Triangle solve to get eigenvectors for original general eigenproblem
168 |   call nvtxStartRange("cublasZtrsm", 3)
169 |   call cublasZtrsm('L', 'U', 'N', 'N', N, (iu - il + 1), cone, B, ldb, Z, ldz)
170 |   call nvtxEndRange
171 | 
172 |   ! Copy final eigenvectors to host
173 |   if (.not. skip_host_copy) then
174 |     istat = cudaMemcpy2D(Z_h, ldz_h, Z, ldz, N, m)
175 |     if (istat .ne. 0) then
176 |       print*, "zhegvdx_gpu error: cudaMemcpy2D failed!"
177 |       info = -1
178 |       return
179 |     endif
180 |   endif
181 | 
182 | end subroutine zhegvdx_gpu
183 | 
184 | end module zhegvdx_gpu
185 | 
--------------------------------------------------------------------------------
/lib_eigsolve/zhemv_gpu.F90:
--------------------------------------------------------------------------------
1 | !
2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
3 | !
4 | !
5 | ! Permission is hereby granted, free of charge, to any person obtaining a
6 | ! copy of this software and associated documentation files (the "Software"),
7 | ! to deal in the Software without restriction, including without limitation
8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 | ! and/or sell copies of the Software, and to permit persons to whom the
10 | ! Software is furnished to do so, subject to the following conditions:
11 | !
12 | ! The above copyright notice and this permission notice shall be included in
13 | ! all copies or substantial portions of the Software.
14 | !
15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | !
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module zhemv_gpu 25 | use cudafor 26 | 27 | contains 28 | 29 | #define BX 32 30 | #define BY 8 31 | #define NTILES 4 32 | 33 | attributes(global) subroutine zhemv_gpu(N, A, lda, x, y) 34 | use cudafor 35 | implicit none 36 | 37 | integer, value :: N, lda 38 | complex(8), dimension(lda, N), device, intent(in) :: A 39 | complex(8), dimension(N), device, intent(in) :: x 40 | !DIR$ IGNORE_TKR y 41 | real(8), dimension(2*N), device :: y 42 | 43 | real(8), dimension(BX+1, BX), shared :: Ar_s 44 | real(8), dimension(BX+1, BX), shared :: Ai_s 45 | real(8), dimension(BX), shared :: r_s 46 | real(8), dimension(BX), shared :: i_s 47 | 48 | integer :: tx, ty, ii, jj, i, j, k, istat 49 | real(8) :: rv1, rv2, iv1, iv2, myrsum, myisum 50 | real(8) :: Ar, Ai, xrl, xil 51 | complex(8) :: val 52 | 53 | ! ii,jj is index of top left corner of block 54 | ii = (blockIdx%y-1) * blockDim%x + 1 55 | !print*, "ii ", ii 56 | 57 | myrsum = 0.0_8 58 | myisum = 0.0_8 59 | 60 | tx = threadIdx%x 61 | ty = threadIdx%y 62 | 63 | if (ii + (blockIdx%x-1)*blockDim%x > N) return 64 | 65 | 66 | i = ii + tx - 1 67 | if (i <= N) then 68 | val = x(i) ! read part of x for lower triangular multiply 69 | endif 70 | xrl = dble(val) 71 | xil = dimag(val) 72 | 73 | ! Loop over columns (skip all lower triangular blocks) 74 | do jj = ii + (blockIdx%x-1)*blockDim%x, N, gridDim%x*blockDim%x 75 | j = jj + ty - 1 76 | 77 | ! Load block into shared memory 78 | ! CASE 1: Diagonal block 79 | if (ii == jj) then 80 | 81 | ! Load full block into shared memory 82 | do k = 0,NTILES-1 83 | if (i <= N .and. j + k * blockDim%y <= N) then 84 | val = A(i, j + k*blockDim%y) 85 | Ar_s(tx, ty + k * blockDim%y) = dble(val) 86 | Ai_s(tx, ty + k * blockDim%y) = dimag(val) 87 | endif 88 | end do 89 | 90 | call syncthreads() 91 | 92 | ! Reflect to populate lower triangular part with true values of A 93 | do k = 0,NTILES-1 94 | if (tx > ty + k * blockDim%y) then 95 | Ar_s(tx, ty + k * blockDim%y) = Ar_s(ty + k * blockDim%y, tx) 96 | Ai_s(tx, ty + k * blockDim%y) = -Ai_s(ty + k * blockDim%y, tx) 97 | endif 98 | end do 99 | 100 | call syncthreads() 101 | 102 | do k = 0,NTILES-1 103 | if (i <= N .and. j + k * blockDim%y <= N ) then 104 | Ar = Ar_s(tx, ty + k * blockDim%y); Ai = Ai_s(tx, ty + k * blockDim%y) 105 | val = x(j + k*blockDim%y) 106 | rv1 = dble(val) ; iv1 = dimag(val) 107 | myrsum = myrsum + Ar * rv1 - Ai * iv1 108 | myisum = myisum + Ar * iv1 + Ai * rv1 109 | endif 110 | end do 111 | 112 | !call syncthreads() 113 | 114 | ! CASE 2: Upper triangular block 115 | else if (ii < jj) then 116 | do k = 0,NTILES-1 117 | if (j + k * blockDim%y <= N) then 118 | val = A(i, j + k * blockDim%y) 119 | Ar = dble(val) 120 | Ai = dimag(val) 121 | endif 122 | 123 | if (i <= N .and. j + k * blockDim%y <= N ) then 124 | val = x(j + k*blockDim%y) 125 | rv1 = dble(val) ; iv1 = dimag(val) 126 | myrsum = myrsum + Ar * rv1 - Ai * iv1 127 | myisum = myisum + Ar * iv1 + Ai * rv1 128 | endif 129 | 130 | ! Perform product for symmetric lower block here 131 | ! 
Don't need sync threads since thread is accessing own value 132 | !call syncthreads() 133 | if (i <= N .and. j + k*blockDim%y <= N) then 134 | rv1 = Ar * xrl + Ai * xil 135 | iv1 = Ar * xil - Ai * xrl 136 | else 137 | rv1 = 0.0_8 138 | iv1 = 0.0_8 139 | endif 140 | 141 | !Partial sum within warps using shuffle 142 | rv2 = __shfl_down(rv1,1) 143 | rv1 = rv1 + rv2 144 | rv2 = __shfl_down(rv1,2) 145 | rv1 = rv1 + rv2 146 | rv2 = __shfl_down(rv1,4) 147 | rv1 = rv1 + rv2 148 | rv2 = __shfl_down(rv1,8) 149 | rv1 = rv1 + rv2 150 | rv2 = __shfl_down(rv1,16) 151 | rv1 = rv1 + rv2 152 | 153 | if (tx == 1) then 154 | r_s(ty + k*blockDim%y) = rv1 155 | endif 156 | 157 | !Partial sum within warps using shuffle 158 | iv2 = __shfl_down(iv1,1) 159 | iv1 = iv1 + iv2 160 | iv2 = __shfl_down(iv1,2) 161 | iv1 = iv1 + iv2 162 | iv2 = __shfl_down(iv1,4) 163 | iv1 = iv1 + iv2 164 | iv2 = __shfl_down(iv1,8) 165 | iv1 = iv1 + iv2 166 | iv2 = __shfl_down(iv1,16) 167 | iv1 = iv1 + iv2 168 | 169 | if (tx == 1) then 170 | i_s(ty + k*blockDim%y) = iv1 171 | endif 172 | enddo 173 | 174 | call syncthreads() 175 | 176 | if (ty == 1 .and. jj+tx-1 <= N) then 177 | istat = atomicadd(y(2*(jj + tx -1)-1), r_s(tx)) 178 | istat = atomicadd(y(2*(jj + tx -1)), i_s(tx)) 179 | endif 180 | !call syncthreads() 181 | 182 | endif 183 | 184 | call syncthreads() 185 | 186 | end do 187 | 188 | if (i <= N) then 189 | istat = atomicadd(y(2*i - 1), myrsum) 190 | istat = atomicadd(y(2*i), myisum) 191 | endif 192 | 193 | end subroutine zhemv_gpu 194 | 195 | end module zhemv_gpu 196 | 197 | -------------------------------------------------------------------------------- /lib_eigsolve/zhetd2_gpu.F90: -------------------------------------------------------------------------------- 1 | module zhetd2_gpu 2 | contains 3 | attributes(global) subroutine zhetd2_gpu(n,a,lda,d,e,tau) 4 | use cudafor 5 | implicit none 6 | integer, value :: lda 7 | complex(8),device :: a(lda,32),tau(32) 8 | real(8),device :: d(32),e(32) 9 | complex(8),shared :: a_s(32,32) 10 | complex(8),shared :: alpha 11 | complex(8),shared :: taui 12 | real(8) :: beta 13 | real(8) :: alphar,alphai 14 | real(8) :: xnorm,x,y,z,w 15 | complex(8) :: wc 16 | integer, value :: n 17 | integer :: tx,ty,tl,i,j,ii 18 | 19 | tx=threadIdx%x 20 | ty=threadIdx%y 21 | ! Linear id of the thread (tx,ty) 22 | tl=tx+ blockDim%x*(ty-1) 23 | 24 | ! Load a_d in shared memory 25 | if (tx <= N .and. ty <= N) then 26 | a_s(tx ,ty )=a(tx ,ty) 27 | endif 28 | 29 | call syncthreads() 30 | ! Hermitian matrix from upper triangular 31 | if (tx >ty) then 32 | a_s(tx,ty)=conjg(a_s(ty,tx)) 33 | end if 34 | 35 | ! Enforce diagonal element to be real 36 | if (tl==1) a_s(n,n)=dble(a_s(n,n)) 37 | 38 | call syncthreads() 39 | 40 | ! For each column working backward 41 | do i=n-1,1,-1 42 | ! Generate elementary reflector 43 | ! Sum the vectors above the diagonal, only one warp active 44 | ! Reduce in a warp 45 | if (tl <=32) then 46 | if (tl nb) then 52 | write(*,*), "Provided work array must be sized (nb+2)*N or greater!" 53 | return 54 | endif 55 | 56 | ldwork = N 57 | 58 | istat = cublasSetStream(cuHandle, stream1) 59 | 60 | kk = N-((N-32) / nb) * nb 61 | k = N+1 62 | do i = N-nb+1, kk+1, -nb 63 | ! Reduce columns i:i+nb-1 to tridiagonal form 64 | call zlatrd_gpu(uplo, i+nb-1, nb, A, lda, e, tau, work, ldwork) 65 | 66 | ! Update trailing submatrix 67 | call cublaszher2k(uplo, 'N', i-1, nb, -cone, A(1, i), lda, work, ldwork, one, a, lda) 68 | 69 | k = k - nb 70 | 71 | end do 72 | 73 | ! 
Finish any remaining columns to get final 32x32 block
74 |   nb = k - 32 - 1
75 |   i = k - nb
76 | 
77 |   if (nb > 0) then
78 |     ! Reduce columns i:i+nb-1 to tridiagonal form
79 |     call zlatrd_gpu(uplo, i+nb-1, nb, A, lda, e, tau, work, ldwork)
80 | 
81 |     ! Update trailing submatrix
82 |     call cublaszher2k(uplo, 'N', i-1, nb, -cone, A(1, i), lda, work, ldwork, one, a, lda)
83 |   endif
84 | 
85 |   ! Final block
86 |   threads = dim3(32, 32, 1)
87 |   call zhetd2_gpu<<<1, threads>>>(min(32, N), A, lda, d, e, tau)
88 | 
89 |   ! Copy superdiagonal back into A, store diagonal in d
90 |   !$cuf kernel do(1) <<<*,*>>>
91 |   do j = 33, N
92 |     !A(j-1, j) = e(j-1) ! JR Not strictly needed so skipping this copy
93 |     d(j) = A(j,j)
94 |   end do
95 | 
96 | end subroutine zhetrd_gpu
97 | 
98 | 
99 | subroutine zlatrd_gpu(uplo, N, nb, A, lda, e, tau, W, ldw)
100 |   use eigsolve_vars
101 |   use zhemv_gpu
102 |   implicit none
103 |   character :: uplo
104 |   integer :: N, nb, lda, ldw, istat
105 |   integer :: i, j, k, iw
106 |   integer :: blocks, threads
107 |   complex(8), dimension(1:lda, 1:N), device :: A
108 |   complex(8), dimension(1:ldw, 1:nb), device :: W
109 |   complex(8), dimension(N-1), device :: tau
110 |   real(8), dimension(N-1), device :: e
111 |   complex(8), parameter :: cone = cmplx(1, 0, 8), czero = cmplx(0, 0, 8), chalf = cmplx(0.5, 0, 8)
112 | 
113 |   type(dim3) :: threads2D, blocks2D
114 | 
115 |   if (uplo .ne. 'U') then
116 |     print*, "Provided uplo type not supported!"
117 |     return
118 |   endif
119 | 
120 |   threads2D = dim3(32,8,1)
121 |   threads = 256
122 | 
123 |   if (N <= 0) return
124 | 
125 |   ! Complete first iteration outside loop
126 |   if (N > 1) then
127 |     iw = nb
128 |     ! Generate elementary reflector H(i) to annihilate A(1:i-2, i)
129 |     call zlarfg_kernel<<<1, threads>>>(N-1, e(N-1), A(1, N), tau(N-1))
130 | 
131 |     !$cuf kernel do(1) <<<*,*>>>
132 |     do k = 1, N-1
133 |       W(k,iw) = dcmplx(0,0)
134 |     end do
135 | 
136 |     blocks2D = dim3(10, ceiling(real(N-1)/32), 1) !JR TODO: What is optimal number of columns for our problem size?
137 |     call zhemv_gpu<<<blocks2D, threads2D>>>(N-1, A, lda, A(1, N), W(1, iw))
138 | 
139 |     call finish_W_col_kernel<<<1, threads>>>(N-1, tau(N-1), A(1, N), W(1, iw))
140 |   endif
141 | 
142 |   do i = N-1, N-nb+1, -1
143 |     iw = i-N+nb
144 | 
145 |     blocks2D = dim3(ceiling(real(max(i, N-i))/32), ceiling(real(N-i)/8), 1)
146 |     !call zher2_mv_kernel<<<blocks2D, threads2D>>>(i, N-i, A(1, i+1), lda, W(1, iw+1), ldw, A(1, i), W(1, iw), ldw)
147 |     call zher2_mv_zlarfg_kernel<<<blocks2D, threads2D>>>(i, N-i, A(1, i+1), lda, W(1, iw+1), ldw, A(1, i), W(1, iw), ldw, e(i-1), tau(i-1), A(1, i), finished(1))
148 | 
149 |     if (i > 1) then
150 |       ! Generate elementary reflector H(i) to annihilate A(1:i-2, i)
151 |       !call zlarfg_kernel<<<1, threads>>>(i-1, e(i-1), A(1, i), tau(i-1))
152 | 
153 |       blocks2D = dim3(min(10, ceiling(real(i-1)/32)), ceiling(real(i-1)/32), 1) !JR TODO: What is optimal number of columns for our problem size?
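! Note on the sequence below (per column i): zhemv_gpu computes
! w = A(1:i-1,1:i-1)*v into W(1:i-1,iw); stacked_zgemv_C forms z1 = V**H * v
! and z2 = W**H * v in a single stacked launch, where V and W are the
! already-reduced blocks A(:,i+1:n) and W(:,iw+1:nb); stacked_zgemv_N_finish_W
! then applies w := w - W*z1 - V*z2 and, in the last thread block to arrive,
! completes the column the way finish_W_col_kernel would
! (w := tau*w, then w := w - 0.5*tau*(w**H * v)*v).
! The zher2_mv_zlarfg_kernel launch above uses the same fusion trick: the
! grid-wide 'finished' counter lets the final block fall through directly
! into the zlarfg reflector generation, avoiding an extra kernel launch.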
154 |       call zhemv_gpu<<<blocks2D, threads2D>>>(i-1, A, lda, A(1, i), W(1, iw))
155 | 
156 |       blocks2D = dim3(ceiling(real(i-1)/32), ceiling(real(2*(n-i))/8), 1)
157 |       call stacked_zgemv_C<<<blocks2D, threads2D>>>(n-i, i-1, A(1,i+1), lda, W(1, iw+1), ldw, A(1,i), W(i+1, iw), W(i+1, iw+1))
158 |       !call stacked_zgemv_N<<<blocks2D, threads2D>>>(i-1, n-i, A(1,i+1), lda, W(1, iw+1), ldw, W(i+1,iw), W(i+1, iw+1), W(1, iw))
159 |       call stacked_zgemv_N_finish_W<<<blocks2D, threads2D>>>(i-1, n-i, A(1,i+1), lda, W(1, iw+1), ldw, W(i+1,iw), W(i+1, iw+1), W(1, iw), tau(i-1), A(1, i), W(1, iw), finished(1))
160 | 
161 |       !call finish_W_col_kernel<<<1, threads>>>(i-1, tau(i-1), A(1, i), W(1, iw))
162 | 
163 |     end if
164 |   end do
165 | end subroutine zlatrd_gpu
166 | 
167 | attributes(global) subroutine zher2_mv_kernel(N, M, V, ldv, W, ldw, x, W2, ldw2)
168 |   implicit none
169 |   integer, value :: N, M, ldv, ldw, ldw2
170 |   complex(8), dimension(1:ldv, 1:M), device, intent(in) :: V
171 |   complex(8), dimension(1:ldw, 1:M), device, intent(in) :: W
172 |   complex(8), dimension(1:ldw2, 2), device :: W2
173 |   !DIR$ IGNORE_TKR x
174 |   real(8), dimension(1:2*N), device :: x
175 | 
176 |   integer :: i, j, istat
177 |   complex(8) :: val
178 |   real(8) :: rv, iv
179 | 
180 |   i = (blockIdx%x - 1) * blockDim%x + threadIdx%x
181 |   j = (blockIdx%y - 1) * blockDim%y + threadIdx%y
182 | 
183 |   if (i <= N .and. j <= M) then
184 | 
185 |     val = - conjg(W(N, j)) * V(i,j) - conjg(V(N, j)) * W(i,j)
186 |     rv = dble(val)
187 |     iv = dimag(val)
188 | 
189 |     ! Zero out imaginary part on diagonal
190 |     if (i == N) then
191 |       iv = 0.d0
192 |     endif
193 | 
194 |     ! Update x
195 |     istat = atomicadd(x(2*i -1), rv)
196 |     istat = atomicadd(x(2*i), iv)
197 |   endif
198 | 
199 |   if (threadIdx%y == 1) then
200 |     ! Zero out column for zhemv call
201 |     if (i <= N) W2(i, 1) = 0
202 |     ! Zero out workspace for intermediate zgemv results
203 |     if (i <= M) then
204 |       W2(N + i, 1) = 0
205 |       W2(N + i, 2) = 0
206 |     endif
207 |   endif
208 | 
209 | end subroutine zher2_mv_kernel
210 | 
211 | attributes(global) subroutine zlarfg_kernel(N, e, x, tau)
212 |   implicit none
213 |   integer, value :: N
214 |   complex(8), device :: tau
215 |   real(8), device :: e
216 |   complex(8), dimension(N), device :: x
217 | 
218 |   integer :: tid, i, j, nb, istat, laneID
219 |   real(8) :: rv1, rv2, rv3, scal, invscal, alphar, alphai, beta, rsum, isum
220 |   complex(8) :: cv1
221 |   real(8), shared :: xnorm
222 |   complex(8), shared :: alpha_s
223 | 
224 |   tid = threadIdx%x
225 |   laneID = iand(tid, 31)
226 | 
227 |   if (tid == 1) then
228 |     alpha_s = x(N)
229 |     xnorm = 0.0_8
230 |   endif
231 | 
232 |   call syncthreads()
233 | 
234 |   alphar = dble(alpha_s)
235 |   alphai = dimag(alpha_s)
236 |   rsum = 0.0_8
237 | 
238 |   nb = ceiling(real(N)/blockDim%x) ! number of blocks down column
239 | 
240 |   i = tid
241 |   do j = 1, nb
242 | 
243 |     ! All threads perform their product, zero if out of bounds
244 |     if (i <= N-1) then
245 |       cv1 = x(i)
246 |       rv2 = dble(cv1); rv3 = dimag(cv1)
247 |       rv1 = rv2*rv2 + rv3*rv3
248 |     else
249 |       rv1 = 0.0_8
250 |     endif
251 | 
252 |     rsum = rsum + rv1
253 | 
254 |     i = i + blockDim%x
255 |   end do
256 | 
257 |   ! Partial sum within warps using shuffle
258 |   rv1 = rsum
259 |   rv2 = __shfl_down(rv1,1)
260 |   rv1 = rv1 + rv2
261 |   rv2 = __shfl_down(rv1,2)
262 |   rv1 = rv1 + rv2
263 |   rv2 = __shfl_down(rv1,4)
264 |   rv1 = rv1 + rv2
265 |   rv2 = __shfl_down(rv1,8)
266 |   rv1 = rv1 + rv2
267 |   rv2 = __shfl_down(rv1,16)
268 |   rv1 = rv1 + rv2
269 | 
270 | 
271 |   if (laneID == 1) then
272 |     istat = atomicadd(xnorm, rv1)
273 |   endif
274 | 
275 |   call syncthreads()
276 | 
277 |   if (xnorm == 0.0_8 .and.
alphai == 0.0_8) then 278 | if (tid == 1) then 279 | tau = 0.0_8 280 | endif 281 | else 282 | if (tid == 1) then 283 | xnorm = sqrt(xnorm) 284 | 285 | rv1 = abs(alphar) 286 | rv2 = abs(alphai) 287 | ! not taking abs of xnorm 288 | scal = max(rv1, rv2, xnorm) 289 | invscal = 1.d0/scal 290 | 291 | rv1 = rv1 * invscal 292 | rv2 = rv2 * invscal 293 | xnorm = xnorm * invscal 294 | 295 | beta = -sign(scal * sqrt(rv1*rv1 + rv2*rv2 + xnorm*xnorm), alphar) 296 | 297 | tau = dcmplx((beta - alphar)/beta, -alphai/beta) 298 | 299 | !zladiv 300 | rv1 = dble(alpha_s - beta) 301 | rv2 = dimag(alpha_s - beta) 302 | 303 | if (abs(rv2) .lt. abs(rv1)) then 304 | xnorm = rv2/rv1 305 | invscal = 1.d0/(rv1 + rv2*xnorm) 306 | alpha_s = dcmplx(invscal, -xnorm * invscal) 307 | else 308 | xnorm = rv1/rv2 309 | invscal = 1.d0/(rv2 + rv1*xnorm) 310 | alpha_s = dcmplx(xnorm * invscal, -invscal) 311 | endif 312 | 313 | e = beta ! store beta in e vector 314 | endif 315 | 316 | call syncthreads() 317 | 318 | do i = tid, N, blockDim%x 319 | cv1 = x(i) 320 | 321 | if (i <= N-1) then 322 | cv1 = alpha_s * cv1 323 | elseif (i == N) then 324 | !x(i) = 1.0_8 325 | cv1 = dcmplx(1.0_8, 0.0_8) 326 | endif 327 | 328 | x(i) = cv1 329 | end do 330 | 331 | endif 332 | 333 | end subroutine zlarfg_kernel 334 | 335 | attributes(global) subroutine zher2_mv_zlarfg_kernel(N, M, V, ldv, W, ldw, x, W2, ldw2, e, tau, x2, finished) 336 | implicit none 337 | integer, value :: N, M, ldv, ldw, ldw2 338 | complex(8), dimension(1:ldv, 1:M), device, intent(in) :: V 339 | complex(8), dimension(1:ldw, 1:M), device, intent(in) :: W 340 | complex(8), dimension(1:ldw2, 2), device :: W2 341 | !DIR$ IGNORE_TKR x 342 | real(8), dimension(1:2*N), device :: x 343 | complex(8), dimension(1:N), device :: x2 344 | complex(8), device :: tau 345 | real(8), device :: e 346 | 347 | integer :: i, j, tx, ty, tid, nb, laneid, istat, nBlocks 348 | integer, device :: finished 349 | integer, shared :: nFinished 350 | complex(8) :: val 351 | real(8) :: rv, iv 352 | real(8) :: rv1, rv2, rv3, scal, invscal, alphar, alphai, beta, rsum, isum 353 | complex(8) :: cv1 354 | real(8), shared :: xnorm 355 | complex(8), shared :: alpha_s 356 | 357 | tx = threadIdx%x 358 | ty = threadIdx%y 359 | i = (blockIdx%x - 1) * blockDim%x + tx 360 | j = (blockIdx%y - 1) * blockDim%y + ty 361 | 362 | nBlocks = gridDim%x * gridDim%y 363 | 364 | !if (i > N .or. j > M) return 365 | if (i <= N .and. j <= M) then 366 | 367 | val = - conjg(W(N, j)) * V(i,j) - conjg(V(N, j)) * W(i,j) 368 | rv = dble(val) 369 | iv = dimag(val) 370 | 371 | ! Zero out imaginary part on diagonal 372 | if (i == N) then 373 | iv = 0.d0 374 | endif 375 | 376 | ! Update x 377 | istat = atomicadd(x(2*i -1), rv) 378 | istat = atomicadd(x(2*i), iv) 379 | endif 380 | 381 | if (ty == 1) then 382 | ! Zero out column for zhemv call 383 | if (i <= N) W2(i, 1) = 0 384 | ! Zero out workspace for intermediate zgemv results 385 | if (i <= M) then 386 | W2(N + i, 1) = 0 387 | W2(N + i, 2) = 0 388 | endif 389 | endif 390 | 391 | call threadfence() 392 | 393 | nFinished = 0 394 | call syncthreads() 395 | if (tx + ty == 2) nFinished = atomicinc(finished, nBlocks-1) 396 | call syncthreads() 397 | 398 | if ( nFinished < nBlocks - 1) return 399 | 400 | ! 
Begin zlarfg work with last block 401 | if (N == 1) return 402 | 403 | tid = tx + (ty - 1) * blockDim%x 404 | laneID = iand(tid, 31) 405 | 406 | if (tid == 1) then 407 | alpha_s = x2(N-1) 408 | xnorm = 0.0_8 409 | endif 410 | 411 | call syncthreads() 412 | 413 | alphar = dble(alpha_s) 414 | alphai = dimag(alpha_s) 415 | rsum = 0.0_8 416 | 417 | nb = ceiling(real(N-1)/(blockDim%x*blockDim%y)) ! number of blocks down column 418 | 419 | i = tid 420 | do j = 1, nb 421 | 422 | ! All threads perform their product, zero if out of bounds 423 | if (i <= N-2) then 424 | cv1 = x2(i) 425 | rv2 = dble(cv1); rv3 = dimag(cv1) 426 | rv1 = rv2*rv2 + rv3*rv3 427 | else 428 | rv1 = 0.0_8 429 | endif 430 | 431 | rsum = rsum + rv1 432 | 433 | i = i + blockDim%x*blockDim%y 434 | end do 435 | 436 | ! Partial sum within warps using shuffle 437 | rv1 = rsum 438 | rv2 = __shfl_down(rv1,1) 439 | rv1 = rv1 + rv2 440 | rv2 = __shfl_down(rv1,2) 441 | rv1 = rv1 + rv2 442 | rv2 = __shfl_down(rv1,4) 443 | rv1 = rv1 + rv2 444 | rv2 = __shfl_down(rv1,8) 445 | rv1 = rv1 + rv2 446 | rv2 = __shfl_down(rv1,16) 447 | rv1 = rv1 + rv2 448 | 449 | if (laneID == 1) then 450 | istat = atomicadd(xnorm, rv1) 451 | endif 452 | 453 | call syncthreads() 454 | 455 | if (xnorm == 0.0_8 .and. alphai == 0.0_8) then 456 | if (tid == 1) then 457 | tau = 0.0_8 458 | endif 459 | else 460 | if (tid == 1) then 461 | xnorm = sqrt(xnorm) 462 | 463 | rv1 = abs(alphar) 464 | rv2 = abs(alphai) 465 | ! not taking abs of xnorm 466 | scal = max(rv1, rv2, xnorm) 467 | invscal = 1.d0/scal 468 | 469 | rv1 = rv1 * invscal 470 | rv2 = rv2 * invscal 471 | xnorm = xnorm * invscal 472 | 473 | beta = -sign(scal * sqrt(rv1*rv1 + rv2*rv2 + xnorm*xnorm), alphar) 474 | 475 | tau = dcmplx((beta - alphar)/beta, -alphai/beta) 476 | 477 | !zladiv 478 | rv1 = dble(alpha_s - beta) 479 | rv2 = dimag(alpha_s - beta) 480 | 481 | if (abs(rv2) .lt. abs(rv1)) then 482 | xnorm = rv2/rv1 483 | invscal = 1.d0/(rv1 + rv2*xnorm) 484 | alpha_s = dcmplx(invscal, -xnorm * invscal) 485 | else 486 | xnorm = rv1/rv2 487 | invscal = 1.d0/(rv2 + rv1*xnorm) 488 | alpha_s = dcmplx(xnorm * invscal, -invscal) 489 | endif 490 | 491 | e = beta ! store beta in e vector 492 | endif 493 | 494 | call syncthreads() 495 | 496 | do i = tid, N-1, blockDim%x*blockDim%y 497 | cv1 = x2(i) 498 | 499 | if (i <= N-2) then 500 | cv1 = alpha_s * cv1 501 | elseif (i == N-1) then 502 | !x(i) = 1.0_8 503 | cv1 = dcmplx(1.0_8, 0.0_8) 504 | endif 505 | 506 | x2(i) = cv1 507 | end do 508 | 509 | endif 510 | 511 | end subroutine zher2_mv_zlarfg_kernel 512 | 513 | attributes(global) subroutine stacked_zgemv_C(M, N, V, ldv, W, ldw, x, z1, z2) 514 | use cudafor 515 | implicit none 516 | integer, value :: M, N, ldv, ldw 517 | complex(8), dimension(ldv, M), device, intent(in) :: V 518 | complex(8), dimension(ldw, M), device, intent(in) :: W 519 | complex(8), dimension(N), device, intent(in) :: x 520 | !DIR$ IGNORE_TKR z1, z2 521 | real(8), dimension(2*M), device :: z1, z2 522 | !complex(8), dimension(M), device, intent(in) :: z1, z2 523 | 524 | !real(8), dimension(32), shared :: r_s 525 | !real(8), dimension(32), shared :: i_s 526 | 527 | integer :: i, j, tx, ty, istat 528 | complex(8) :: val 529 | real(8) :: rv1, rv2, iv1, iv2, xr, xi 530 | 531 | tx = threadIdx%x 532 | ty = threadIdx%y 533 | 534 | i = (blockIdx%y - 1) * blockDim%y + ty 535 | j = (blockIdx%x - 1) * blockDim%x + tx 536 | 537 | !if (i > 2*M .or. 
j > N) return 538 | if (i > 2*M) return 539 | 540 | val = x(j) 541 | xr = dble(val); xi = dimag(val) 542 | 543 | if (j > N) then 544 | !val = dcmplx(0,0) 545 | rv1 = 0.d0; iv1 = 0.d0 546 | else 547 | if (i > M) then 548 | val = W(j, i-M) 549 | else 550 | val = V(j, i) 551 | endif 552 | 553 | rv2 = dble(val); iv2 = dimag(val) 554 | 555 | rv1 = rv2 * xr + iv2 * xi 556 | iv1 = rv2 * xi - iv2 * xr 557 | endif 558 | 559 | !Partial sum within warps using shuffle 560 | rv2 = __shfl_down(rv1,1) 561 | rv1 = rv1 + rv2 562 | rv2 = __shfl_down(rv1,2) 563 | rv1 = rv1 + rv2 564 | rv2 = __shfl_down(rv1,4) 565 | rv1 = rv1 + rv2 566 | rv2 = __shfl_down(rv1,8) 567 | rv1 = rv1 + rv2 568 | rv2 = __shfl_down(rv1,16) 569 | rv1 = rv1 + rv2 570 | 571 | !if (tx == 1) then 572 | !r_s(ty + k*blockDim%y) = rv1 573 | !r_s(ty) = rv1 574 | !endif 575 | 576 | !Partial sum within warps using shuffle 577 | iv2 = __shfl_down(iv1,1) 578 | iv1 = iv1 + iv2 579 | iv2 = __shfl_down(iv1,2) 580 | iv1 = iv1 + iv2 581 | iv2 = __shfl_down(iv1,4) 582 | iv1 = iv1 + iv2 583 | iv2 = __shfl_down(iv1,8) 584 | iv1 = iv1 + iv2 585 | iv2 = __shfl_down(iv1,16) 586 | iv1 = iv1 + iv2 587 | 588 | !if (tx == 1) then 589 | !i_s(ty + k*blockDim%y) = iv1 590 | !i_s(ty) = iv1 591 | !endif 592 | 593 | !call syncthreads() 594 | 595 | !if (ty == 1 .and. i+tx-1 <= 2*M) then 596 | ! if (i+tx-1 > M) then 597 | ! istat = atomicadd(z2(2*(i+tx-1-M) - 1), r_s(tx)) 598 | ! istat = atomicadd(z2(2*(i+tx-1-M)), i_s(tx)) 599 | ! else 600 | ! istat = atomicadd(z1(2*(i+tx-1) - 1), r_s(tx)) 601 | ! istat = atomicadd(z1(2*(i+tx-1)), i_s(tx)) 602 | ! endif 603 | !endif 604 | 605 | if (tx == 1) then 606 | if (i > M) then 607 | istat = atomicadd(z2(2*(i-M) - 1), rv1) 608 | istat = atomicadd(z2(2*(i-M)), iv1) 609 | else 610 | istat = atomicadd(z1(2*i - 1), rv1) 611 | istat = atomicadd(z1(2*i), iv1) 612 | endif 613 | endif 614 | 615 | return 616 | end subroutine stacked_zgemv_C 617 | 618 | attributes(global) subroutine stacked_zgemv_N(M, N, V, ldv, W, ldw, z1, z2, y) 619 | use cudafor 620 | implicit none 621 | integer, value :: M, N, ldv, ldw 622 | complex(8), dimension(ldv, N), device, intent(in) :: V 623 | complex(8), dimension(ldw, N), device, intent(in) :: W 624 | complex(8), dimension(N), device, intent(in) :: z1, z2 625 | !DIR$ IGNORE_TKR y 626 | real(8), dimension(2*M), device :: y 627 | 628 | integer :: i, j, tx, ty, istat 629 | complex(8) :: val1, val2 630 | real(8) :: rv1, rv2, iv1, iv2, xr, xi 631 | 632 | tx = threadIdx%x 633 | ty = threadIdx%y 634 | 635 | i = (blockIdx%x - 1) * blockDim%x + tx 636 | j = (blockIdx%y - 1) * blockDim%y + ty 637 | 638 | if (i > M .or. 
j > 2*N) return 639 | 640 | if (j > N) then 641 | val1 = z2(j-N) 642 | val2 = V(i, j-N) 643 | else 644 | val1 = z1(j) 645 | val2 = W(i, j) 646 | endif 647 | xr = dble(val1); xi = dimag(val1) 648 | rv2 = dble(val2); iv2 = dimag(val2) 649 | 650 | rv1 = -rv2 * xr + iv2 * xi 651 | iv1 = -rv2 * xi - iv2 * xr 652 | 653 | istat = atomicadd(y(2*i-1), rv1) 654 | istat = atomicadd(y(2*i), iv1) 655 | 656 | return 657 | 658 | end subroutine stacked_zgemv_N 659 | 660 | attributes(global) subroutine finish_W_col_kernel(N, tau, x, y) 661 | implicit none 662 | integer, value :: N 663 | complex(8), device :: tau 664 | complex(8), dimension(N), device, intent(in) :: x 665 | complex(8), dimension(N), device :: y 666 | 667 | integer :: tid, i, j, k, nb, istat, laneID 668 | real(8) :: rv1, rv2, iv1, iv2, rsum, isum 669 | complex(8) :: val, cv1, mytau 670 | 671 | real(8), shared :: alphar, alphai 672 | !complex(8), shared :: alpha 673 | complex(8) :: alpha 674 | 675 | tid = threadIdx%x 676 | laneID = iand(tid, 31) 677 | 678 | if (tid == 1) then 679 | alphar = 0.0_8 680 | alphai = 0.0_8 681 | endif 682 | 683 | call syncthreads() 684 | 685 | rsum = 0.0_8 686 | isum = 0.0_8 687 | mytau = tau 688 | 689 | nb = ceiling(real(N)/blockDim%x) ! number of blocks down column 690 | 691 | i = tid 692 | do j = 1, nb 693 | 694 | ! All threads perform their product, zero if out of bounds 695 | if (i <= N) then 696 | val = dconjg(mytau * y(i)) * x(i) 697 | else 698 | val = dcmplx(0.,0.) 699 | endif 700 | 701 | rv1 = dble(val); iv1 = dimag(val) 702 | 703 | rsum = rsum + rv1 704 | isum = isum + iv1 705 | 706 | i = i + blockDim%x 707 | 708 | end do 709 | 710 | ! Partial sum within warps using shuffle 711 | rv1 = rsum 712 | rv2 = __shfl_down(rv1,1) 713 | rv1 = rv1 + rv2 714 | rv2 = __shfl_down(rv1,2) 715 | rv1 = rv1 + rv2 716 | rv2 = __shfl_down(rv1,4) 717 | rv1 = rv1 + rv2 718 | rv2 = __shfl_down(rv1,8) 719 | rv1 = rv1 + rv2 720 | rv2 = __shfl_down(rv1,16) 721 | rv1 = rv1 + rv2 722 | 723 | iv1 = isum 724 | iv2 = __shfl_down(iv1,1) 725 | iv1 = iv1 + iv2 726 | iv2 = __shfl_down(iv1,2) 727 | iv1 = iv1 + iv2 728 | iv2 = __shfl_down(iv1,4) 729 | iv1 = iv1 + iv2 730 | iv2 = __shfl_down(iv1,8) 731 | iv1 = iv1 + iv2 732 | iv2 = __shfl_down(iv1,16) 733 | iv1 = iv1 + iv2 734 | 735 | if (laneID == 1) then 736 | istat = atomicadd(alphar, rv1) 737 | istat = atomicadd(alphai, iv1) 738 | endif 739 | 740 | call syncthreads() 741 | 742 | alpha = -dcmplx(0.5, 0.0) * mytau * dcmplx(alphar, alphai) 743 | 744 | do i = tid, N, blockDim%x 745 | y(i) = mytau*y(i) + alpha * x(i) !zaxpy 746 | end do 747 | 748 | end subroutine finish_W_col_kernel 749 | 750 | attributes(global) subroutine stacked_zgemv_N_finish_W(M, N, V, ldv, W, ldw, z1, z2, y, tau, x, y2, finished) 751 | use cudafor 752 | implicit none 753 | integer, value :: M, N, ldv, ldw 754 | complex(8), dimension(ldv, N), device, intent(in) :: V 755 | complex(8), dimension(ldw, N), device, intent(in) :: W 756 | complex(8), dimension(N), device, intent(in) :: z1, z2 757 | !DIR$ IGNORE_TKR y 758 | real(8), dimension(2*M), device :: y 759 | complex(8), device :: tau 760 | complex(8), dimension(M), device, intent(in) :: x 761 | complex(8), dimension(M), device :: y2 762 | integer, device :: finished 763 | 764 | integer :: i, j, tx, ty, istat, nBlocks, tid, laneID, nb 765 | integer, shared :: nFinished 766 | complex(8) :: val1, val2, mytau, alpha 767 | real(8) :: rv1, rv2, iv1, iv2, xr, xi, rsum, isum 768 | real(8), shared :: alphar, alphai 769 | 770 | tx = threadIdx%x 771 | ty = threadIdx%y 772 | 773 | i = 
(blockIdx%x - 1) * blockDim%x + tx 774 | j = (blockIdx%y - 1) * blockDim%y + ty 775 | 776 | nBlocks = gridDim%x * gridDim%y 777 | 778 | if (i <= M .and. j <= 2*N) then 779 | if (j > N) then 780 | val1 = z2(j-N) 781 | val2 = V(i, j-N) 782 | else 783 | val1 = z1(j) 784 | val2 = W(i, j) 785 | endif 786 | xr = dble(val1); xi = dimag(val1) 787 | rv2 = dble(val2); iv2 = dimag(val2) 788 | 789 | rv1 = -rv2 * xr + iv2 * xi 790 | iv1 = -rv2 * xi - iv2 * xr 791 | 792 | istat = atomicadd(y(2*i-1), rv1) 793 | istat = atomicadd(y(2*i), iv1) 794 | endif 795 | 796 | call threadfence() 797 | 798 | nFinished = 0 799 | call syncthreads() 800 | if (tx + ty == 2) nFinished = atomicinc(finished, nBlocks-1) 801 | call syncthreads() 802 | 803 | if ( nFinished < nBlocks - 1) return 804 | 805 | ! Begin finish_W_col work with last block 806 | tid = threadIdx%x + (threadIdx%y - 1) * blockDim%x 807 | laneID = iand(tid, 31) 808 | 809 | if (tid == 1) then 810 | alphar = 0.0_8 811 | alphai = 0.0_8 812 | endif 813 | 814 | call syncthreads() 815 | 816 | rsum = 0.0_8 817 | isum = 0.0_8 818 | mytau = tau 819 | 820 | nb = ceiling(real(M)/(blockDim%x * blockDim%y)) ! number of blocks down column 821 | 822 | i = tid 823 | do j = 1, nb 824 | 825 | ! All threads perform their product, zero if out of bounds 826 | if (i <= M) then 827 | val1 = dconjg(mytau * y2(i)) * x(i) 828 | else 829 | val1 = dcmplx(0.,0.) 830 | endif 831 | 832 | rv1 = dble(val1); iv1 = dimag(val1) 833 | 834 | rsum = rsum + rv1 835 | isum = isum + iv1 836 | 837 | i = i + blockDim%x*blockDim%y 838 | 839 | end do 840 | 841 | ! Partial sum within warps using shuffle 842 | rv1 = rsum 843 | rv2 = __shfl_down(rv1,1) 844 | rv1 = rv1 + rv2 845 | rv2 = __shfl_down(rv1,2) 846 | rv1 = rv1 + rv2 847 | rv2 = __shfl_down(rv1,4) 848 | rv1 = rv1 + rv2 849 | rv2 = __shfl_down(rv1,8) 850 | rv1 = rv1 + rv2 851 | rv2 = __shfl_down(rv1,16) 852 | rv1 = rv1 + rv2 853 | 854 | iv1 = isum 855 | iv2 = __shfl_down(iv1,1) 856 | iv1 = iv1 + iv2 857 | iv2 = __shfl_down(iv1,2) 858 | iv1 = iv1 + iv2 859 | iv2 = __shfl_down(iv1,4) 860 | iv1 = iv1 + iv2 861 | iv2 = __shfl_down(iv1,8) 862 | iv1 = iv1 + iv2 863 | iv2 = __shfl_down(iv1,16) 864 | iv1 = iv1 + iv2 865 | 866 | if (laneID == 1) then 867 | istat = atomicadd(alphar, rv1) 868 | istat = atomicadd(alphai, iv1) 869 | endif 870 | 871 | call syncthreads() 872 | 873 | alpha = -dcmplx(0.5, 0.0) * mytau * dcmplx(alphar, alphai) 874 | 875 | do i = tid, M, blockDim%x * blockDim%y 876 | y2(i) = mytau*y2(i) + alpha * x(i) !zaxpy 877 | end do 878 | 879 | end subroutine stacked_zgemv_N_finish_W 880 | 881 | 882 | end module zhetrd_gpu 883 | 884 | -------------------------------------------------------------------------------- /test_driver/Makefile: -------------------------------------------------------------------------------- 1 | CUDAROOT = ${CUDA_HOME} 2 | 3 | # Flags for GPUs with Volta architecture. Modify cc value as required. 4 | FLAGS = -O3 -mp -pgf90libs -Mcuda=cc70,cuda10.1 -Mlarge_arrays 5 | 6 | # extract PGI major version 7 | PGI_MAJOR_VERSION := $(shell pgfortran -V | egrep -o '[0-9]+\.[0-9]+' | sed -e 's/\.[0-9]\+//g') 8 | 9 | # PGI 19.1+ uses LLVM backend so MKL has to be linked differently 10 | ifeq ($(shell expr ${PGI_MAJOR_VERSION} \< 19),1) 11 | MKL_THREAD=mkl_pgi_thread 12 | else 13 | MKL_THREAD=mkl_intel_thread 14 | endif 15 | 16 | INCS = -I../lib_eigsolve 17 | LIBS = -Mcudalib=cublas,cusolver -L${CUDAROOT}/lib64 -lnvToolsExt 18 | LIBS += ../lib_eigsolve/lib_eigsolve.a 19 | 20 | # Populate this section for your LAPACK installation. 
Default here is for MKL.
21 | LAPACKROOT = ${MKLROOT}
22 | INCS += -I${LAPACKROOT}/include
23 | LIBS += -L${LAPACKROOT}/lib/intel64 -lmkl_intel_lp64 -lmkl_core -l${MKL_THREAD} -pgf90libs -lpthread -lm -ldl
24 | 
25 | # To enable comparison with MAGMA, uncomment and/or populate the following lines:
26 | #MAGMAROOT =
27 | #INCS += -I${MAGMAROOT}/include
28 | #LIBS += -L${MAGMAROOT}/lib -lmagma
29 | #OPTFLAGS += -DHAVE_MAGMA
30 | 
31 | all: test_zhegvdx test_dsygvdx
32 | 
33 | test_zhegvdx: test_zhegvdx.F90
34 | 	pgcc -c wallclock.c
35 | 	pgf90 -c ${FLAGS} toolbox.F90
36 | 	pgf90 -o test_zhegvdx test_zhegvdx.F90 toolbox.o wallclock.o ${LIBS} ${FLAGS} ${OPTFLAGS} -pgf90libs ${INCS}
37 | 
38 | test_dsygvdx: test_dsygvdx.F90
39 | 	pgcc -c wallclock.c
40 | 	pgf90 -c ${FLAGS} toolbox.F90
41 | 	pgf90 -o test_dsygvdx test_dsygvdx.F90 toolbox.o wallclock.o ${LIBS} ${FLAGS} ${OPTFLAGS} -pgf90libs ${INCS}
42 | 
43 | clean:
44 | 	rm -f test_zhegvdx test_dsygvdx *.mod *.o
45 | 
--------------------------------------------------------------------------------
/test_driver/test_dsygvdx.F90:
--------------------------------------------------------------------------------
1 | !
2 | ! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 | !
4 | !
5 | ! Permission is hereby granted, free of charge, to any person obtaining a
6 | ! copy of this software and associated documentation files (the "Software"),
7 | ! to deal in the Software without restriction, including without limitation
8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 | ! and/or sell copies of the Software, and to permit persons to whom the
10 | ! Software is furnished to do so, subject to the following conditions:
11 | !
12 | ! The above copyright notice and this permission notice shall be included in
13 | ! all copies or substantial portions of the Software.
14 | !
15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 | ! DEALINGS IN THE SOFTWARE.
22 | !
23 | 
24 | module funcs
25 | contains
26 | 
27 | ! Creates pseudo-random positive-definite symmetric matrix
28 | subroutine create_random_symmetric_pd(A, N)
29 |   use cudafor
30 |   use cublas
31 |   real(8), allocatable, dimension(:,:) :: A, temp
32 |   real(8), allocatable, dimension(:,:), device :: A_d, temp_d
33 |   real(8) :: rv
34 |   integer :: i, j, N
35 | 
36 |   allocate(A(N,N))
37 |   allocate(temp(N,N))
38 | 
39 |   ! Create general symmetric temp
40 |   do j = 1, N
41 |     do i = 1, N
42 |       if (i > j) then
43 |         call random_number(rv)
44 |         temp(i,j) = rv
45 |         temp(j,i) = rv
46 |       else if (i == j) then
47 |         call random_number(rv)
48 |         temp(i,j) = rv
49 |       end if
50 |     end do
51 |   end do
52 | 
53 |   allocate(A_d, source = A)
54 |   allocate(temp_d, source = temp)
55 | 
56 |   ! Multiply temp by transpose of temp to get positive definite A
57 |   call cublasdgemm('N', 'T', N, N, N, 1.d0, temp_d, N, temp_d, N, 0.d0, A_d, N)
58 | 
59 |   A = A_d
60 |   deallocate(temp)
61 |   deallocate(A_d)
62 |   deallocate(temp_d)
63 | 
64 | end subroutine
65 | end module funcs
66 | 
67 | ! cusolverDnDsygvdx was added in CUDA 10.1
This line can be simplified once the support for PGI 19.4 and earlier has been dropped 69 | #if (((__PGIF90__ < 19 || (__PGIF90__ == 19 && __PGIF90_MINOR__ < 7)) && __CUDA_API_VERSION >= 10010) || ((__PGIF90__ > 19 || (__PGIF90__ == 19 && __PGIF90_MINOR__ >= 7)) && __CUDA_VERSION >= 10010)) 70 | #define HAVE_CUSOLVERDNDSYGVDX 71 | #endif 72 | 73 | program main 74 | use cudafor 75 | use cublas 76 | use cusolverDn 77 | use eigsolve_vars, ONLY: init_eigsolve_gpu 78 | use dsygvdx_gpu 79 | use nvtx_inters 80 | use funcs 81 | use compare_utils 82 | implicit none 83 | 84 | integer :: N, M, i, j, info, lda, istat 85 | integer :: n1, n2, m1, m2, lda1, lda2 86 | integer :: lwork_d, lrwork_d, lwork, lrwork, liwork, il, iu 87 | #ifdef HAVE_CUSOLVERDNDSYGVDX 88 | integer :: h_meig 89 | #endif 90 | character(len=20) :: arg 91 | character(:),allocatable :: file1,file2 92 | real(8) :: ts, te, wallclock 93 | real(8), dimension(:,:), allocatable :: A1, A2, Aref 94 | real(8), dimension(:,:), allocatable :: B1, B2, Bref 95 | real(8), dimension(:,:), allocatable, pinned :: Z1, Z2 96 | real(8), dimension(:,:), allocatable, device :: A2_d, B2_d, Z2_d 97 | real(8), dimension(:), allocatable, pinned :: work 98 | real(8), dimension(:), allocatable, pinned :: w1, w2, rwork 99 | integer, dimension(:), allocatable, pinned :: iwork 100 | real(8), dimension(:), allocatable, device :: work_d 101 | real(8), dimension(:), allocatable, device :: w2_d, rwork_d 102 | integer, device :: devInfo_d 103 | type(cusolverDnHandle) :: h 104 | 105 | ! Parse command line arguments 106 | i = command_argument_count() 107 | 108 | 109 | if (i == 1) then 110 | ! If N is provided, generate random symmetric matrices for A and B 111 | print*, "Using randomly-generated matrices..." 112 | call get_command_argument(1, arg) 113 | read(arg, *) N 114 | lda = N 115 | 116 | ! Create random positive-definite hermetian matrices on host 117 | call create_random_symmetric_pd(Aref, N) 118 | call create_random_symmetric_pd(Bref, N) 119 | 120 | elseif (i ==2) then 121 | print*, "Reading matrices from files ..." 122 | print*, "Unformatted files with n,m,lda " 123 | print*, "A(lda,n) B(lda,n)" 124 | call get_command_argument(1, arg) 125 | file1=trim(arg) 126 | call get_command_argument(2, arg) 127 | file2=trim(arg) 128 | open(UNIT=13, FILE=file1, ACTION="read", FORM="unformatted") 129 | open(UNIT=14, FILE=file2, ACTION="read", FORM="unformatted") 130 | read(13) n1,m1,lda1 131 | read(14) n2,m2,lda2 132 | if( n1/=n2 .or. m1/=m2 .or. lda1 /= lda2) then 133 | print *,"expecting A and B to have same N,M,LDA" 134 | call exit 135 | end if 136 | N=n1 137 | M=m1 138 | LDA=lda1 139 | print *,"n,m,lda from files:",n,m,lda 140 | allocate(Aref(lda,N)) 141 | allocate(Bref(lda,N)) 142 | read(13)Aref(1:n,1:n) 143 | read(14)Bref(1:n,1:n) 144 | close(13) 145 | close(14) 146 | else 147 | print*, "Usage:\n\t ./main [N]" 148 | call exit 149 | endif 150 | 151 | print*, "Running with N = ", N 152 | 153 | ! Allocate/Copy matrices to device 154 | allocate(A1, source = Aref) 155 | allocate(A2, source = Aref) 156 | allocate(A2_d, source = Aref) 157 | allocate(B1, source = Bref) 158 | allocate(B2, source = Bref) 159 | allocate(B2_d, source = Bref) 160 | allocate(Z1, source = Aref) 161 | allocate(Z2, source = Aref) 162 | allocate(Z2_d, source = Aref) 163 | 164 | allocate(w1(N), w2(N)) 165 | allocate(w2_d, source = w2) 166 | 167 | ! 
Initialize solvers 168 | call init_eigsolve_gpu() 169 | 170 | istat = cublasInit 171 | if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublas initialization failed' 172 | 173 | istat = cusolverDnCreate(h) 174 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'handle creation failed' 175 | 176 | #ifdef HAVE_MAGMA 177 | call magmaf_init 178 | #endif 179 | 180 | 181 | !! Solving generalized eigenproblem using DSYGVD 182 | ! CASE 1: CPU _____________________________________________ 183 | print* 184 | print*, "CPU_____________________" 185 | lwork = 1 + 6*N + 2*N*N 186 | liwork = 3 + 5*N 187 | allocate(iwork(liwork)) 188 | allocate(work(lwork)) 189 | call dsygvd(1, 'V', 'U', N, A1, lda, B1, lda, w1, work, -1, iwork, -1, istat) 190 | if (istat /= 0) write(*,*) 'CPU dsygvd worksize failed' 191 | lwork = work(1); liwork = iwork(1) 192 | deallocate(work, iwork) 193 | allocate(work(lwork), iwork(liwork)) 194 | 195 | A1 = Aref 196 | B1 = Bref 197 | ! Run once before timing 198 | call dsygvd(1, 'V', 'U', N, A1, lda, B1, lda, w1, work, lwork, iwork, liwork, istat) 199 | if (istat /= 0) write(*,*) 'CPU dsygvd failed. istat = ', istat 200 | 201 | A1 = Aref 202 | B1 = Bref 203 | ts = wallclock() 204 | call nvtxStartRange("CPU DSYGVD",1) 205 | call dsygvd(1, 'V', 'U', N, A1, lda, B1, lda, w1, work, lwork, iwork, liwork, istat) 206 | call nvtxEndRange 207 | te = wallclock() 208 | if (istat /= 0) write(*,*) 'CPU dsygvd failed. istat = ', istat 209 | 210 | print*, "\tTime for CPU dsygvd = ", (te - ts)*1000.0 211 | print* 212 | 213 | #ifdef HAVE_MAGMA 214 | ! CASE 2: using Magma ___________________________________________ 215 | print* 216 | print*, "MAGMA_____________________" 217 | call magmaf_dsygvd(1, 'V', 'U', N, A2, lda, B2, lda, w2, work, -1, iwork, -1, istat) 218 | if (istat /= 0) write(*,*) 'magmaf_dsygvd buffer sizes failed',istat 219 | lwork = work(1); liwork = iwork(1); deallocate(work, iwork) ! adopt MAGMA's queried workspace sizes 220 | allocate(work(lwork), iwork(liwork)) 221 | 222 | ts = wallclock() 223 | call nvtxStartRange("MAGMA",0) 224 | call magmaf_dsygvd(1, 'V', 'U', N, A2, lda, B2, lda, w2, work, lwork, iwork, liwork, istat) 225 | call nvtxEndRange 226 | te = wallclock() 227 | if (istat /= 0) write(*,*) 'magmaf_dsygvd failed',istat 228 | 229 | print*, "evalues/evector accuracy: (compared to CPU results)" 230 | call compare(w1, w2, N) 231 | call compare(A1, A2, N, N) 232 | print* 233 | 234 | print*, "Time for magmaf_dsygvd = ", (te - ts)*1000.0 235 | print* 236 | #endif 237 | 238 | 239 | !
CASE 3: using Cusolver __________________________________________________________________ 240 | !print* 241 | print*, "cuSOLVER_____________________" 242 | 243 | #ifdef HAVE_CUSOLVERDNDSYGVDX 244 | print *,"CUSOLVERDNDSYGVDX" 245 | il = 1 246 | iu = M 247 | istat = cusolverDnDsygvdx_bufferSize(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, 0.D0, 0.D0, il, iu, h_meig, w2_d, lwork_d) 248 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnDsygvdx_buffersize failed' 249 | #else 250 | istat = cusolverDnDsygvd_bufferSize(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, w2_d, lwork_d) 251 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnDsygvd_buffersize failed' 252 | #endif 253 | allocate(work_d(lwork_d)) 254 | 255 | A2 = Aref 256 | B2 = Bref 257 | w2 = 0 258 | A2_d = A2 259 | B2_d = B2 260 | w2_d = 0 261 | ts = wallclock() 262 | call nvtxStartRange("cuSOLVER",5) 263 | #ifdef HAVE_CUSOLVERDNDSYGVDX 264 | istat = cusolverDnDsygvdx(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, 0.D0, 0.D0, il, iu, h_meig, w2_d, work_d, lwork_d, devInfo_d) 265 | #else 266 | istat = cusolverDnDsygvd(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, w2_d, work_d, lwork_d, devInfo_d) 267 | #endif 268 | call nvtxEndRange 269 | te = wallclock() 270 | 271 | print*, "evalues/evector accuracy: (compared to CPU results)" 272 | w2 = w2_d 273 | A2 = A2_d 274 | call compare(w1, w2, iu) 275 | call compare(A1, A2, N, iu) 276 | print* 277 | 278 | #ifdef HAVE_CUSOLVERDNDSYGVDX 279 | print*, "Time for cusolverDnDsygvdx = ", (te - ts)*1000.0 280 | #else 281 | print*, "Time for cusolverDnDsygvd = ", (te - ts)*1000.0 282 | #endif 283 | print* 284 | istat = devInfo_d 285 | #ifdef HAVE_CUSOLVERDNDSYGVDX 286 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnDsygvdx failed' 287 | #else 288 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnDsygvd failed' 289 | #endif 290 | 291 | 292 | ! CASE 4: using CUSTOM ____________________________________________________________________ 293 | print* 294 | print*, "CUSTOM_____________________" 295 | A2 = Aref 296 | B2 = Bref 297 | w2 = 0 298 | A2_d = A2 299 | B2_d = B2 300 | w2_d = w2 301 | il = 1 302 | iu = M 303 | 304 | deallocate(work, iwork) 305 | lwork = 1+6*N+2*N*N 306 | liwork = 3+5*N 307 | allocate(work(lwork), iwork(liwork)) 308 | 309 | deallocate(work_d) 310 | lwork_d = 2*64*64 + 66 * N 311 | allocate(work_d(1*lwork_d)) 312 | 313 | ts = wallclock() 314 | call nvtxStartRange("Custom",0) 315 | call dsygvdx_gpu(N, A2_d, lda, B2_d, lda, Z2_d, lda, il, iu, w2_d, work_d, lwork_d, & 316 | work, lwork, iwork, liwork, Z2, lda, w2, istat) 317 | call nvtxEndRange 318 | te = wallclock() 319 | 320 | print*, "evalues/evector accuracy: (compared to CPU results)" 321 | call compare(w1, w2, iu) 322 | call compare(A1, Z2, N, iu) 323 | print* 324 | 325 | print*, "Time for CUSTOM dsygvd/x = ", (te - ts)*1000.0 326 | if (istat /= 0) write(*,*) 'dsygvdx_gpu failed' 327 | 328 | end program 329 | -------------------------------------------------------------------------------- /test_driver/test_zhegvdx.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! 
Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module funcs 25 | contains 26 | 27 | ! Creates pseudo-random positive-definite hermetian matrix 28 | subroutine create_random_hermetian_pd(A, N) 29 | use cudafor 30 | use cublas 31 | complex(8), allocatable, dimension(:,:) :: A, temp 32 | complex(8), allocatable, dimension(:,:), device :: A_d, temp_d 33 | complex(8) :: val 34 | real(8) :: rv, iv 35 | integer :: i, j, N 36 | 37 | allocate(A(N,N)) 38 | allocate(temp(N,N)) 39 | 40 | ! Create general hermetian temp 41 | do j = 1, N 42 | do i = 1, N 43 | if (i > j) then 44 | call random_number(rv) 45 | call random_number(iv) 46 | temp(i,j) = cmplx(rv, iv, 8) 47 | temp(j,i) = conjg(temp(i,j)) 48 | else if (i == j) then 49 | call random_number(rv) 50 | temp(i,j) = rv 51 | end if 52 | end do 53 | end do 54 | 55 | allocate(A_d, source = A) 56 | allocate(temp_d, source = temp) 57 | 58 | ! Multiply temp by conjugate transpose of temp to get positive definite hermetian A 59 | call cublaszgemm('N', 'C', N, N, N, cmplx(1.0, 0.0, 8), temp_d, N, temp_d, N, cmplx(0.0, 0.0, 8), A_d, N) 60 | 61 | A = A_d 62 | deallocate(temp) 63 | deallocate(A_d) 64 | deallocate(temp_d) 65 | 66 | end subroutine 67 | end module funcs 68 | 69 | ! cusovlerDnZhegvdx was added in CUDA 10.1 70 | ! PGI did not expose CUDA_VERSION before version 19.7. 
This line can be simplified once the support for PGI 19.4 and earlier has been dropped 71 | #if (((__PGIF90__ < 19 || (__PGIF90__ == 19 && __PGIF90_MINOR__ < 7)) && __CUDA_API_VERSION >= 10010) || ((__PGIF90__ > 19 || (__PGIF90__ == 19 && __PGIF90_MINOR__ >= 7)) && __CUDA_VERSION >= 10010)) 72 | #define HAVE_CUSOLVERDNZHEGVDX 73 | #endif 74 | 75 | program main 76 | use cudafor 77 | use cublas 78 | use cusolverDn 79 | use eigsolve_vars, ONLY: init_eigsolve_gpu 80 | use zhegvdx_gpu 81 | use nvtx_inters 82 | use funcs 83 | use compare_utils 84 | implicit none 85 | 86 | integer :: N, M, i, j, info, lda, istat 87 | integer :: lwork_d, lrwork_d, lwork, lrwork, liwork, il, iu 88 | #ifdef HAVE_CUSOLVERDNZHEGVDX 89 | integer :: h_meig 90 | #endif 91 | character(len=20) :: arg 92 | real(8) :: ts, te, wallclock 93 | complex(8), dimension(:,:), allocatable :: A1, A2, Aref 94 | complex(8), dimension(:,:), allocatable :: B1, B2, Bref 95 | complex(8), dimension(:,:), allocatable, pinned :: Z1, Z2 96 | complex(8), dimension(:,:), allocatable, device :: A2_d, B2_d, Z2_d 97 | complex(8), dimension(:), allocatable, pinned :: work 98 | real(8), dimension(:), allocatable, pinned :: w1, w2, rwork 99 | integer, dimension(:), allocatable, pinned :: iwork 100 | complex(8), dimension(:), allocatable, device :: work_d 101 | real(8), dimension(:), allocatable, device :: w2_d, rwork_d 102 | integer, device :: devInfo_d 103 | type(cusolverDnHandle) :: h 104 | 105 | ! Parse command line arguments 106 | i = command_argument_count() 107 | 108 | if (i >= 1) then 109 | ! If N is provided, generate random hermetian matrices for A and B 110 | print*, "Using randomly-generated matrices..." 111 | call get_command_argument(1, arg) 112 | read(arg, *) N 113 | lda = N 114 | 115 | ! Create random positive-definite hermetian matrices on host 116 | call create_random_hermetian_pd(Aref, N) 117 | call create_random_hermetian_pd(Bref, N) 118 | 119 | else 120 | print*, "Usage:\n\t ./main [N]" 121 | call exit 122 | endif 123 | 124 | print*, "Running with N = ", N 125 | 126 | ! Allocate/Copy matrices to device 127 | allocate(A1, source = Aref) 128 | allocate(A2, source = Aref) 129 | allocate(A2_d, source = Aref) 130 | allocate(B1, source = Bref) 131 | allocate(B2, source = Bref) 132 | allocate(B2_d, source = Bref) 133 | allocate(Z1, source = Aref) 134 | allocate(Z2, source = Aref) 135 | allocate(Z2_d, source = Aref) 136 | 137 | allocate(w1(N), w2(N)) 138 | allocate(w2_d, source = w2) 139 | 140 | ! Initialize solvers 141 | call init_eigsolve_gpu() 142 | 143 | istat = cublasInit 144 | if (istat /= CUBLAS_STATUS_SUCCESS) write(*,*) 'cublas intialization failed' 145 | 146 | istat = cusolverDnCreate(h) 147 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'handle creation failed' 148 | 149 | #ifdef HAVE_MAGMA 150 | call magmaf_init 151 | #endif 152 | 153 | !! Solving generalized eigenproblem using ZHEGVD 154 | ! CASE 1: CPU _____________________________________________ 155 | print* 156 | print*, "CPU_____________________" 157 | lwork = 2*N + N*N 158 | lrwork = 1 + 5*N + 2*N*N 159 | liwork = 3 + 5*N 160 | allocate(iwork(liwork)) 161 | allocate(rwork(lrwork)) 162 | allocate(work(Lwork)) 163 | call zhegvd(1, 'V', 'U', N, A1, lda, B1, lda, w1, work, -1, rwork, -1, iwork, -1, istat) 164 | if (istat /= 0) write(*,*) 'CPU zhegvd worksize failed' 165 | lwork = work(1); lrwork = rwork(1); liwork = iwork(1) 166 | deallocate(work, rwork, iwork ) 167 | allocate(work(lwork), rwork(lrwork), iwork(liwork)) 168 | 169 | A1 = Aref 170 | B1 = Bref 171 | ! 
Run once before timing 172 | call zhegvd(1, 'V', 'U', N, A1, lda, B1, lda, w1, work, lwork, rwork, lrwork, iwork, liwork, istat) 173 | if (istat /= 0) write(*,*) 'CPU zhegvd failed. istat = ', istat 174 | 175 | A1 = Aref 176 | B1 = Bref 177 | ts = wallclock() 178 | call nvtxStartRange("CPU ZHEGVD",1) 179 | call zhegvd(1, 'V', 'U', N, A1, lda, B1, lda, w1, work, lwork, rwork, lrwork, iwork, liwork, istat) 180 | call nvtxEndRange 181 | te = wallclock() 182 | if (istat /= 0) write(*,*) 'CPU zhegvd failed. istat = ', istat 183 | 184 | print*, "\tTime for CPU zhegvd = ", (te - ts)*1000.0 185 | print* 186 | 187 | #ifdef HAVE_MAGMA 188 | ! CASE 2: using Magma ___________________________________________ 189 | print* 190 | print*, "MAGMA_____________________" 191 | call magmaf_zhegvd(1, 'V', 'U', N, A2, lda, B2, lda, w2, work, -1, rwork, -1, iwork, -1, istat) 192 | if (istat /= 0) write(*,*) 'magmaf_zhegvd buffer sizes failed',istat 193 | deallocate(work, rwork, iwork) 194 | allocate(work(lwork), rwork(lrwork), iwork(liwork)) 195 | 196 | ts = wallclock() 197 | call nvtxStartRange("MAGMA",0) 198 | call magmaf_zhegvd(1, 'V', 'U', N, A2, lda, B2, lda, w2, work, lwork, rwork, lrwork, iwork, liwork, istat) 199 | call nvtxEndRange 200 | te = wallclock() 201 | if (istat /= 0) write(*,*) 'magmaf_zhegvd failed',istat 202 | 203 | print*, "evalues/evector accuracy: (compared to CPU results)" 204 | call compare(w1, w2, N) 205 | call compare(A1, A2, N, N) 206 | print* 207 | 208 | print*, "Time for magmaf_zhegvd = ", (te - ts)*1000.0 209 | print* 210 | #endif 211 | 212 | 213 | ! CASE 3: using Cusolver __________________________________________________________________ 214 | !print* 215 | print*, "cuSOLVER_____________________" 216 | 217 | #ifdef HAVE_CUSOLVERDNZHEGVDX 218 | il = 1 219 | iu = N 220 | istat = cusolverDnZhegvdx_bufferSize(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, 0.D0, 0.D0, il, iu, h_meig, w2_d, lwork_d) 221 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnZhegvdx_buffersize failed' 222 | #else 223 | istat = cusolverDnZhegvd_bufferSize(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, w2_d, lwork_d) 224 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnZhegvd_buffersize failed' 225 | #endif 226 | 227 | allocate(work_d(lwork_d)) 228 | 229 | A2 = Aref 230 | B2 = Bref 231 | w2 = 0 232 | A2_d = A2 233 | B2_d = B2 234 | w2_d = 0 235 | ts = wallclock() 236 | call nvtxStartRange("cuSOLVER",5) 237 | #ifdef HAVE_CUSOLVERDNZHEGVDX 238 | istat = cusolverDnZhegvdx(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUSOLVER_EIG_RANGE_I, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, 0.D0, 0.D0, il, iu, h_meig, w2_d, work_d, lwork_d, devInfo_d) 239 | #else 240 | istat = cusolverDnZhegvd(h, CUSOLVER_EIG_TYPE_1, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_UPPER, N, A2_d, lda, B2_d, lda, w2_d, work_d, lwork_d, devInfo_d) 241 | #endif 242 | call nvtxEndRange 243 | te = wallclock() 244 | 245 | print*, "evalues/evector accuracy: (compared to CPU results)" 246 | w2 = w2_d 247 | A2 = A2_d 248 | call compare(w1, w2, N) 249 | call compare(A1, A2, N, N) 250 | print* 251 | 252 | #ifdef HAVE_CUSOLVERDNZHEGVDX 253 | print*, "Time for cusolverDnZhegvdx = ", (te - ts)*1000.0 254 | #else 255 | print*, "Time for cusolverDnZhegvd = ", (te - ts)*1000.0 256 | #endif 257 | print* 258 | istat = devInfo_d 259 | #ifdef HAVE_CUSOLVERDNZHEGVDX 260 | if (istat /= CUSOLVER_STATUS_SUCCESS) 
write(*,*) 'cusolverDnZhegvdx failed' 261 | #else 262 | if (istat /= CUSOLVER_STATUS_SUCCESS) write(*,*) 'cusolverDnZhegvd failed' 263 | #endif 264 | 265 | 266 | ! CASE 4: using CUSTOM ____________________________________________________________________ 267 | print* 268 | print*, "CUSTOM_____________________" 269 | A2 = Aref 270 | B2 = Bref 271 | w2 = 0 272 | A2_d = A2 273 | B2_d = B2 274 | w2_d = w2 275 | il = 1 276 | iu = N 277 | 278 | deallocate(work, rwork, iwork) 279 | lwork = N 280 | lrwork = 1+5*N+2*N*N 281 | liwork = 3+5*N 282 | allocate(work(lwork), rwork(lrwork), iwork(liwork)) 283 | 284 | deallocate(work_d) 285 | lwork_d = 2*64*64 + 65 * N 286 | lrwork_d = N 287 | allocate(work_d(1*lwork_d)) 288 | allocate(rwork_d(1*lrwork_d)) 289 | 290 | ts = wallclock() 291 | call nvtxStartRange("Custom",0) 292 | call zhegvdx_gpu(N, A2_d, lda, B2_d, lda, Z2_d, lda, il, iu, w2_d, work_d, lwork_d, rwork_d, lrwork_d, & 293 | work, lwork, rwork, lrwork, iwork, liwork, Z2, lda, w2, istat) 294 | call nvtxEndRange 295 | te = wallclock() 296 | 297 | print*, "evalues/evector accuracy: (compared to CPU results)" 298 | call compare(w1, w2, iu) 299 | call compare(A1, Z2, N, iu) 300 | print* 301 | 302 | print*, "Time for CUSTOM zhegvd/x = ", (te - ts)*1000.0 303 | if (istat /= 0) write(*,*) 'zhegvdx_gpu failed' 304 | 305 | end program 306 | -------------------------------------------------------------------------------- /test_driver/toolbox.F90: -------------------------------------------------------------------------------- 1 | ! 2 | ! Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | ! 4 | ! 5 | ! Permission is hereby granted, free of charge, to any person obtaining a 6 | ! copy of this software and associated documentation files (the "Software"), 7 | ! to deal in the Software without restriction, including without limitation 8 | ! the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | ! and/or sell copies of the Software, and to permit persons to whom the 10 | ! Software is furnished to do so, subject to the following conditions: 11 | ! 12 | ! The above copyright notice and this permission notice shall be included in 13 | ! all copies or substantial portions of the Software. 14 | ! 15 | ! THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | ! IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | ! FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 | ! THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | ! LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | ! FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | ! DEALINGS IN THE SOFTWARE. 22 | ! 23 | 24 | module compare_utils 25 | 26 | interface compare 27 | module procedure compare_real_1d_cpu 28 | ! Note: compare_real/complex_2d_cpu compares absolute values in order to deal with 29 | ! 
eigenvectors which may differ only by sign 30 | module procedure compare_real_2d_cpu 31 | module procedure compare_complex_2d_cpu 32 | end interface compare 33 | 34 | contains 35 | 36 | subroutine compare_real_1d_cpu(A1_h,A2_h,N) 37 | implicit none 38 | real(8), dimension(:) :: A1_h, A2_h 39 | real(8), dimension(:), allocatable :: A1, A2 40 | real(8) :: maxerr,perr,l2normerr,norm,buf 41 | integer :: i,j,k,N,imax 42 | character (len=4) :: itcount 43 | character (len=1) :: proc 44 | 45 | allocate(A1, source = A1_h) 46 | allocate(A2, source = A2_h) 47 | 48 | l2normerr = 0.d0 49 | norm = 0.d0 50 | maxerr = 0.d0 51 | imax=1 52 | do i=1, N 53 | if(abs(A1(i)) >= 1e-10) then 54 | perr = abs(A1(i) - A2(i))/abs(A1(i))*100.d0 55 | norm = norm + abs(A1(i)*A1(i)); 56 | l2normerr = l2normerr + abs((A1(i) - A2(i))*(A1(i) - A2(i))) 57 | else 58 | perr = 0.d0 59 | endif 60 | if(perr>maxerr .and. A1(i)/=0.0d0 .and. A2(i)/=0.0d0) then 61 | maxerr = perr 62 | imax = i 63 | endif 64 | enddo 65 | 66 | norm = sqrt(norm) 67 | l2normerr = sqrt(l2normerr) 68 | if(l2normerr /= 0.d0) then 69 | l2normerr = l2normerr/norm 70 | write(*,"(A16,2X,ES10.3,A12,ES10.3,A6,I5,A6,2X,E20.14,2X,A6,2X,E20.14)") & 71 | "l2norm error",l2normerr,"max error",maxerr,"% at",imax,"cpu=",A1(imax),"gpu=",A2(imax) 72 | else 73 | write(*,"(A16)") "EXACT MATCH" 74 | endif 75 | 76 | deallocate(A1, A2) 77 | 78 | end subroutine compare_real_1d_cpu 79 | 80 | subroutine compare_real_2d_cpu(A1_h,A2_h,N, M) 81 | implicit none 82 | real(8), dimension(:,:) :: A1_h, A2_h 83 | real(8), dimension(:,:), allocatable :: A1, A2 84 | real(8) :: maxerr,perr,l2normerr,norm,buf 85 | integer :: i,j,k,imax,jmax,kmax,N,M 86 | character (len=4) :: itcount 87 | character (len=1) :: proc 88 | 89 | allocate(A1, source = A1_h) 90 | allocate(A2, source = A2_h) 91 | 92 | l2normerr = 0.d0 93 | norm = 0.d0 94 | maxerr = 0.d0 95 | imax=1 96 | jmax=1 97 | kmax=1 98 | do j=1, M 99 | do i=1, N 100 | if(abs(A1(i,j)) >= 1e-10) then 101 | perr = abs(abs(A1(i,j)) - abs(A2(i,j)))/abs(A1(i,j))*100.d0 102 | norm = norm + abs(A1(i,j)*A1(i,j)); 103 | l2normerr = l2normerr + abs((abs(A1(i,j)) - abs(A2(i,j)))*(abs(A1(i,j)) - abs(A2(i,j)))) 104 | else 105 | perr = 0.d0 106 | endif 107 | if(perr>maxerr .and. A1(i,j)/=0.0d0 .and. 
A2(i,j)/=0.0d0) then 108 | maxerr = perr 109 | imax = i 110 | jmax = j 111 | endif 112 | enddo 113 | enddo 114 | 115 | norm = sqrt(norm) 116 | l2normerr = sqrt(l2normerr) 117 | if(l2normerr /= 0.d0) then 118 | l2normerr = l2normerr/norm 119 | write(*,"(A16,2X,ES10.3,A12,ES10.3,A6,I5,I5,A6,2X,E20.14,1X,2X,A6,2X,E20.14,1X)") & 120 | "l2norm error",l2normerr,"max error",maxerr,"% at",imax,jmax,"cpu=",REAL(A1(imax,jmax)),"gpu=",REAL(A2(imax,jmax)) 121 | else 122 | write(*,"(A16)") "EXACT MATCH" 123 | endif 124 | 125 | deallocate(A1, A2) 126 | 127 | end subroutine compare_real_2d_cpu 128 | 129 | subroutine compare_complex_2d_cpu(A1_h,A2_h,N, M) 130 | implicit none 131 | complex(8), dimension(:,:) :: A1_h, A2_h 132 | complex(8), dimension(:,:), allocatable :: A1, A2 133 | real(8) :: maxerr,perr,l2normerr,norm,buf 134 | integer :: i,j,k,imax,jmax,kmax,N,M 135 | character (len=4) :: itcount 136 | character (len=1) :: proc 137 | 138 | allocate(A1, source = A1_h) 139 | allocate(A2, source = A2_h) 140 | 141 | l2normerr = 0.d0 142 | norm = 0.d0 143 | maxerr = 0.d0 144 | imax=1 145 | jmax=1 146 | kmax=1 147 | do j=1, M 148 | do i=1, N 149 | if(abs(A1(i,j)) >= 1e-10) then 150 | perr = abs(abs(A1(i,j)) - abs(A2(i,j)))/abs(A1(i,j))*100.d0 151 | norm = norm + abs(A1(i,j)*A1(i,j)); 152 | l2normerr = l2normerr + abs((abs(A1(i,j)) - abs(A2(i,j)))*(abs(A1(i,j)) - abs(A2(i,j)))) 153 | else 154 | perr = 0.d0 155 | endif 156 | if(perr>maxerr .and. A1(i,j)/=0.0d0 .and. A2(i,j)/=0.0d0) then 157 | maxerr = perr 158 | imax = i 159 | jmax = j 160 | endif 161 | enddo 162 | enddo 163 | 164 | norm = sqrt(norm) 165 | l2normerr = sqrt(l2normerr) 166 | if(l2normerr /= 0.d0) then 167 | l2normerr = l2normerr/norm 168 | write(*,"(A16,2X,ES10.3,A12,ES10.3,A6,I5,I5,A6,2X,E20.14,1X,E20.14,2X,A6,2X,E20.14,1X,E20.14)") & 169 | "l2norm error",l2normerr,"max error",maxerr,"% at",imax,jmax,"cpu=",REAL(A1(imax,jmax)),AIMAG(A1(imax,jmax)),"gpu=",REAL(A2(imax,jmax)),AIMAG(A2(imax,jmax)) 170 | else 171 | write(*,"(A16)") "EXACT MATCH" 172 | endif 173 | 174 | deallocate(A1, A2) 175 | 176 | end subroutine compare_complex_2d_cpu 177 | end module compare_utils 178 | -------------------------------------------------------------------------------- /test_driver/wallclock.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a 6 | * copy of this software and associated documentation files (the "Software"), 7 | * to deal in the Software without restriction, including without limitation 8 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 | * and/or sell copies of the Software, and to permit persons to whom the 10 | * Software is furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL 18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 | * DEALINGS IN THE SOFTWARE. 22 | */ 23 | 24 | #include <stdio.h> 25 | 26 | #include <stdlib.h> 27 | #include <stddef.h> 28 | #include <sys/time.h> /* header names were lost in extraction and are reconstructed; <sys/time.h> is required for gettimeofday() */ 29 | 30 | double wallclock_(void) 31 | { 32 | struct timeval tv; 33 | struct timezone tz; 34 | double t; 35 | 36 | gettimeofday(&tv, &tz); 37 | 38 | t = (double)tv.tv_sec; 39 | t += ((double)tv.tv_usec)/1000000.0; 40 | 41 | return t; 42 | } 43 | --------------------------------------------------------------------------------
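For reference, below is a minimal sketch of how an application might drive the custom solver, following the same calling pattern as `test_dsygvdx.F90` above. It is an untested outline, not part of the library: the matrix size, the fill step, and the workspace sizes (copied from the test driver) are placeholder assumptions, and the timing/NVTX scaffolding is omitted.

! Hedged sketch only -- mirrors the dsygvdx_gpu call in test_dsygvdx.F90.
program sketch_dsygvdx_gpu
  use cudafor
  use eigsolve_vars, only: init_eigsolve_gpu
  use dsygvdx_gpu
  implicit none
  integer, parameter :: N = 1024, lda = N, il = 1, iu = 32   ! assumed problem sizes
  real(8), allocatable :: A(:,:), B(:,:)
  real(8), allocatable, pinned :: Z(:,:), w(:), work(:)
  integer, allocatable, pinned :: iwork(:)
  real(8), allocatable, device :: A_d(:,:), B_d(:,:), Z_d(:,:), w_d(:), work_d(:)
  integer :: lwork, liwork, lwork_d, istat

  allocate(A(lda,N), B(lda,N), Z(lda,N), w(N))
  ! ... populate the upper triangles of A and B here; B must be positive definite ...

  call init_eigsolve_gpu()

  ! Host and device workspace sizes as used by the test driver
  lwork   = 1 + 6*N + 2*N*N
  liwork  = 3 + 5*N
  lwork_d = 2*64*64 + 66*N
  allocate(work(lwork), iwork(liwork), work_d(lwork_d))

  allocate(A_d, source = A)
  allocate(B_d, source = B)
  allocate(Z_d(lda,N), w_d(N))

  ! Compute eigenvalues il..iu and associated eigenvectors; results are
  ! returned in w_d/Z_d on the device and in the pinned host arrays w/Z.
  call dsygvdx_gpu(N, A_d, lda, B_d, lda, Z_d, lda, il, iu, w_d, work_d, lwork_d, &
                   work, lwork, iwork, liwork, Z, lda, w, istat)
  if (istat /= 0) print *, 'dsygvdx_gpu failed, istat = ', istat

end program sketch_dsygvdx_gpu

The test drivers reach the C timer above through the compiler's implicit underscore naming, declaring `wallclock` as an external `real(8)` function so that the reference resolves to the `wallclock_` symbol. An equivalent explicit interface, shown here as an alternative sketch rather than what the drivers actually do, would be:

interface
  ! Binds directly to the wallclock_ symbol defined in wallclock.c
  function wallclock() result(t) bind(C, name='wallclock_')
    use iso_c_binding, only: c_double
    real(c_double) :: t
  end function wallclock
end interface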