├── Example.f90 ├── LICENSE ├── Load.f90 ├── Matrix ├── 001_ColIdxs.txt ├── 001_Definition.txt ├── 001_RHS.txt ├── 001_RowIdxs.txt ├── 001_Values.txt ├── Matrix_Spy_Map.png └── Settings.ini ├── Misc.f90 ├── Parallelization.f90 ├── Quicksort.f90 ├── RCM.f90 ├── Readme.MD ├── Settings.ini ├── Solver.f90 ├── SparseMatrix.f90 └── Technical_report.pdf /Example.f90: -------------------------------------------------------------------------------- 1 | !MIT License 2 | ! 3 | !Copyright (c) [2017] [HydroGeophysics Group, Aarhus University, Denmark] 4 | ! 5 | !Permission is hereby granted, free of charge, to any person obtaining a copy 6 | !of this software and associated documentation files (the "Software"), to deal 7 | !in the Software without restriction, including without limitation the rights 8 | !to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | !copies of the Software, and to permit persons to whom the Software is 10 | !furnished to do so, subject to the following conditions: 11 | ! 12 | !The above copyright notice and this permission notice shall be included in all 13 | !copies or substantial portions of the Software. 14 | ! 15 | !THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | !IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | !FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | !AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | !LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | !OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | !SOFTWARE. 22 | 23 | !This is a Simple example that shows the sparse Iterative Solver from Aarhus Hydrogeogphysical Group in action. 24 | !This example loads a linear system from txt files in CSR format, it loads an ini-file with various settings, and then it solves the linear system 25 | program Solver_example 26 | use mParallel 27 | use mSolver 28 | use mSMat 29 | use mInput 30 | implicit none 31 | integer :: AffinityMode,NCPUs,UseNested,StartThread,NCPUsLow,NCPUInner,NCPUOuter,NCPUInnerLow,AffinityPattern 32 | integer :: n,i,nCols,nRows,UseRCM 33 | real*8, pointer :: rhs(:),x(:),rhs_backup(:),Residual(:) 34 | character*256 :: SettingFileName,SparseMatrixFiles,OutFile 35 | CHARACTER(len=256) :: arg 36 | type(TSparseMat) :: A 37 | type(Tsettings) :: Set 38 | !Solver vars 39 | integer :: MaxIter, NmodPar, NoRhs, Solverinformation, solvertype,UseNormalization 40 | real*8 :: FillingFactor,error,t1,t2 41 | type(TSolverSettings) :: SolverSettings 42 | 43 | !First we start by loading the additional arguments, which should have been provided with the call. 44 | CALL get_command_argument(1, arg) !Argument 1 is the name of the settingsfile 45 | IF (LEN_TRIM(arg) == 0) then 46 | print*,'ERROR not enough arguments given' 47 | end if 48 | read(arg,*) SettingFileName 49 | 50 | CALL get_command_argument(2, arg) !Argument 2 is the name of the SparseMatrix files. 51 | IF (LEN_TRIM(arg) == 0) then 52 | print*,'ERROR not enough arguments given' 53 | end if 54 | read(arg,*) SparseMatrixFiles 55 | 56 | CALL get_command_argument(3, arg) !Argument 3 is the name of the output file. 57 | IF (LEN_TRIM(arg) == 0) then 58 | print*,'ERROR not enough arguments given' 59 | end if 60 | read(arg,*) OutFile 61 | 62 | print*,'SettingFileName:',SettingFileName 63 | print*,'SparseMatrixFiles:',SparseMatrixFiles 64 | print*,'OutFile',OutFile 65 | call LoadSettings(SettingFileName,set) 66 | 67 | !Before we do anything else openMp related, we call initOpenMp() 68 | call InitOpenMP(set%AffinityMode,set%AffinityPattern,set%StartThread,set%UseNested,set%NCPUs,set%NCPUsLow,set%NCPUOuter) 69 | 70 | !Now lets create a sparse linear system to work on 71 | call SMatReadSparse(A,rhs, SparseMatrixFiles,Set%BlockSize) !Read the sparsematrix files into memory 72 | allocate(x(A%norows)) 73 | allocate(rhs_backup(A%noRows)) 74 | allocate(Residual(A%noRows)) 75 | rhs_backup=rhs 76 | UseRCM=0 77 | x=0 78 | t1=omp_get_wtime() !Start a timer 79 | call SetSolverSettings(SolverSettings,A,UseRCM,set%UseNormalization,set%SolverType,set%MaxIter,set%blocksize,set%FillingFactor) !Set the solver and create the preconditioner factorization 80 | call SingleRHS_wrapper(A,rhs,x,SolverSettings) !Propagate the actual linear system 81 | t2=omp_get_wtime() 82 | call SolverNormalizeMatrix(solversettings,A,.false.) !Renormalize matrix A (This is only needed because we want to validate that the solution did indeed solve the initial matrix). 83 | 84 | call SMatAmultV(A,x,rhs) !Show that the solution we found is indeed a solution to the linear system by multiplying the solution on the matrix 85 | Residual=rhs-rhs_backup 86 | Error=sqrt(ddot(A%NoRows,Residual,1,Residual,1)) 87 | print*,'Error',Error 88 | print*,'Time for solve',t2-t1 89 | open(78,file=trim(OutFile),position='append') 90 | write(78,*) t2-t1 91 | write(78,*) solversettings%output%status 92 | write(78,*) Error 93 | write(78,*) omp_get_max_threads() 94 | write(78,*) set%SolverType 95 | close(78) 96 | 97 | end program Solver_example 98 | 99 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Tue Boesen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Load.f90: -------------------------------------------------------------------------------- 1 | module mInput 2 | implicit none 3 | !Loads the settings used by the parallelization framework and the iterative solver 4 | type Tsettings 5 | integer :: AffinityMode 6 | integer :: AffinityPattern 7 | integer :: NCPUs 8 | integer :: UseNested 9 | integer :: StartThread 10 | integer :: NCPUsLow 11 | integer :: NCPUOuter 12 | integer :: NCPUInner 13 | integer :: NCPUInnerLow 14 | integer :: SolverType 15 | integer :: MaxIter 16 | integer :: BlockSize 17 | integer :: UseNormalization 18 | real*8 :: FillingFactor 19 | end type Tsettings 20 | contains 21 | 22 | subroutine LoadSettings(FName,Settings) 23 | character*256,intent(in) :: Fname 24 | type(Tsettings),intent(inout) :: Settings 25 | integer :: Nsettings,i,j,f,pos 26 | character*256 :: str,str2 27 | F=55 28 | Nsettings=13 29 | open(unit=F,file=FName,status='old') 30 | do i=1,Nsettings 31 | read(F,'(A)') str ! read line 32 | str = adjustl(str) ! remove left blanks 33 | pos = scan(str,'!') 34 | if(pos.gt.0) then 35 | str = str(1:pos-1) ! extract the parameter value 36 | end if 37 | select case(i) 38 | case(2) 39 | read(str,*) Settings%AffinityMode 40 | case(3) 41 | read(str,*) Settings%AffinityPattern 42 | case(4) 43 | read(str,*) Settings%UseNested 44 | case(5) 45 | read(str,*) Settings%StartThread 46 | case(6) 47 | read(str,*) Settings%NCPUs 48 | case(7) 49 | read(str,*) Settings%NCPUsLow 50 | case(8) 51 | read(str,*) Settings%NCPUOuter 52 | case(9) 53 | read(str,*) Settings%SolverType 54 | case(10) 55 | read(str,*) Settings%MaxIter 56 | case(11) 57 | read(str,*) Settings%BlockSize 58 | case(12) 59 | read(str,*) Settings%FillingFactor 60 | case(13) 61 | read(str,*) Settings%UseNormalization 62 | end select 63 | end do 64 | close(F) 65 | end subroutine LoadSettings 66 | 67 | end module mInput -------------------------------------------------------------------------------- /Matrix/001_Definition.txt: -------------------------------------------------------------------------------- 1 | 42332 !Number of rows 2 | 42332 !Number of columns 3 | 545828 !Number of elements 4 | 38 !Sounding size 5 | -------------------------------------------------------------------------------- /Matrix/Matrix_Spy_Map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tueboesen/Sparse-iterative-parallel-linear-solver/fe82bd08e2050716c2ed43996337d5e8f4723c44/Matrix/Matrix_Spy_Map.png -------------------------------------------------------------------------------- /Matrix/Settings.ini: -------------------------------------------------------------------------------- 1 | Example of an ini file - this is skipped during the load and can contain anything 2 | -99 !AffinityMode 3 | -99 !AffinityPattern 4 | 0 !UseNested 5 | -99 !StartThread 6 | 0 !NCPUs 7 | -99 !NCPUsLow 8 | -99 !NCPUOuter 9 | -1 !SolverType 10 | -1 !Max number of iterations to run during the propagation 11 | 1 !Blocksize, Each block will be a modulo of this size 12 | -1 !FillingFactor, A factor that scales the number of elements saved in each row during the IC factorization 13 | 1 !Determines whether to normalize the matrix before solving it. -------------------------------------------------------------------------------- /Misc.f90: -------------------------------------------------------------------------------- 1 | Module mMisc 2 | !Tue a module for small usefull routines that doesn't fit in anywhere else, but is used through different modules 3 | implicit none 4 | interface LoadOptionalParam 5 | module procedure LoadOptionalParam_int, LoadOptionalParam_real, LoadOptionalParam_real2 6 | end interface 7 | contains 8 | 9 | Function LoadOptionalParam_int(DefaultValue,iParam) 10 | !Tue Dec, 2014 11 | !This function is used to load optional parameters 12 | !IO 13 | !DefaultValue -> A default value which we load if the optional parameter does not exist. 14 | !iParam -> the optional parameter that we want to load 15 | implicit none 16 | integer, intent(in) :: DefaultValue 17 | integer, intent(in),optional :: iParam 18 | integer :: LoadOptionalParam_int 19 | if(present(iParam)) then 20 | LoadOptionalParam_int=iParam 21 | else 22 | LoadOptionalParam_int=DefaultValue 23 | end if 24 | end function LoadOptionalParam_int 25 | Function LoadOptionalParam_real(DefaultValue,iParam) 26 | !Tue Dec, 2014 27 | !This function is used to load optional parameters 28 | !IO 29 | !DefaultValue -> A default value which we load if the optional parameter does not exist. 30 | !iParam -> the optional parameter that we want to load 31 | implicit none 32 | real*8, intent(in) :: DefaultValue 33 | real*8, intent(in),optional :: iParam 34 | real*8 :: LoadOptionalParam_real 35 | if(present(iParam)) then 36 | LoadOptionalParam_real=iParam 37 | else 38 | LoadOptionalParam_real=DefaultValue 39 | end if 40 | end function LoadOptionalParam_real 41 | Function LoadOptionalParam_real2(DefaultValue,iParam) 42 | !Tue Dec, 2014 43 | !This function is used to load optional parameters 44 | !IO 45 | !DefaultValue -> A default value which we load if the optional parameter does not exist. 46 | !iParam -> the optional parameter that we want to load 47 | implicit none 48 | integer, intent(in) :: DefaultValue 49 | real*8, intent(in),optional :: iParam 50 | real*8 :: LoadOptionalParam_real2 51 | if(present(iParam)) then 52 | LoadOptionalParam_real2=iParam 53 | else 54 | LoadOptionalParam_real2=DefaultValue 55 | end if 56 | end function LoadOptionalParam_real2 57 | end module mMisc 58 | 59 | 60 | module mArrays 61 | interface ArraysCheckAndIncAlloc 62 | module procedure mArraysCheckAndIncAllocReal, mArraysCheckAndIncAllocInt 63 | end interface 64 | 65 | contains 66 | subroutine mArraysReallocateInts(iArray,iNewSize) 67 | implicit none 68 | integer,dimension(:),pointer,intent(inout) :: iArray 69 | integer,intent(in) :: iNewSize 70 | integer,dimension(:),pointer :: TempArray 71 | integer :: CurSize 72 | integer :: TransferSize 73 | integer :: Error 74 | character*256 :: S 75 | !Find the size to transfer to the resized array to be allocated. 76 | CurSize=Size(iArray) 77 | 78 | if(CurSize==0) then 79 | !goto 902 80 | !CS: Below: in case we don't want an error message 81 | allocate(iArray(iNewSize),Stat=Error) 82 | if (Error.ne.0)& 83 | goto 900 84 | return 85 | end if 86 | 87 | if (CurSize.ne.iNewsize) then 88 | TransferSize=CurSize 89 | if (TransferSize.gt.iNewSize)& 90 | TransferSize=iNewSize 91 | !Allocate temporary arrays. 92 | allocate(TempArray(TransferSize),Stat=Error) 93 | if (Error.ne.0)& 94 | goto 900 95 | !Transfer values to temporary arrays - first the real*8 array holding the values. 96 | TempArray(1:TransferSize)=iArray(1:TransferSize) 97 | !Deallocate old array 98 | if (associated(iArray))& 99 | deallocate(iArray) 100 | !Allocate new array of updated length and transfer the old values 101 | allocate(iArray(iNewSize),Stat=Error) 102 | if (Error.ne.0)& 103 | goto 900 104 | if (TransferSize+1.le.iNewSize) then 105 | iArray(TransferSize+1:iNewSize)=0 106 | end if 107 | iArray(1:TransferSize)=TempArray(1:TransferSize) 108 | deallocate(TempArray) 109 | end if 110 | return 111 | 900 continue 112 | S='Error re-allocating memory. Please increase the free physical memory or decrease the number of data.' 113 | go to 999 114 | 901 continue 115 | S='Error de-allocating memory. Please increase the free physical memory or decrease the number of data.' 116 | go to 999 117 | 902 continue 118 | S='Error in mArraysReallocateInts iArray array has a 0 size at the entry of the subroutine.' 119 | go to 999 120 | 999 print*, S 121 | end subroutine mArraysReallocateInts 122 | 123 | !****************************************************************************** 124 | subroutine mArraysReallocateReals(iArray,iNewSize) 125 | implicit none 126 | real*8,dimension(:),pointer,intent(inout) :: iArray 127 | integer,intent(in) :: iNewSize 128 | real*8,dimension(:),pointer :: TempArray 129 | integer :: CurSize 130 | integer :: TransferSize 131 | integer :: Error 132 | character*256 :: S 133 | !Find the size to transfer to the resized array to be allocated. 134 | CurSize=Size(iArray) 135 | 136 | if(CurSize==0) then 137 | !goto 902 138 | !CS: Below: in case we don't want an error message 139 | allocate(iArray(iNewSize),Stat=Error) 140 | iArray(:)=0.d0 141 | if (Error.ne.0)& 142 | goto 900 143 | return 144 | end if 145 | 146 | if (CurSize.ne.iNewsize) then 147 | TransferSize=CurSize 148 | if (TransferSize.gt.iNewSize)& 149 | TransferSize=iNewSize 150 | !Allocate temporary arrays. 151 | allocate(TempArray(TransferSize),Stat=Error) 152 | if (Error.ne.0)& 153 | goto 900 154 | !Transfer values to temporary arrays - first the real*8 array holding the values. 155 | TempArray(1:TransferSize)=iArray(1:TransferSize) 156 | !Deallocate old array 157 | if (associated(iArray)) then 158 | deallocate(iArray,Stat=Error) 159 | if (Error.ne.0)& 160 | goto 901 161 | end if 162 | !Allocate new array of updated length and transfer the old values 163 | allocate(iArray(iNewSize),Stat=Error) 164 | iArray(:)=0.d0 165 | if (Error.ne.0)& 166 | goto 900 167 | iArray(1:TransferSize)=TempArray(1:TransferSize) 168 | deallocate(TempArray,Stat=Error) 169 | if (Error.ne.0)& 170 | goto 901 171 | end if 172 | return 173 | 900 continue 174 | S='Error re-allocating memory. Please increase the free physical memory or decrease the number of data.' 175 | go to 999 176 | 901 continue 177 | S='Error de-allocating memory. Please increase the free physical memory or decrease the number of data.' 178 | go to 999 179 | 902 continue 180 | S='Error in mArraysReallocateReals iArray array has a 0 size at the entry of the subroutine.' 181 | go to 999 182 | 999 print*, S 183 | end subroutine mArraysReallocateReals 184 | 185 | subroutine mArraysCheckAndIncAllocReal(Nnew,ioAR) 186 | ! This routine increases the allocation of ioAR if 187 | ! Nnew is larger than the allocation of ioAR. 188 | ! The Nnew allocation will be 2*Nnew to avoid future allocations 189 | ! No data are transferred ! 190 | integer,intent(in) :: Nnew 191 | real*8,allocatable,intent(inout) :: ioAR(:) 192 | integer :: k 193 | if (allocated(ioAR)) then 194 | K=ubound(ioAR,dim=1) 195 | if (k.lt.Nnew) then 196 | deallocate(ioAR) 197 | allocate(ioAR(2*Nnew+1)) 198 | end if 199 | else 200 | allocate(ioAR(2*Nnew)) 201 | end if 202 | end subroutine mArraysCheckAndIncAllocReal 203 | 204 | !dir$ attributes offload : mic :: marrayscheckAnsIncAllocInt 205 | subroutine mArraysCheckAndIncAllocInt(Nnew,ioAR) 206 | ! This routine increases the allocation of ioAR if 207 | ! Nnew is larger than the allocation of ioAR. 208 | ! The Nnew allocation will be 2*Nnew to avoid future allocations 209 | ! No data are transferred ! 210 | integer,intent(in) :: Nnew 211 | integer,allocatable,intent(inout) :: ioAR(:) 212 | integer :: k 213 | if (allocated(ioAR)) then 214 | K=ubound(ioAR,dim=1) 215 | if (k.lt.Nnew) then 216 | deallocate(ioAR) 217 | allocate(ioAR(2*Nnew+1)) 218 | end if 219 | else 220 | allocate(ioAR(2*Nnew)) 221 | end if 222 | end subroutine mArraysCheckAndIncAllocInt 223 | end module mArrays -------------------------------------------------------------------------------- /Parallelization.f90: -------------------------------------------------------------------------------- 1 | Module mParallel 2 | !Tue Boesen 2017 3 | !This module is meant to work as a general parallelization framework, and is meant to replace all other parallelization existing in a code. 4 | !The general structure of the code is as follows: 5 | !On initialization the subroutine InitOpenMP() should be called, it is imperative that this routine is called before any OpenMP routines are called - even omp_get_wtime() 6 | !When creating parallel regions anywhere in the code, these regions should in general always be created with an if call to StartOpenMP 7 | use omp_lib 8 | use IFPORT, only: SETENVQQ 9 | implicit none 10 | 11 | !Module variables - These variables changes dynamically during a run 12 | logical,private :: RunningSingleParallelRegion !Used to lock against further parallelization while running a nonnested region. 13 | integer,private :: OldThreadNumber !Saves the previous threadnumber so we can return to this number afterwards if desired. 14 | !Initial settings - These variables get set in InitOpenMP, and then remain static after that 15 | integer,private :: UseNested !Use nested parallelization? 16 | integer,private :: NCPUs !Total number of threads 17 | integer,private :: NCPUsLow !Total number of threads in a memory bandwidth intense region 18 | integer,private :: NCPUOuter !Number of CPUs in the outer region of a nested parallelization 19 | integer,private :: NCPUInner !Number of CPUs in the inner region of a nested parallelization 20 | integer,private :: NCPUInnerLow !Number of CPUs in the inner region of a nested parallelization with memory bandwith limits 21 | integer,private :: AffinityMode !Dedicated or shared server? 22 | integer,private :: AffinityPattern !Scattered or compact affinity 23 | integer,private :: StartThread !ThreadID of first affinity binding 24 | contains 25 | 26 | subroutine InitOpenMP(iAffinityMode,iAffinityPattern,iStartThread,iUseNested, iNCPUs,iNCPUsLow,iNCPUOuter) 27 | !This is the initialization routine for OpenMP. 28 | !It NEEDS to be run before any OpenMP routines are called - even omp_get_wtime(). 29 | !This routine setup the entire parallel framework. 30 | !Based on the input variables it determines the number of threads to use, the affinity which they should be used with, the parallelization structure 31 | !On output this routine will have set all the following private module variables: NCPUs, NCPUInner, NCPUOuter, NCPUInnerLow, NCPUsLow, AffinityMode, UseNested, StartThread. 32 | !All these variables will be used by the subroutine StartOpenMP, which should be called everytime a parallel region is potentially about to be created. 33 | 34 | !IO 35 | !Note any variable can be set to -99 for automatic determination. 36 | !AffinityMode -> Specifies different modes thread binding can occur. 37 | ! 0=shared mode. 38 | ! Threads bind to a socket, but are free to migrate between the cores of a socket. 39 | ! This mode is usefull when running on a server with multiple users, however, 40 | ! performance can also be lower than the dedicated mode. Shared mode does not use nested parallelization in any way. 41 | ! Note if all CPU's are used, AffinityMode will automatically be changed from shared to dedicated 42 | ! 1=Dedicated mode. 43 | ! Threads bind to individual cores, and hence are locked in place. This gives better performance, but only if the cores are not used for anything else. 44 | ! 2=manual mode. No affinity settings are made inside the program. It is left to the user to set proper environment variables, before starting 45 | ! the program. 46 | !iAffinityPattern -> Sets the affinity pattern for the threadbinding. This is mostly relevant on NUMA systems, or when using nested parallelization. 47 | ! 0=Scatter threads. 48 | ! Threads are distributed as far apart as possible. 49 | ! 1=Compact threads. 50 | ! Threads are clustered together. 51 | !iStartThread -> The threadID the first thread is bound to. 52 | !iUseNested -> Determines whether to enable nested parallelization or not (0=no nested parallelization allowed,1=Nested parallelization enabled) 53 | !iNCPUs -> Total number of CPUs to use (0=all, -1=all-1, -2=all-2, ect.) 54 | !iNCPUsLow -> Total number of CPUs to use in a bandwidth intensive regions (0=all, -1=all-1, -2=all-2, ect.) 55 | !The following are only relevant if nested parallelization is used. 56 | !iNCPUOuter -> Number of CPUs to use in outer parallel region. (Should be set between 2..and Number of numanodes) 57 | use ifwin 58 | implicit none 59 | integer,intent(in) :: iAffinityMode,iAffinityPattern,iStartThread,iUseNested,iNCPUs,iNCPUsLow,iNCPUOuter 60 | !Local vars 61 | logical(4) :: Success 62 | integer :: i,j 63 | integer :: numaNodeCount,processorPackageCount,processorCoreCount,logicalProcessorCount 64 | integer,dimension(3) :: processorCacheCount 65 | real*8 :: rand 66 | logical :: Debug = .false. !Only used for debugging, to see how the threads are bound and what was detected. 67 | character(1024) :: KmpAff 68 | !Vars for manual binding 69 | integer(HANDLE) Process 70 | integer(DWORD_PTR) AffinityMask 71 | integer Core 72 | integer(BOOL) Retval 73 | integer(DWORD) iError 74 | 75 | call GetMachineArch(numaNodeCount,processorPackageCount,processorCoreCount, logicalProcessorCount,processorCacheCount ) !Get machine architecture 76 | if(Debug) then 77 | print*, "GetLogicalProcessorInformation results:" 78 | print*, " Number of NUMA nodes: ", numaNodeCount 79 | print*, " Number of physical processor packages: ", processorPackageCount 80 | print*, " Number of processor cores: ", processorCoreCount 81 | print*, " Number of logical processors: ", logicalProcessorCount 82 | print*, " Number of processor L1/L2/L3 caches: ",processorCacheCount 83 | end if 84 | 85 | !We set StartThread 86 | StartThread = iStartThread 87 | if (StartThread.le.-1) then 88 | call Random_seed() 89 | call Random_Number(rand) !Random number between 0-1 90 | StartThread=floor(rand*logicalProcessorCount) !scaled to 0-(logicalprocessorcount-1) 91 | end if 92 | StartThread=max(0,StartThread) !Sanity check 93 | StartThread=min(logicalProcessorCount-1,StartThread) !Sanity check 94 | 95 | !We set NCPUs 96 | NCPUs=iNCPUs 97 | if (NCPUs.eq.-99) then 98 | NCPUs=processorCoreCount/2 !We use approximately half the cores if nothing else is specified. 99 | end if 100 | if (NCpus.le.0) then 101 | NCPUs = max(processorCoreCount+NCpus,1) 102 | end if 103 | NCPUs=min(logicalProcessorCount,NCPUs) !Sanity check 104 | NCPUs=max(1,NCPUs) !Sanity check 105 | 106 | !We set NCPUsLow 107 | NCPUsLow=iNCPUsLow 108 | if (NCPUsLow.eq.-99) then 109 | NCPUsLow=NCPUs*2/3 !We use approximately 2/3 of the threads specified for a bandwith intensive region, if nothing else is specified. 110 | end if 111 | if (NCpusLow.le.0) then 112 | NCPUsLow = max(processorCoreCount+NCpusLow,1) 113 | end if 114 | NCPUsLow=min(NCPUs,NCPUsLow) !Sanity check 115 | NCPUsLow=max(1,NCPUsLow) !Sanity check 116 | 117 | !We set AffinityMode 118 | AffinityMode=iAffinityMode 119 | if (AffinityMode.eq.-99) then 120 | AffinityMode=0 121 | end if 122 | if(NCpus.ge.logicalProcessorCount.AND.AffinityMode.EQ.0) then !We use all cores, no reason to run shared mode in that case. 123 | AffinityMode=1 124 | end if 125 | if (AffinityMode.lt.0.OR.AffinityMode.gt.2) then !Sanity check 126 | print*, 'Warning! Affinity mode not set correctly. Setting shared AffinityMode' 127 | AffinityMode=0 128 | end if 129 | 130 | !We set UseNested 131 | UseNested=iUseNested 132 | if ((AffinityMode.eq.0).OR.(NCPUs.lt.2).OR.(UseNested.ne.1)) then 133 | UseNested=0 134 | end if 135 | 136 | !We set AffinityPattern 137 | AffinityPattern=iAffinityPattern 138 | if((AffinityPattern.ne.0).OR.(AffinityPattern.ne.1)) then 139 | if(UseNested) then 140 | AffinityPattern=0 !Scatter 141 | else 142 | AffinityPattern=1 !Compact 143 | end if 144 | end if 145 | 146 | if(UseNested.eq.1) then 147 | !We set NCPUOuter 148 | if (NCPUOuter.eq.-99) then 149 | NCPUOuter=numaNodeCount 150 | end if 151 | NCPUOuter=min(max(NCPUOuter,2),numaNodeCount) !Sanity check 152 | NCPUOuter=min(NCPUOuter,NCPUs) !Sanity check 153 | 154 | !We set NCPUInner 155 | NCPUInner=NCPUs/NCPUOuter 156 | 157 | !We set NCPUInnerLow 158 | NCPUInnerLow=NCPUsLow/NCPUOuter 159 | 160 | if(NCPUs.ne.NCPUOuter*NCPUInner) then 161 | !The original CPU numbers do not match a multiplum of the detected NumaNodes 162 | NCPUs=NCPUOuter*NCPUInner 163 | if(iNCPUs.ne.-99) then 164 | print*,'NCPUs was originally chosen as',iNCPUs 165 | print*,'In order to preserve even load across the NUMAnodes NCPUs is changed to:', NCPUs 166 | end if 167 | end if 168 | if(NCPUsLow.ne.NCPUOuter*NCPUInnerLow) then 169 | !The original Low CPU numbers do not match a multiplum of the detected NumaNodes 170 | NCPUsLow=NCPUOuter*NCPUInnerLow 171 | if(iNCPUsLow.ne.-99) then 172 | print*,'NCPUsLow was originally chosen as',iNCPUsLow 173 | print*,'In order to preserve even load across the NUMAnodes NCPUs is changed to:', NCPUsLow 174 | end if 175 | end if 176 | else 177 | NCPUOuter = NCPUs 178 | NCPUInner = 1 179 | end if 180 | 181 | !All parameters determined and set, now we set all the actual environment variables 182 | if(NCPUs.ge.1.AND.AffinityMode.ne.2) then 183 | call GetKmpAffinityStr(KmpAff,NCPUs,numaNodeCount,processorCoreCount, logicalProcessorCount,AffinityMode,AffinityPattern,StartThread) 184 | Success = SETENVQQ(KmpAff) 185 | else 186 | Process = GetCurrentProcess() 187 | Core = StartThread 188 | AffinityMask = ishft(1_DWORD_PTR, Core) 189 | Retval = SetProcessAffinityMask(Process,AffinityMask) 190 | if(Retval == 0) then 191 | iError = GetLastError() 192 | write(*,*) 'Failed to set the affinity properly',iError 193 | !stop 194 | end if 195 | end if 196 | if(Debug) then 197 | print*,'omp_bind_string:',KmpAff 198 | print*,'NCPU',NCPUs 199 | print*,'NCPUsLow',NCPUsLow 200 | print*,'NCPUouter',NCPUouter 201 | print*,'NCPUInner',NCPUinner 202 | print*,'NCPUInnerLow',NCPUinnerLow 203 | print*,'UseNested',UseNested 204 | print*,'AffinityMode',AffinityMode 205 | print*,'AffinityPattern',AffinityPattern 206 | print*,'StartThread',StartThread 207 | end if 208 | call KMP_SET_STACKSIZE_S(8000000) !In our code this is essential otherwise we get very subtle stack memory corruption in some of our routines 209 | if (UseNested) then 210 | call omp_set_nested(1) 211 | call omp_set_max_active_levels(2) 212 | else 213 | call omp_set_nested(0) 214 | end if 215 | call omp_set_num_threads(NCPUs) 216 | !OpenMP settings are applied when the first parrallel loop is found. We do a dummy loop here to get it done now.. 217 | !$OMP PARALLEL DEFAULT(PRIVATE) SHARED(NCpus) REDUCTION(+:J) 218 | J=0 219 | !$OMP DO 220 | do i=1,NCpus 221 | J=J+1 222 | end do 223 | !$OMP END DO 224 | !$OMP END PARALLEL 225 | if(debug) then 226 | print*,'InitOpenMP complete' 227 | end if 228 | return 229 | end subroutine InitOpenMP 230 | 231 | subroutine GetMachineArch(numaNodeCount,processorPackageCount,processorCoreCount, logicalProcessorCount,processorCacheCount ) 232 | !This routine gets all the current hardware system information. 233 | !IO 234 | !numaNodeCount -> Number of NUMA nodes 235 | !processorPackageCount -> Number of physical processor packages 236 | !processorCoreCount -> Number of processor cores 237 | !logicalProcessorCount -> Number of logical processors 238 | !processorCacheCount -> Number of processor L1/L2/L3 caches 239 | use, intrinsic :: ISO_C_BINDING 240 | use kernel32 241 | implicit none 242 | integer,intent(out) :: numaNodeCount,processorPackageCount,processorCoreCount,logicalProcessorCount 243 | integer,dimension(3),intent(out) :: processorCacheCount 244 | !Local vars 245 | procedure(GetLogicalProcessorInformation), pointer :: glpi 246 | type(T_SYSTEM_LOGICAL_PROCESSOR_INFORMATION), allocatable, dimension(:) :: buffer 247 | integer(DWORD) :: returnLength = 0 248 | integer(DWORD) :: ret 249 | integer :: nlpi, lpi_element_length, i 250 | processorCacheCount=0 251 | numaNodeCount = 0 252 | processorCoreCount = 0 253 | logicalProcessorCount = 0 254 | processorPackageCount = 0 255 | call c_f_procpointer( & !Get kernel information 256 | transfer( & 257 | GetProcAddress( & 258 | GetModuleHandle("kernel32"//C_NULL_CHAR), & 259 | "GetLogicalProcessorInformation"//C_NULL_CHAR & 260 | ), & 261 | C_NULL_FUNPTR & 262 | ), & 263 | glpi) 264 | 265 | if (.not. associated(glpi)) then 266 | print*, "GetLogicalProcessorInformation not supported" 267 | error stop 268 | end if 269 | !We don't know in advance the size of the buffer we need. We'll pick a number, allocate it, 270 | !and see if that's sufficient. If not, we'll use the returned size information and reallocate 271 | !the buffer to the required size. 272 | allocate (buffer(20)) 273 | lpi_element_length = C_SIZEOF(buffer(1)) 274 | returnLength = C_SIZEOF(buffer) 275 | ret = glpi(buffer, returnLength) 276 | if (ret == FALSE) then ! Failed 277 | if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) then 278 | deallocate (buffer) 279 | allocate (buffer(returnLength/lpi_element_length)) 280 | ret = glpi(buffer, returnLength) 281 | if (ret == FALSE) then 282 | print*, "GetLogicalProcessorInformation call failed with error code ", GetLastError() 283 | error stop 284 | end if 285 | else 286 | print*, "GetLogicalProcessorInformation call failed with error code ", GetLastError() 287 | error stop 288 | end if 289 | end if 290 | !Now we can iterate through the elements of buffer and see what we can see 291 | do i=1, returnLength / lpi_element_length ! Number of elements in buffer 292 | select case (buffer(i)%Relationship) 293 | case(RelationNumaNode) 294 | numaNodeCount = numaNodeCount + 1 295 | case(RelationProcessorCore) 296 | processorCoreCount = processorCoreCount + 1 297 | !A Hyperthreaded core supplies more than one logical processor 298 | logicalProcessorCount = logicalProcessorCount + popcnt(buffer(i)%processorMask) 299 | case(RelationCache) 300 | !One cache descriptor for each cache 301 | if (buffer(i)%Cache%Level > 0 .and. buffer(i)%Cache%Level <= 3) then 302 | processorCacheCount(buffer(i)%Cache%Level) = processorCacheCount(buffer(i)%Cache%Level) + 1 303 | else 304 | print*, "Invalid processor cache level ", buffer(i)%Cache%Level 305 | end if 306 | case(RelationProcessorPackage) 307 | !Logical processors share a physical package (socket) 308 | processorPackageCount = processorPackageCount + 1 309 | case default 310 | print*, "Unrecognized relationship code ", buffer(i)%Relationship 311 | end select 312 | end do 313 | if(allocated(buffer)) deallocate(buffer) 314 | end subroutine GetMachineArch 315 | 316 | 317 | subroutine GetKmpAffinityStr(Str,NCPUs,numaNodeCount,processorCoreCount, logicalProcessorCount,AffinityMode,IsClose,StartThread) 318 | !This routine builds a custom AFFINITY string, that binds each thread. 319 | !The actual binding depends on whether AffinityMode is 0 or 1. 320 | ! AffinityMode=0 321 | ! The binding will be loose and each thread will only be bound to a processor package. 322 | ! Furthermore all threads will be distributed among the NUMA nodes round-robin style (ie. like dealing cards). (IsClose is not respected in this case) 323 | ! AffinityMode=1 324 | ! Each thread will be bound to a logical processor. 325 | !IO 326 | !Str -> On output the string will contain a KMP_AFFINITY enviroment variable string. 327 | !NCPUs -> Number of threads we wish to use 328 | !numaNodeCount -> Number of numa nodes in the system 329 | !LogicalProcessorCount -> Number of logical processors in the system (ie. including hyperthreads). 330 | !processorCoreCount -> Number of physical processors in the system (ie. excluding hyperthreads). 331 | !AffinityMode -> Affinitymode to generate string for. 0->shared server, 1->dedicated server 332 | !IsClose -> If 1, then threads will be distributed on the same NUMA node first until full, before moving on to the next, if not then threads will be spread across all NUMA nodes 333 | !StartThread -> which threadID to start the affinity binding with 334 | implicit none 335 | character*(*),intent(inout) :: Str 336 | integer,intent(in) :: numaNodeCount,logicalProcessorCount,AffinityMode,processorCoreCount,NCPUs,StartThread,IsClose 337 | !Local vars 338 | integer :: FirstIdx,LastIdx,i,j,ij,ProcPrNode,thread,bound,inc,CoresPrNuma,NumaID,NInc,reminder,k,numa 339 | character(len=4) :: TmpStr 340 | integer,allocatable :: usedthreads(:) 341 | logical :: ThreadAccepted, Matchfound 342 | character*256 :: String 343 | if (affinityMode.eq.0) then !Shared server 344 | str="KMP_AFFINITY=verbose,granularity=fine,proclist=[" !48 characters in string 345 | FirstIdx=49 346 | LastIdx=49 347 | ProcPrNode=LogicalProcessorCount/numaNodeCount 348 | numa=StartThread/ProcPrNode+1 349 | do i=1,numaNodeCount 350 | if (i.ne.1) then 351 | Str(FirstIdx:FirstIdx)=',' 352 | FirstIdx=FirstIdx+1 353 | end if 354 | Str(FirstIdx:FirstIdx)='{' 355 | FirstIdx=FirstIdx+1 356 | do j=1,ProcPrNode 357 | ij=(numa-1)*ProcPrNode+j-1 358 | if (j.ne.1) then 359 | Str(FirstIdx:FirstIdx)=',' 360 | FirstIdx=FirstIdx+1 361 | end if 362 | LastIdx=FirstIdx 363 | if (ij.gt.9) then 364 | LastIdx=FirstIdx+1 365 | end if 366 | if (ij.gt.99) then 367 | LastIdx=FirstIdx+2 368 | end if 369 | write (TmpStr, '(I3)') ij 370 | str(FirstIdx:LastIdx)=TmpStr(3-(LastIdx-FirstIdx):3) 371 | FirstIdx=LastIdx+1 372 | end do 373 | Str(FirstIdx:FirstIdx)='}' 374 | FirstIdx=FirstIdx+1 375 | numa=numa+1 376 | if (numa.gt.numaNodeCount) then 377 | numa=1 378 | end if 379 | end do 380 | Str(FirstIdx:FirstIdx+10)="],explicit" 381 | print *,Str 382 | else if (affinityMode.eq.1) then !Dedicated server 383 | inc=LogicalProcessorCount/processorCoreCount !Takes care of hyperthreading 384 | CoresPrNuma=LogicalProcessorCount/NumaNodeCount 385 | str="OMP_PLACES=" !12 characters in string 386 | FirstIdx=12 387 | LastIdx=12 388 | bound=min(NCPUs,LogicalProcessorCount) 389 | thread=StartThread 390 | allocate(usedthreads(bound)) 391 | usedthreads=-1 392 | do i=1,bound 393 | if(i.eq.1) then 394 | str(FirstIdx:LastIdx)="{" 395 | else 396 | LastIdx=FirstIdx+1 397 | str(FirstIdx:LastIdx)=",{" 398 | end if 399 | FirstIdx=LastIdx+1 400 | LastIdx=FirstIdx 401 | do k=1,bound !We keep looping until we find a thread that is acceptable to all our criterias, or until we run out of threads 402 | threadaccepted=.true. 403 | matchfound=.false. !First we check whether we already have bound this particular thread 404 | do j=1,bound 405 | if(usedthreads(j).eq.thread) then 406 | matchfound=.true. 407 | threadaccepted=.false. !we take another round 408 | exit 409 | end if 410 | end do 411 | if (matchfound) then !If we have bound this before we jump to the next thread 412 | thread=thread+inc 413 | end if 414 | if(thread.ge.LogicalProcessorCount) then !If we are trying to bound to a thread higher than the number of threads available, we loop around 415 | thread=mod(thread,LogicalProcessorCount) 416 | threadaccepted=.false. !we take another round 417 | end if 418 | if(threadaccepted) then 419 | exit !Finally we found a thread that fits our criteria, so we can exit 420 | end if 421 | end do 422 | if (.not.threadaccepted) then 423 | !exit !If you really insist on binding more threads than cpu's just enable this Exit. 424 | goto 11 !Safety check, we could not find a thread to bind to. This can for instance happen if you try to bind more threads than you have on your system, (hyperthreads do not count) 425 | else 426 | usedthreads(i)=thread 427 | end if 428 | !Finally we bind the thread 429 | if(thread.gt.9) then 430 | LastIdx=FirstIdx+1 431 | end if 432 | if (thread.gt.99) then 433 | LastIdx=FirstIdx+2 434 | end if 435 | write (TmpStr, '(I3)') thread 436 | str(FirstIdx:LastIdx)=TmpStr(3-(LastIdx-FirstIdx):3) 437 | FirstIdx=LastIdx+1 438 | LastIdx=FirstIdx 439 | str(FirstIdx:LastIdx)="}" 440 | FirstIdx=LastIdx+1 441 | !Next up we increase the thread number, since this is distributed, we increase it by jumping up one numa node. 442 | if(IsClose) then 443 | thread=thread+inc !we increase to next non-hyperthread 444 | else 445 | thread=thread+CoresPrNuma !we increase to next numa node 446 | end if 447 | end do 448 | end if 449 | return 450 | 11 print*,'No suitable thread to bind the process to was found, make sure your hardware settings are correct. If they are report this issue.' 451 | end subroutine GetKmpAffinityStr 452 | 453 | function StartOpenMP(iType,iSaveOldThreadNumber) result(UseOpenMP) 454 | use mMisc 455 | !This function will return a logical parameter which tells whether further parallelization should occur. 456 | !Furthermore it will set the number of threads depending on: iType, the level of parallelization set in InitOpenMP and the level of parallelization already enabled. 457 | ! 458 | !IO 459 | !iType -> !Determine what kind of parallel region we wish to make 460 | !iType = 1 : (Default) Adaptive parallelization. 461 | !If no previous parallelization is detected it set the number of threads to NCPUs and locks against further parallelization. 462 | !If 1 layer of previous parallelization is detected it sets the number of threads to NCPUInner. 463 | !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false. 464 | !iType = 2 : Adaptive parallelization for memorybandwidth intensive parallelization. Locks against further parallelization. 465 | !If no previous parallelization is detected it set the number of threads to NCPUsLow and locks against further parallelization. 466 | !If 1 layer of previous parallelization is detected it sets the number of threads to NCPUInnerLow. 467 | !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false. 468 | !iType = 3 : NUMA spreading. Only Locks against further parallelization if previous parallelization is detected. 469 | !If no previous parallelization is detected it set the number of threads to NCPUOuter. 470 | !If 1 layer of previous parallelization is detected it set the number of threads to NCPUInner. 471 | !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false. 472 | !iType = 4 : NUMA spreading for memorybandwidth intensive parallelization. Only Locks against further parallelization if previous parallelization is detected. 473 | !If no previous parallelization is detected it spawns NCPUOuter threads in the parallelization spread across the different NUMA-nodes. 474 | !If 1 layer of previous parallelization is detected it set the number of threads to NCPUInnerLow. 475 | !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false. 476 | !iType = 5 : XeonPhi Not implemented 477 | !Note only Use iType=3/4 if you are sure further parallelization will occur before the CPU intensive tasks 478 | !iSaveOldThreadNumber -> if set to 1 it saves the old threadnumber such that it can be restored after the parallel region is done, if desired. In order to recall it call, the subroutine ReturnOldThreadNumber() 479 | integer,intent(in) :: iType 480 | integer,intent(in),optional :: iSaveOldThreadNumber 481 | !Local vars 482 | logical :: UseOpenMP 483 | integer :: i,saveOldThreadNumber,CPUs 484 | !Do we wish to save the old threadnumber before changing it? 485 | SaveOldThreadNumber=LoadOptionalParam(0,iSaveOldThreadNumber) 486 | if(SaveOldThreadNumber.eq.1) then 487 | OldThreadNumber = omp_get_max_threads() 488 | end if 489 | i=omp_get_active_level() !Get previous level of parallelization 490 | if (i.eq.0) then !No previous parallelization detected 491 | RunningSingleParallelRegion = .false. !We unlock any potential previous parallelization locks 492 | UseOpenMP=.true. 493 | SELECT CASE (iType) 494 | CASE(1) 495 | RunningSingleParallelRegion = .true. 496 | CPUs=NCPUs 497 | call omp_set_num_threads(NCPUs) 498 | CASE(2) 499 | RunningSingleParallelRegion = .true. 500 | call omp_set_num_threads(NCPUsLow) 501 | CASE(3,4) 502 | call omp_set_num_threads(NCPUOuter) 503 | RunningSingleParallelRegion=.false. 504 | END SELECT 505 | elseif(i.eq.1.AND.UseNested.AND.(.not.RunningSingleParallelRegion)) then !Outer parallelization detected, but room for inner parallelization 506 | UseOpenMP=.true. 507 | SELECT CASE (iType) 508 | CASE(1,3) 509 | call omp_set_num_threads(NCPUInner) 510 | CASE(2,4) 511 | call omp_set_num_threads(NCPUInnerLow) 512 | END SELECT 513 | else !Full parallelization already detected 514 | UseOpenMP=.false. 515 | call omp_set_num_threads(1) 516 | end if 517 | end function StartOpenMP 518 | 519 | subroutine ReturnOldThreadNum() 520 | !This returns the thread number to the value saved in the module variable OldThreadNumber 521 | call omp_set_num_threads(OldThreadNumber) 522 | end subroutine ReturnOldThreadNum 523 | 524 | end module mParallel 525 | 526 | 527 | -------------------------------------------------------------------------------- /Quicksort.f90: -------------------------------------------------------------------------------- 1 | ! Recursive Fortran 95 quicksort routine 2 | ! sorts real numbers into ascending numerical order 3 | ! Author: Juli Rew, SCD Consulting (juliana@ucar.edu), 9/03 4 | ! Based on algorithm from Cormen et al., Introduction to Algorithms, 5 | ! 1997 printing 6 | 7 | ! Made F conformant by Walt Brainerd 8 | ! 9 | !Updated 2016 Tue 10 | !The RealQsort now uses real*8 instead of real. 11 | ! 12 | ! Dec 2016 KRA, added integer version 13 | ! 14 | 15 | module mQuickSort 16 | 17 | implicit none 18 | public :: RealQsortC 19 | private :: RealPartition 20 | public :: IntQsortC 21 | private :: IntPartition 22 | 23 | interface IntPartition 24 | module procedure IntPartition_real,IntPartition_complex,IntPartition_int 25 | end interface 26 | interface IntQsortC 27 | module procedure IntQsortC_complex, IntQsortC_real, IntQsortC_int 28 | end interface 29 | 30 | contains 31 | recursive subroutine RealQsortC(A) 32 | real*8, intent(in out), dimension(:) :: A 33 | integer :: iq 34 | 35 | if(size(A) > 1) then 36 | call RealPartition(A, iq) 37 | call RealQsortC(A(:iq-1)) 38 | call RealQsortC(A(iq:)) 39 | endif 40 | end subroutine RealQsortC 41 | 42 | subroutine RealPartition(A, marker) 43 | real*8, intent(in out), dimension(:) :: A 44 | integer, intent(out) :: marker 45 | integer :: i, j 46 | real*8 :: temp 47 | real*8 :: x ! pivot point 48 | x = A(1) 49 | i= 0 50 | j= size(A) + 1 51 | 52 | do 53 | j = j-1 54 | do 55 | if (A(j) <= x) exit 56 | j = j-1 57 | end do 58 | i = i+1 59 | do 60 | if (A(i) >= x) exit 61 | i = i+1 62 | end do 63 | if (i < j) then 64 | ! exchange A(i) and A(j) 65 | temp = A(i) 66 | A(i) = A(j) 67 | A(j) = temp 68 | elseif (i == j) then 69 | marker = i+1 70 | return 71 | else 72 | marker = i 73 | return 74 | endif 75 | end do 76 | 77 | end subroutine RealPartition 78 | 79 | recursive subroutine IntQsortC_int(SortInts,Secondarys) 80 | integer, intent(inout), dimension(:) :: SortInts 81 | integer, intent(inout), dimension(:) :: Secondarys 82 | integer :: iq 83 | 84 | if(size(SortInts) > 1) then 85 | call IntPartition(SortInts,Secondarys, iq) 86 | call IntQsortC(SortInts(:iq-1),Secondarys(:iq-1)) 87 | call IntQsortC(SortInts(iq:),Secondarys(iq:)) 88 | endif 89 | end subroutine IntQsortC_int 90 | recursive subroutine IntQsortC_real(SortInts,Secondarys) 91 | integer, intent(inout), dimension(:) :: SortInts 92 | real*8, intent(inout), dimension(:) :: Secondarys 93 | integer :: iq 94 | 95 | if(size(SortInts) > 1) then 96 | call IntPartition(SortInts,Secondarys, iq) 97 | call IntQsortC(SortInts(:iq-1),Secondarys(:iq-1)) 98 | call IntQsortC(SortInts(iq:),Secondarys(iq:)) 99 | endif 100 | end subroutine IntQsortC_real 101 | 102 | 103 | recursive subroutine IntQsortC_complex(SortInts,Secondarys) 104 | integer, intent(inout), dimension(:) :: SortInts 105 | complex*16, intent(inout), dimension(:) :: Secondarys 106 | integer :: iq 107 | 108 | if(size(SortInts) > 1) then 109 | call IntPartition(SortInts,Secondarys, iq) 110 | call IntQsortC(SortInts(:iq-1),Secondarys(:iq-1)) 111 | call IntQsortC(SortInts(iq:),Secondarys(iq:)) 112 | endif 113 | end subroutine IntQsortC_complex 114 | 115 | subroutine IntPartition_int(SortInts, Secondarys,marker) 116 | integer, intent(inout), dimension(:) :: SortInts 117 | integer, intent(inout), dimension(:) :: Secondarys 118 | integer, intent(out) :: marker 119 | integer :: i, j 120 | integer :: temp 121 | real*8 :: rtemp 122 | integer :: x ! pivot point 123 | x = SortInts(1) 124 | i= 0 125 | j= size(SortInts) + 1 126 | do 127 | j = j-1 128 | do 129 | if (SortInts(j) <= x) exit 130 | j = j-1 131 | end do 132 | i = i+1 133 | do 134 | if (SortInts(i) >= x) exit 135 | i = i+1 136 | end do 137 | if (i < j) then 138 | ! exchange A(i) and A(j) 139 | temp = SortInts(i) 140 | SortInts(i) = SortInts(j) 141 | SortInts(j) = temp 142 | rtemp=Secondarys(i) 143 | Secondarys(i)=Secondarys(j) 144 | Secondarys(j)=rtemp 145 | elseif (i == j) then 146 | marker = i+1 147 | return 148 | else 149 | marker = i 150 | return 151 | endif 152 | end do 153 | 154 | end subroutine IntPartition_int 155 | subroutine IntPartition_real(SortInts, Secondarys,marker) 156 | integer, intent(inout), dimension(:) :: SortInts 157 | real*8, intent(inout), dimension(:) :: Secondarys 158 | integer, intent(out) :: marker 159 | integer :: i, j 160 | integer :: temp 161 | real*8 :: rtemp 162 | integer :: x ! pivot point 163 | x = SortInts(1) 164 | i= 0 165 | j= size(SortInts) + 1 166 | do 167 | j = j-1 168 | do 169 | if (SortInts(j) <= x) exit 170 | j = j-1 171 | end do 172 | i = i+1 173 | do 174 | if (SortInts(i) >= x) exit 175 | i = i+1 176 | end do 177 | if (i < j) then 178 | ! exchange A(i) and A(j) 179 | temp = SortInts(i) 180 | SortInts(i) = SortInts(j) 181 | SortInts(j) = temp 182 | rtemp=Secondarys(i) 183 | Secondarys(i)=Secondarys(j) 184 | Secondarys(j)=rtemp 185 | elseif (i == j) then 186 | marker = i+1 187 | return 188 | else 189 | marker = i 190 | return 191 | endif 192 | end do 193 | 194 | end subroutine IntPartition_real 195 | 196 | 197 | subroutine IntPartition_complex(SortInts, Secondarys,marker) 198 | integer, intent(inout), dimension(:) :: SortInts 199 | complex*16, intent(inout), dimension(:) :: Secondarys 200 | integer, intent(out) :: marker 201 | integer :: i, j 202 | integer :: temp 203 | complex*16 :: rtemp 204 | integer :: x ! pivot point 205 | x = SortInts(1) 206 | i= 0 207 | j= size(SortInts) + 1 208 | do 209 | j = j-1 210 | do 211 | if (SortInts(j) <= x) exit 212 | j = j-1 213 | end do 214 | i = i+1 215 | do 216 | if (SortInts(i) >= x) exit 217 | i = i+1 218 | end do 219 | if (i < j) then 220 | ! exchange A(i) and A(j) 221 | temp = SortInts(i) 222 | SortInts(i) = SortInts(j) 223 | SortInts(j) = temp 224 | rtemp=Secondarys(i) 225 | Secondarys(i)=Secondarys(j) 226 | Secondarys(j)=rtemp 227 | elseif (i == j) then 228 | marker = i+1 229 | return 230 | else 231 | marker = i 232 | return 233 | endif 234 | end do 235 | 236 | end subroutine IntPartition_complex 237 | 238 | end module mQuickSort 239 | 240 | 241 | 242 | -------------------------------------------------------------------------------- /RCM.f90: -------------------------------------------------------------------------------- 1 | module mRCM 2 | ! By Kristoffer Andersen, Dec 2016 3 | ! Updated by Tue Boesen, August 2017 4 | ! These routines can be used to reorder a matrix. 5 | ! The only dependecies are to mSmat, mError, mArrays and mQuicksort 6 | ! 7 | ! The main routines are> 8 | ! RCMSort - performs a reverse Cuthill-McKee ordering of a matrix 9 | ! SymPerm - performs a symmetric permutation of a matrix: PAP^t 10 | ! HalfPermOnlyTranspose - by using permutation operations this 11 | ! routine finds the transpose of a matrix 12 | 13 | use mSMat 14 | implicit none 15 | 16 | contains 17 | subroutine RCMSort(ioA,oPerm) 18 | ! Kristoffer Andersen, Dec 2016 19 | ! this routine finds the reverse Cuthill-McKee ordering of matrix ioA 20 | ! and makes the symmetric permutation A = PAP^t 21 | ! 22 | ! IO 23 | ! IO - ioA - on input the unpermuted matrix 24 | ! on output the permuted matrix 25 | ! O - oPerm - the permuatation vector - this can ie be used for the vectors used together with the matrix 26 | ! 27 | implicit none 28 | type(Tsparsemat),intent(inout) :: ioA 29 | integer,intent(out) :: oPerm(:) 30 | ! prog 31 | call RCMOrdering(ioA,oPerm) 32 | call symperm(ioA,oPerm) 33 | end subroutine 34 | 35 | 36 | subroutine RCMOrdering(iA,oRCMind,opStartIndex) 37 | ! Kristoffer Andersen, Dec 2016 38 | ! this routine finds the reverse Cuthill-McKee ordering for a 39 | ! symmetric sparse matrix iA. 40 | ! IO 41 | ! IO - iA - the unpermuted matrix 42 | ! O - RCMind - the permutation vector - this can ie be used for the vectors used together with the matrix 43 | ! I (optional) - opStartIndex - optional start index (default = 1) 44 | ! 45 | use mArrays 46 | use mQuickSort 47 | implicit none 48 | type(Tsparsemat),intent(inout) :: iA 49 | integer,intent(out) :: oRCMind(:) 50 | integer,intent(in),optional :: opStartIndex 51 | !var 52 | logical, allocatable :: inRCMind(:) 53 | integer :: i,j,k,ni,ni1,nneighbours,jn,ad,itot,inArray,MinEle,n,MaxEle 54 | integer,allocatable :: Neighbours(:),Adj(:),R(:),ele(:) 55 | !prog 56 | 57 | allocate(inRCMind(iA%NoRows)) 58 | allocate(Ele(iA%NoRows)) 59 | allocate(R(iA%NoRows)) 60 | 61 | inRCMInd(:) = .false. ! non sorted 62 | 63 | MaxEle=-1 64 | Ele=-1 65 | do j=1,iA%NoRows 66 | n=iA%RowIdxs(j+1)-iA%RowIdxs(j) 67 | MaxEle=max(MaxEle,n) 68 | Ele(j)=n 69 | end do 70 | 71 | allocate(Neighbours(MaxEle)) 72 | allocate(Adj(MaxEle)) 73 | 74 | 75 | R=-1 76 | itot = 1 77 | inArray = 0 78 | do 79 | if(itot.gt.iA%NoRows) exit ! termination 80 | if(inArray.ge.iA%NoRows ) exit 81 | 82 | if(R(itot).eq.-1) then 83 | MinEle=999999 84 | inArray=inArray+1 85 | do j=1,iA%NoRows 86 | if (inRCMInd(j)) then 87 | cycle 88 | end if 89 | n=iA%RowIdxs(j+1)-iA%RowIdxs(j) 90 | if (n.lt.MinEle) then 91 | MinEle=n 92 | R(itot)=j 93 | end if 94 | end do 95 | end if 96 | i = R(itot) !present index 97 | inRCMInd(i) = .true. !Mark it 98 | 99 | 100 | 101 | ! get neighbours 102 | ni = iA%Rowidxs(I) 103 | ni1 = iA%Rowidxs(I+1) 104 | Nneighbours = ni1-ni 105 | Neighbours(1:Nneighbours) = ia%Colidxs(ni:(ni1-1)) 106 | ! remove already present neighbours - this will also remove the self reference 107 | jn = 1 108 | do j = 1,Nneighbours 109 | if(.not.inRCMInd(Neighbours(J))) then 110 | Neighbours(Jn) = Neighbours(J) 111 | jn = jn+1 112 | endif 113 | enddo 114 | Nneighbours = jn-1 115 | 116 | Adj(1:Nneighbours)=Ele(Neighbours(1:Nneighbours)) 117 | ! sorting 118 | call IntQSortC(adj(1:Nneighbours),Neighbours(1:Nneighbours)) ! we need a real type routine for this. 119 | 120 | !Add new nodes to the queue 121 | do j = 1,Nneighbours 122 | inArray = inArray+1 123 | R(inArray) = Neighbours(j) ! add neighbours 124 | inRCMInd(Neighbours(J)) = .true. ! mark as included 125 | enddo 126 | itot = itot+1 127 | enddo 128 | 129 | ! finally reverse the order 130 | do i=1,(iA%NoRows) 131 | oRCMind(i) = R(iA%NoRows-i+1) 132 | enddo 133 | 134 | return 135 | ! error messages 136 | end subroutine RCMOrdering 137 | 138 | 139 | subroutine halfperm(iA,oB,iPerm,opColCounts) 140 | ! by Kristoffer Andersen, Dec 2016 141 | ! this routine performs the operation B = (P*A)^t, where P is er 142 | ! permutation vector of A 143 | ! The out is sorted independent on the status of the input - nice feature! 144 | ! 145 | ! IO 146 | ! I iA - unpermuted matrix 147 | ! O oB - permuted and sorted matrix 148 | ! I iPerm - permutation vector 149 | ! I opColCounts (optional) - used to speed up repeated calls of the routine. 150 | implicit none 151 | !io 152 | type(Tsparsemat),intent(in) :: iA 153 | type(Tsparsemat),intent(out) :: oB 154 | integer,intent(in) :: iPerm(:) 155 | integer,intent(in),optional :: opColCounts(:) 156 | !var 157 | integer :: pi,i,j,jp,p,pm1,pm2,jpt 158 | !prog 159 | 160 | call SMatCreate(oB,ia%NoCols,iA%NoRows,'(PA)^T',.false.,iA%NoElements) 161 | 162 | ! create column structure in permuted matrix - the column count is unchanged by row permutations 163 | if(present(opColCounts)) then ! when halfperm is called twice in symperm, one already knows this 164 | oB%RowIdxs(1:iA%NoRows) = opColCounts(1:iA%NoRows) 165 | else 166 | oB%RowIdxs(:) = 0 167 | do i=1,iA%NoRows 168 | do j=iA%RowIDxs(i),iA%RowIdxs(i+1)-1 169 | oB%RowIdxs(iA%ColIDxs(j)) = oB%RowIdxs(iA%ColIDxs(j))+1 ! find coumn counts 170 | enddo 171 | enddo 172 | endif 173 | ! create column pointers 174 | pm2 = 0 175 | pm1 = 1 176 | do i=3,iA%NoCols+3 177 | p = pm1+ob%RowIdxs(i-2) 178 | ob%RowIdxs(i-2) = pm2 179 | pm2 =pm1 180 | pm1 = p 181 | enddo 182 | 183 | ! fill matrix 184 | do i=1,iA%NoRows 185 | pi = iPerm(i) ! get the permuted row 186 | do jp=iA%RowIDxs(pi),iA%RowIdxs(pi+1)-1 187 | j = iA%ColIdxs(jp) 188 | jpt = oB%RowIdxs(j+1) 189 | ob%ColIdxs(jpt) = i 190 | oB%Vals(jpt) = iA%Vals(jp) 191 | oB%RowIdxs(j+1) = jpt+1 192 | enddo 193 | enddo 194 | oB%RowIdxs(1) = 1 195 | oB%NoElements = iA%NoElements 196 | 197 | return 198 | ! error messages 199 | 200 | end subroutine halfperm 201 | 202 | subroutine halfpermOnlyTranspose(iA,oB) 203 | ! by Kristoffer Andersen, Dec 2016 204 | ! this routine performs the operation B = A^t 205 | ! The out is sorted independent on the status of the input - nice feature! 206 | ! 207 | ! IO 208 | ! I iA - unpermuted matrix 209 | ! O oB - permuted and sorted matrix 210 | ! 211 | implicit none 212 | !io 213 | type(Tsparsemat),intent(in) :: iA 214 | type(Tsparsemat),intent(out) :: oB 215 | !var 216 | integer :: pi,i,j,jp,p,pm1,pm2,jpt 217 | !prog 218 | call SMatCreate(oB,ia%NoCols,iA%NoRows,'(A)^T',.false.,iA%NoElements) 219 | 220 | ! create column structure in permuted matrix - the column count is unchanged by row permutations 221 | oB%RowIdxs(:) = 0 222 | do i=1,iA%NoRows 223 | do j=iA%RowIDxs(i),iA%RowIdxs(i+1)-1 224 | oB%RowIdxs(iA%ColIDxs(j)) = oB%RowIdxs(iA%ColIDxs(j))+1 ! find coumn counts 225 | enddo 226 | enddo 227 | 228 | ! create column pointers 229 | pm2 = 0 230 | pm1 = 1 231 | do i=3,iA%NoCols+3 232 | p = pm1+ob%RowIdxs(i-2) 233 | ob%RowIdxs(i-2) = pm2 234 | pm2 =pm1 235 | pm1 = p 236 | enddo 237 | 238 | ! fill matrix 239 | do i=1,iA%NoRows 240 | pi = i ! get row 241 | do jp=iA%RowIDxs(pi),iA%RowIdxs(pi+1)-1 242 | j = iA%ColIdxs(jp) 243 | jpt = oB%RowIdxs(j+1) 244 | ob%ColIdxs(jpt) = i 245 | oB%Vals(jpt) = iA%Vals(jp) 246 | oB%RowIdxs(j+1) = jpt+1 247 | enddo 248 | enddo 249 | oB%RowIdxs(1) = 1 250 | oB%NoElements = iA%NoElements 251 | 252 | return 253 | ! error messages 254 | 255 | end subroutine halfpermOnlyTranspose 256 | 257 | subroutine symperm(ioA,iPerm) 258 | ! by Kristoffer Andersen, Dec 2016 259 | ! performs the halfperm twice in order to get B = PAP^t = (P(PA)^t)^t 260 | ! 261 | ! IO 262 | ! IO ioA - in input: unpermuted matrix 263 | ! on output: permuted and sorted matrix 264 | ! I iPerm- permutation vector 265 | implicit none 266 | !io 267 | type(Tsparsemat),intent(inout) :: ioA 268 | integer,intent(in) :: iPerm(:) 269 | !var 270 | type(tsparsemat) :: temp 271 | integer,allocatable :: ColCounts(:) 272 | integer :: i,pi,N 273 | !prog 274 | call halfperm(ioA,temp,iPerm) 275 | N = ioA%NoRows 276 | allocate(ColCounts(N)) 277 | do i = 1,N 278 | pi = iPerm(i) 279 | ColCounts(i) = ioA%RowIdxs(pi+1)-ioA%RowIdxs(pi) 280 | enddo 281 | call halfperm(temp,ioA,iPerm,ColCounts) 282 | return 283 | end subroutine symperm 284 | 285 | subroutine PermuteVector(ioV,iPerm) 286 | ! by Kristoffer Andersen, Dec 2016 287 | ! this routine permutes a vector according to iPerm 288 | ! 289 | ! IO 290 | ! IO ioV - in input: unpermuted vector 291 | ! on output: permuted vector 292 | ! I iPerm- permutation vector 293 | real*8,intent(inout) :: ioV(:) 294 | integer,intent(in) :: iPerm(:) 295 | ! var 296 | real*8,allocatable :: temp(:) 297 | integer :: i,N 298 | !prog 299 | N = ubound(ioV,1) 300 | allocate(temp(N)) 301 | temp(1:N) = ioV(1:N) 302 | do i=1,N 303 | ioV(iPerm(i)) = temp(i) 304 | enddo 305 | return 306 | end subroutine PermuteVector 307 | 308 | end module mRCM 309 | 310 | 311 | -------------------------------------------------------------------------------- /Readme.MD: -------------------------------------------------------------------------------- 1 | # Sparse Iterative parallel linear solver 2 | 3 | ## Introduction 4 | 5 | This is a sparse iterative linear solver I developed while doing my Phd at the HydroGeophysics Group, Aarhus University, Denmark. 6 | The code is developed in Fortran/OpenMP, and is designed with focus on speed especially in parallel. 7 | The code uses a block-splitting of the linear system without any overlap. 8 | It is intended to be used on matrices which have already been reordered using a reverse Cuthill-Mckee reordering. 9 | 10 | 11 | ## What is in this pack 12 | 13 | | Filename | Description | 14 | | ----------- | ----------- | 15 | | Example.f90 | Contains a small example, that initialize the parallelization, loads a matrix and solves the linear system. | 16 | | Solver.f90 | Contains the iterative sparse solver. | 17 | | SparseMatrix.f90 | Contains various sparse matrix functions needed by the iterative solver. | 18 | | Paralllelization.f90 | Contains the parallelization framework. | 19 | | Misc.f90 | Contains a few helper routines needed by the other modules. | 20 | | Load.f90 | Contains a minimalistic structure for loading and setting the various settings for the example. | 21 | | Matrix folder | Contains the matrix used in the example saved in compressed sparse row format.| 22 | | Technical_Report.pdf | A technical report which introduces both the parallelization framework and the iterative solver.| 23 | | Settings.ini | Settings for the parallelization framework and the iterative solver.| 24 | 25 | ## How to set it up 26 | 27 | Open either the parallelization solution file or project file, or build your own project by using Example.f90 as the main program and the rest as dependencies. 28 | 29 | Once the project has been built, place all files in the same folder and run it like this: 30 | 31 | -> solver.exe settings.ini 001 outputfilename (where 001 is the name of the sparseMatrix in the matrix folder) 32 | -------------------------------------------------------------------------------- /Settings.ini: -------------------------------------------------------------------------------- 1 | Example of an ini file - this is skipped during the load and can contain anything 2 | -99 !AffinityMode 3 | -99 !AffinityPattern 4 | 0 !UseNested 5 | -99 !StartThread 6 | 0 !NCPUs 7 | -99 !NCPUsLow 8 | -99 !NCPUOuter 9 | -1 !SolverType 10 | -1 !Max number of iterations to run during the propagation 11 | 1 !Blocksize, Each block will be a modulo of this size 12 | -1 !FillingFactor, A factor that scales the number of elements saved in each row during the IC factorization 13 | 1 !Determines whether to normalize the matrix before solving it. -------------------------------------------------------------------------------- /Solver.f90: -------------------------------------------------------------------------------- 1 | !MIT License 2 | ! 3 | !Copyright (c) [2017] [HydroGeophysics Group, Aarhus University, Denmark] 4 | ! 5 | !Permission is hereby granted, free of charge, to any person obtaining a copy 6 | !of this software and associated documentation files (the "Software"), to deal 7 | !in the Software without restriction, including without limitation the rights 8 | !to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | !copies of the Software, and to permit persons to whom the Software is 10 | !furnished to do so, subject to the following conditions: 11 | ! 12 | !The above copyright notice and this permission notice shall be included in all 13 | !copies or substantial portions of the Software. 14 | ! 15 | !THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | !IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | !FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | !AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | !LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | !OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | !SOFTWARE. 22 | 23 | !Tue jan, 2015 24 | !This module contains iterative solvers for linear systems 25 | 26 | module mSolver 27 | use mMisc 28 | use mSMat 29 | implicit none 30 | 31 | type TSolverStatus 32 | !This type is a subtype in TSolverSettings, meant for output from the solve 33 | logical, private :: Success=.false. !True if system converged 34 | integer :: Status !Status is the variable that gets reported to the surrounding code, on a succesfull solve, status=nint(iter) but otherwise it can contain error messages or -1 if no convergence was reached 35 | real*8, private :: AbsError !Average absolute error 36 | real*8, private :: RelError !Average relative error 37 | real*8, private :: FactorizationTime !Time spent factorizing 38 | real*8, private :: SolverTime !Time spent solving 39 | 40 | !MultiRHS variables 41 | real*8, private :: Iter !Average number of iterations used 42 | integer, private :: Iter_min !Minimum number of iterations 43 | integer, private :: Iter_max !Maximum number of iterations 44 | real*8, private :: AbsError_min !Minimum absolute error 45 | real*8, private :: AbsError_max !Maximum absolute error 46 | real*8, private :: RelError_min !Minimum relative error 47 | real*8, private :: RelError_max !Maximum relative error 48 | end type TSolverStatus 49 | 50 | type TSolverSettings 51 | !This type is meant to contain all settings needed for the solver to run, in addition it will also contain the output from the solve. 52 | logical, private :: IsPrepared=.false. !Before solver runs it checks this to see whether solversettings have been prepared. 53 | logical, private :: IsComplex !True if system is complex 54 | integer, private :: UseNormalization !If True, the matrix is normalized as: D^-1*A*D^-1 (D*x) = D^-1*b, where D=sqrt(diag(A)) 55 | integer, private :: UseRCM !If True, the matrix is reordered using Reverse Cuthill-Mckee reordering 56 | integer, private :: NoRHS !Number of right hands sides 57 | integer, private :: PrecondMaxFill !Max number of elements each row in LUT factorization can contain (excluding the diagonal) 58 | integer, private :: NoElements !Actual number of elements used in LUT factorization (not sure this is set) 59 | integer, private :: MaxIter !Max number of Iterations 60 | integer, private :: SolverType !Which solver to use 61 | integer, private :: Inform !Set how much information solver should print, 0 = silent, 1 = basic statistics, 2 = solversettings+basic statistics, 3=full information 62 | integer, private :: Blocksize !The minimum blocksize 63 | integer, private :: NBlocks !The number of blocks used in the parallel preconditioner computation 64 | integer, private :: FillingFactor !The filling factor used for computing PrecondMaxFill 65 | real*8, private :: PrecondTolerance !DropTolerance of ILUT filling 66 | real*8, private :: StepLimit !When to stop if convergence is not reached 67 | real*8, private :: AbsTol !Convergence criteria 68 | real*8, private :: RelTol !Convergence criteria 69 | real*8, private :: DiagDom !Diagonal dominance, an empirical parameter that gives a rough measure for how difficult this system will be to solve, and hence how many elements we need in LUT factorization 70 | type(TSolverStatus) :: Output 71 | 72 | !RCM ordering indices 73 | integer,dimension(:),allocatable,private :: RCMIndices !Indices returned from RCM reordering, in case it should be reversed at the end 74 | !Normalization factor 75 | real*8,dimension(:),allocatable,private :: Normalization !Normalization is used if the logical Variable UseNormalization is true 76 | real*8,dimension(:),allocatable,private :: ReNormalization !Renormalization is used if the logical variable UseNormalization is true, this brings back the original Matrix afterwards. 77 | !Block splitting 78 | integer,dimension(:,:),allocatable, private :: Blocks !The actual blocks the matrix will be split up in during a parallel run 79 | !LU0/LUT preconditioner 80 | integer,dimension(:),allocatable, private :: PrecondCols !Used for LUT precondition to store the ColIdx in CSR format 81 | integer,dimension(:),allocatable, private :: PrecondRows !Used for LUT precondition to store the RowIdx in CSR format 82 | real*8,dimension(:),allocatable, private :: PrecondVals !Used in real LU0/LUT preconditioning to store the values. 83 | !LU0 - complex values 84 | complex*16,dimension(:),allocatable, private :: PrecondcVals !Used in complex LU0 preconditioning to store the values 85 | !BLUT preconditioner 86 | integer,dimension(:,:),allocatable, private :: BPrecondIdxs !Used for BLUT precondition to store the ColIdx in block CSR format 87 | integer,dimension(:,:),allocatable, private :: BPrecondDiag !Used for BLUT precondition to store the RowIdx in block CSR format 88 | real*8,dimension(:,:),allocatable, private :: BPrecondVals !Used in BLUT preconditioning to store the values of each block. 89 | !settings for PARDISO solver 90 | 91 | end type TSolverSettings 92 | 93 | interface SolverAxB 94 | module procedure SingleRHS_wrapper 95 | end interface 96 | 97 | contains 98 | 99 | 100 | !**************************************************************************************** 101 | 102 | 103 | subroutine SingleRHS_wrapper(iA,iRHS,ioX,SolverSettings) 104 | !Tue, Dec 2014 105 | !This routine is a wrapper routine for all the our solvers which handle systems with a single righthandside. 106 | !IO 107 | !iA -> input matrix, in CSR format, and of the type TSparseMat 108 | !iRHS -> a system of right hand sides, RHS(:,1) is one right hand side vector 109 | !ioX -> on input this matrix contains the solution guess for each vector, on output it contains the found solutions 110 | !SolverSettings -> contains all the relevant settings the solvers need, it is also where stats about the solve is saved. 111 | ! This needs to be set before calling this routine. 112 | ! 113 | use omp_lib 114 | use msMat 115 | implicit none 116 | type (TSparseMat),intent(inout) :: iA 117 | type(TSolverSettings),intent(inout):: SolverSettings 118 | real*8,intent(inout),dimension(:) :: iRHS 119 | real*8,intent(inout),dimension(:) :: ioX 120 | character*200 :: S 121 | integer :: i,j 122 | integer :: NoNumaNodes,Threads,ostatus,error 123 | real*8 :: RunTimeBegin,RunTimeEnd 124 | real*8,dimension(:),allocatable :: InvDiag,tmp 125 | integer,parameter :: NoRowsDenseLimit=35000 !with 35000 elements the dense solver takes about 150 seconds to solve the problem 126 | integer :: OldThreads 127 | logical :: dummy 128 | logical(4) :: Success 129 | allocate(tmp(iA%NoRows)) 130 | dummy = startOpenMP(2,1) 131 | 132 | RuntimeBegin = omp_get_wtime() 133 | Call checkSolverSettings(SolverSettings) 134 | 135 | if(SolverSettings%UseRCM) then 136 | tmp(:)=iRHS(SolverSettings%RCMIndices) 137 | iRHS=tmp 138 | tmp(:)=ioX(SolverSettings%RCMIndices) 139 | ioX=tmp 140 | end if 141 | if(SolverSettings%UseNormalization) then 142 | do i=1,iA%NoRows 143 | iRHS(i)=iRHS(i)*SolverSettings%Normalization(i) 144 | ioX(i)=ioX(i)*SolverSettings%ReNormalization(i) 145 | end do 146 | end if 147 | 148 | SELECT CASE (SolverSettings%SolverType) 149 | CASE(1,2) 150 | call SingleRHS(iA,iRHS,ioX,SolverSettings) 151 | CASE(3) 152 | !Pardiso (Not implemented here) 153 | END SELECT 154 | 155 | !If we used Normalization, we now need to renormalize our solution vector x and our input iRHS. 156 | if(SolverSettings%UseNormalization) then 157 | do i=1,iA%NoRows 158 | ioX(i)=ioX(i)*SolverSettings%Normalization(i) 159 | iRHS(i)=iRHS(i)*SolverSettings%ReNormalization(i) 160 | end do 161 | end if 162 | if(SolverSettings%UseRCM) then !This is not right, it needs to be the inverse here, but should be easily fixed 163 | do i=1,iA%NoRows 164 | tmp(SolverSettings%RCMIndices(i)) = iRHS(i) 165 | end do 166 | iRHS=tmp 167 | do i=1,iA%NoRows 168 | tmp(SolverSettings%RCMIndices(i)) = ioX(i) 169 | end do 170 | ioX=tmp 171 | end if 172 | 173 | RuntimeEnd=omp_get_wtime() 174 | SolverSettings%Output%SolverTime=RunTimeEnd-RunTimeBegin 175 | if (.not.SolverSettings%Output%success) SolverSettings%Output%Status = -SolverSettings%Output%Status 176 | call ReturnOldThreadNum() 177 | return 178 | end subroutine SingleRHS_wrapper 179 | 180 | !**************************************************************************************** 181 | 182 | 183 | 184 | subroutine SingleRHS(iA,iRHS,ioX,SolverSettings) 185 | !Tue, November 2014 186 | !Block Preconditioned Conjugate Gradient, with uma. 187 | !This routine solves linear systems using the Preconditioned Conjugate gradient method, as a preconditioner the method uses either SOR or BLUT depending on SolverSettings 188 | !More information on the method can be found in the book "Iterative methods for sparse linear systems". 189 | !IO 190 | !iA -> input matrix, in CSR format, and of the type TSparseMat 191 | !iRHS -> a system of right hand sides, RHS(:,1) is one right hand side vector 192 | !ioX -> on input this matrix contains the solution guess for each vector, on output it contains the found solutions 193 | !SolverSettings -> contains all the relevant settings the solvers need, it is also where stats about the solve is saved. 194 | ! This needs to be set before calling this routine. 195 | ! 196 | ! 197 | use omp_lib 198 | implicit none 199 | 200 | type (TSparseMat),intent(inout) :: iA 201 | type(TSolverSettings),intent(inout) :: SolverSettings 202 | 203 | real*8,intent(in),dimension(:) :: iRHS 204 | real*8,intent(inout),dimension(:) :: ioX 205 | 206 | !Parallel vars 207 | Integer :: Threads 208 | !Local vars 209 | integer :: i 210 | integer,save :: j 211 | integer :: MaxIter,NoRows,iNuma 212 | real*8 :: rzDotProd,eps 213 | real*8 :: alfa, beta 214 | real*8 :: error,norm 215 | real*8,allocatable :: r(:),z(:),p(:),Ap(:),OldX(:),StartX(:) 216 | logical :: KeepLooping 217 | 218 | 219 | !SOR vars 220 | integer,dimension(:),allocatable :: DiagIdx 221 | integer,dimension(:,:),allocatable :: BRowIdx 222 | real*8,dimension(:),allocatable :: InvDiag 223 | 224 | 225 | Eps=1e-6 226 | iNuma=1 227 | NoRows=size(iRHS) 228 | allocate(r(NoRows)) 229 | allocate(z(NoRows)) 230 | allocate(p(NoRows)) 231 | allocate(Ap(NoRows)) 232 | allocate(OldX(NoRows)) 233 | allocate(StartX(NoRows)) 234 | OldX=ioX 235 | StartX=ioX 236 | !First the block parallisation 237 | Threads=OMP_GET_MAX_THREADS () 238 | 239 | call SmatAmultV(iA,ioX(:),r(:)) 240 | call VectorAddition(1,iNuma,NoRows,r,iRHS) !r(:)=iRHS(:)-r(:) 241 | 242 | SELECT CASE (SolverSettings%SolverType) 243 | CASE(1) 244 | allocate(DiagIdx(iA%NoRows)) 245 | call SMatGetDiagIdx(iA,DiagIdx) 246 | allocate(InvDiag(iA%NoRows)) 247 | call SMatGetInvDiag(iA,InvDiag) 248 | allocate(BRowIdx(iA%NoRows,2)) 249 | call SMatGetBlockRowIdx(iA,SolverSettings%Blocks,BRowIdx) 250 | call Apply_BPrecondition_BSGS(iA%ColIdxs,iA%RowIdxs,iA%Vals,BRowIdx,DiagIdx,InvDiag,r,z,SolverSettings%Blocks) 251 | CASE(2) 252 | call Apply_BPrecondition_BLUT(SolverSettings%Blocks, r, z, SolverSettings%BPrecondVals, SolverSettings%BPrecondIdxs, SolverSettings%BPrecondDiag) 253 | END SELECT 254 | 255 | call VectorAddition(0,iNuma,NoRows,p,z) !p(:)=z(:) 256 | 257 | do j = 1 , SolverSettings%MaxIter 258 | call SMatAMultV(iA,p,Ap) 259 | 260 | rzDotProd=ddot(NoRows,r,1,z,1) 261 | alfa=rzDotProd/ddot(NoRows,Ap(:),1,p,1) 262 | call VectorAddition(4,iNuma,NoRows,ioX,p,alfa,r,Ap) !ioX(:) = ioX(:) + alfa*p(:) , r(:) = r(:) - alfa*Ap(:) 263 | 264 | SELECT CASE (SolverSettings%SolverType) 265 | CASE(1) 266 | call Apply_BPrecondition_BSGS(iA%ColIdxs,iA%RowIdxs,iA%Vals,BRowIdx,DiagIdx,InvDiag,r,z,SolverSettings%Blocks) 267 | CASE(2) 268 | call Apply_BPrecondition_BLUT(SolverSettings%Blocks, r, z, SolverSettings%BPrecondVals, SolverSettings%BPrecondIdxs, SolverSettings%BPrecondDiag) 269 | END SELECT 270 | beta = ddot(NoRows,r,1,z,1)/rzDotProd 271 | call VectorAddition(3,iNuma,NoRows,p,z,beta) !p(:) = z(:) + beta * p(:) 272 | 273 | !Did we converge yet? 274 | error=0d0 275 | norm =0d0 276 | Error=sqrt(ddot(NoRows,r,1,r,1)) 277 | Norm=sqrt(ddot(NoRows,ioX,1,ioX,1)) 278 | 279 | if (((error/norm).le.SolverSettings%RelTol).or.(error).le.SolverSettings%AbsTol) then 280 | SolverSettings%Output%Success=.TRUE. 281 | SolverSettings%Output%Status = min(j,SolverSettings%MaxIter) 282 | exit 283 | end if 284 | if (SolverSettings%StepLimit.gt.0) then 285 | Keeplooping=.False. 286 | do i=1,NoRows 287 | if (abs(ioX(i)-OldX(i))/Norm.gt.SolverSettings%StepLimit) then 288 | Keeplooping=.True. 289 | exit 290 | end if 291 | end do 292 | if (.NOT.keeplooping) then 293 | exit 294 | else 295 | OldX(:) = ioX(:) 296 | end if 297 | end if 298 | end do 299 | SolverSettings%Output%Status=min(j,SolverSettings%MaxIter) 300 | SolverSettings%Output%AbsError=error 301 | SolverSettings%Output%RelError=error/norm 302 | if (allocated(DiagIdx)) then 303 | deallocate(DiagIdx) 304 | end if 305 | if (allocated(InvDiag)) then 306 | deallocate(InvDiag) 307 | end if 308 | if (allocated(BRowIdx)) then 309 | deallocate(BRowIdx) 310 | end if 311 | if(.NOT.SolverSettings%Output%Success) ioX=StartX 312 | if(allocated(r)) deallocate(r) 313 | if(allocated(z)) deallocate(z) 314 | if(allocated(p)) deallocate(p) 315 | if(allocated(Ap)) deallocate(Ap) 316 | if(allocated(OldX)) deallocate(OldX) 317 | if(allocated(StartX)) deallocate(StartX) 318 | 319 | end subroutine SingleRHS 320 | 321 | subroutine VectorAddition(imode,iNuma,iNoRows,ioX,iY,alfa,ioZ,iV) 322 | !Tue, Dec 2014 323 | !This routine does different kinds of vector additions in parallel depending on the mode selected. 324 | !mode = 0 : ioX = iY 325 | !mode = 1 : ioX = iY - ioX 326 | !mode = 2 : ioX = ioX + alfa*iY 327 | !mode = 3 : ioX = alfa*ioX + iY 328 | !mode = 4 : ioX(:) = ioX(:) + alfa*iY(:) 329 | ! ioZ(:) = ioZ(:) - alfa*iV(:) 330 | !IO 331 | !imode -> selects the type of vector addition used 332 | !iNuma, -> Number of Numa Nodes 333 | !iNoRows -> Number of Rows 334 | !ioX,iY,ioZ,iV -> vectors 335 | !alfa -> scalar 336 | implicit none 337 | integer,intent(in) :: imode,iNuma,iNoRows 338 | real*8,intent(inout),dimension(:) :: ioX 339 | real*8,intent(in),dimension(:) :: iY 340 | real*8,intent(in),optional :: alfa 341 | real*8,intent(inout),dimension(:),optional :: ioZ 342 | real*8,intent(in),dimension(:),optional :: iV 343 | 344 | integer i 345 | if (iNuma.gt.1) then 346 | if (imode.eq.0) then 347 | !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY) PRIVATE(I) 348 | !$OMP DO 349 | do i=1,iNoRows 350 | ioX(i) = iY(i) 351 | end do 352 | !$OMP END DO 353 | !$OMP END PARALLEL 354 | elseif (imode.eq.1) then 355 | !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY) PRIVATE(I) 356 | !$OMP DO 357 | do i=1,iNoRows 358 | ioX(i) = iY(i) - ioX(i) 359 | end do 360 | !$OMP END DO 361 | !$OMP END PARALLEL 362 | elseif(imode.eq.2) then 363 | !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY,Alfa) PRIVATE(I) 364 | !$OMP DO 365 | do i=1,iNoRows 366 | ioX(i) = ioX(i) + alfa*iY(i) 367 | end do 368 | !$OMP END DO 369 | !$OMP END PARALLEL 370 | elseif(imode.eq.3) then 371 | !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY,Alfa) PRIVATE(I) 372 | !$OMP DO 373 | do i=1,iNoRows 374 | ioX(i) = alfa*ioX(i) +iY(i) 375 | end do 376 | !$OMP END DO 377 | !$OMP END PARALLEL 378 | elseif(imode.eq.4) then 379 | !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY,alfa,ioZ,iV) PRIVATE(I) 380 | !$OMP DO 381 | do i=1,iNoRows 382 | ioX(i) = ioX(i) + alfa*iY(i) 383 | ioZ(i) = ioZ(i) - alfa*iV(i) 384 | end do 385 | !$OMP END DO 386 | !$OMP END PARALLEL 387 | end if 388 | 389 | else 390 | if (imode.eq.0) then 391 | ioX(:) = iY(:) 392 | elseif (imode.eq.1) then 393 | ioX(:) = iY(:) - ioX(:) 394 | elseif(imode.eq.2) then 395 | ioX(:) = ioX(:) + alfa*iY(:) 396 | elseif(imode.eq.3) then 397 | ioX(:) = alfa*ioX(:) + iY(:) 398 | elseif(imode.eq.4) then 399 | ioX(:) = ioX(:) + alfa*iY(:) 400 | ioZ(:) = ioZ(:) - alfa*iV(:) 401 | end if 402 | end if 403 | end subroutine VectorAddition 404 | 405 | subroutine Compute_Precondition_BLUT_real(iA,SolverSettings) 406 | !Tue November 2014 407 | !This routine sets up all the parameters needed for the Block LUT matrix factorization in parallel. 408 | !After everything has been setup, then the factorization algorithm is called. 409 | !IO 410 | !iA -> input matrix, in CSR format, and of the type TSparseMat 411 | !SolverSettings -> contains all the relevant settings the solvers need, it is also where stats about the solve is saved. 412 | ! This needs to be set before calling this routine. 413 | use omp_lib 414 | implicit none 415 | type (TSolverSettings),intent(inout) :: SolverSettings 416 | type (TSparseMat), intent(inout) :: iA 417 | 418 | !Local Vars 419 | integer :: i,Threads,NoRows,MaxRows,NoPrecondElements,SplitNoRows,ierr 420 | integer,dimension(:),allocatable :: jw 421 | integer,dimension(:,:),allocatable :: BRowIdx 422 | real*8 :: Timer1, Timer2 423 | real*8,dimension(:),allocatable :: w 424 | Timer1=omp_get_wtime() 425 | Threads=SolverSettings%NBlocks 426 | allocate(BRowIdx(iA%NoRows,2)) 427 | call SMatGetBlockRowIdx(iA,SolverSettings%BlockS,BRowIdx) 428 | MaxRows = 0 429 | do i = 1 , threads 430 | MaxRows = Max(MaxRows,SolverSettings%BlockS(i,2)-SolverSettings%BlockS(i,1)+1) 431 | end do 432 | NoPrecondElements=(2*SolverSettings%PrecondMaxFill+1)*MaxRows 433 | Allocate(SolverSettings%BPrecondVals(NoPrecondElements,Threads)) 434 | Allocate(SolverSettings%BPrecondIdxs(NoPrecondElements,Threads)) 435 | Allocate(SolverSettings%BPrecondDiag(MaxRows,Threads)) 436 | !$OMP PARALLEL IF(StartOpenMP(2)) DEFAULT(SHARED) PRIVATE(i,SplitNoRows,w,jw,ierr,NoPrecondElements) 437 | !$OMP DO SCHEDULE(STATIC) 438 | do i = 1 , Threads 439 | SplitNoRows=SolverSettings%BlockS(i,2)-SolverSettings%BlockS(i,1)+1 440 | Allocate(w(SplitNoRows+1)) 441 | Allocate(jw(2*SplitNoRows)) 442 | NoPrecondElements=(2*SolverSettings%PrecondMaxFill)*SplitNoRows 443 | 444 | call BLUT_Factorization_real(SplitNoRows,iA%Vals,iA%ColIdxs,BRowIdx(SolverSettings%BlockS(i,1):SolverSettings%BlockS(i,2),:),& 445 | SolverSettings%PrecondMaxFill,SolverSettings%PrecondTolerance,SolverSettings%BPrecondVals(:,i),SolverSettings%BPrecondIdxs(:,i),& 446 | SolverSettings%BPrecondDiag(:,i),NoPrecondElements,w,jw,SolverSettings%BlockS(i,1)-1,ierr) 447 | !On Numa architechture we may now need to gather LU into one matrix and copy it around, depending whether we use PCG og BPCG routines. 448 | deAllocate(w) 449 | deAllocate(jw) 450 | end do 451 | !$OMP END DO 452 | !$OMP END PARALLEL 453 | Timer2=omp_get_wtime() 454 | SolverSettings%Output%FactorizationTime=Timer2-Timer1 455 | if (allocated(BRowIdx)) then 456 | deallocate(BRowIdx) 457 | end if 458 | end subroutine Compute_Precondition_BLUT_real 459 | 460 | 461 | !************************************************************* 462 | 463 | subroutine BlockSplitter(iNoRows,iThreads,iParam,oBlocks) 464 | !Tue, September 2014 465 | !Splits iNoRows into iThreads Intervals. 466 | !Each interval will have a length of iParam*n, where n is a natural number. 467 | !The result will be saved in oBlocks, where oBlocks(i,1) is the start of the i'th interval and oBlocks(i,2) is the end of the interval 468 | !IO 469 | !Update 2015 Tue 470 | !Fixed the case if iNoRows < iThreads 471 | implicit none 472 | Integer,intent(in) :: iNoRows,iThreads 473 | Integer,intent(in) :: iParam 474 | Integer,intent(out),allocatable :: oBlocks(:,:) 475 | !Local vars 476 | integer :: i 477 | integer :: BlockSize 478 | 479 | BlockSize=max(iNoRows/iThreads,1) 480 | BlockSize =mod(IParam-mod(BlockSize,iParam),iParam)+BlockSize 481 | Allocate(oBlocks(iThreads,2)) 482 | do i=1,iThreads 483 | oBlocks(i,1) = (i-1)*(BlockSize)+1 484 | oBlocks(i,2) = min(i*BlockSize,iNoRows) 485 | end do 486 | print*,'Blocks',oBlocks 487 | end subroutine BlockSplitter 488 | 489 | 490 | subroutine BLUT_Factorization_real(n,iVals,iColIdxs,iBRowIdxs,lfil,droptol,alu,jlu,ju,iwk,w,jw,Offset,ierr) 491 | !Tue November 2014 492 | !This routine is designed to compute an Incomplete Cholesky factorization, of a sparse linear system, limited by on a system. 493 | !It has been made by modifying an old ILUT preconditioner made by Saad. 494 | !A slightly modfified description of the ILUT preconditioner is found below. 495 | ! 496 | !IO 497 | ! 498 | !n -> Number of rows in input matrix 499 | !iVals,iColIdxs -> Is the values and Column indexes of the input matrix stored in CSR format 500 | !iBRowIdxs -> Is the cutoff row indices for the input matrix. iBRowIdxs(i,1) is the first element in row i we use, iBRowIdxs(i,2) is the last element in row i we use 501 | !lfil -> The fill in parameter, each row of L and each row of U will have a maximum of lfil elements (excluding the diagonal element) 502 | !droptol -> The minimum threshold before we drop a term. 503 | !alu,jlu,ju -> the output LU matrix stored in modified sparse row format 504 | !iwk -> the length of arrays alu,jlu 505 | !w -> work array 506 | !jw -> work array 507 | !Offset -> The offset between the input matrix idx and the LU idx we want to save them in. 508 | !ierr -> status of subroutine 509 | ! 510 | !----------------------------------------------------------------------* 511 | ! *** ILUT preconditioner *** * 512 | ! incomplete LU factorization with dual truncation mechanism * 513 | !----------------------------------------------------------------------* 514 | !c Author: Yousef Saad *May, 5, 1990, Latest revision, August 1996 * 515 | !c----------------------------------------------------------------------* 516 | !c PARAMETERS 517 | !c----------- 518 | !c 519 | !c on entry: 520 | !c========== 521 | !c n = integer. The row dimension of the matrix A. The matrix 522 | !c 523 | !c iVals,iColIdxs, = matrix stored in Compressed Sparse Row format. 524 | !c iBRowIdxs is special! 525 | !c lfil = integer. The fill-in parameter. Each row of L and each row 526 | !c of U will have a maximum of lfil elements (excluding the 527 | !c diagonal element). lfil must be .ge. 0. 528 | !c ** WARNING: THE MEANING OF LFIL HAS CHANGED WITH RESPECT TO 529 | !c EARLIER VERSIONS. 530 | !c 531 | !c droptol = real*8. Sets the threshold for dropping small terms in the 532 | !c factorization. See below for details on dropping strategy. 533 | !c 534 | !c 535 | !c iwk = integer. The lengths of arrays alu and jlu. If the arrays 536 | !c are not big enough to store the ILU factorizations, ilut 537 | !c will stop with an error message. 538 | !c 539 | !c On return: 540 | !c=========== 541 | !c 542 | !c alu,jlu = matrix stored in Modified Sparse Row (MSR) format containing 543 | !c the L and U factors together. The diagonal (stored in 544 | !c alu(1:n) ) is inverted. Each i-th row of the alu,jlu matrix 545 | !c contains the i-th row of L (excluding the diagonal entry=1) 546 | !c followed by the i-th row of U. 547 | !c 548 | !c ju = integer array of length n containing the pointers to 549 | !c the beginning of each row of U in the matrix alu,jlu. 550 | !c 551 | !c ierr = integer. Error message with the following meaning. 552 | !c ierr = 0 --> successful return. 553 | !c ierr .gt. 0 --> zero pivot encountered at step number ierr. 554 | !c ierr = -1 --> Error. input matrix may be wrong. 555 | !c (The elimination process has generated a 556 | !c row in L or U whose length is .gt. n.) 557 | !c ierr = -2 --> The matrix L overflows the array al. 558 | !c ierr = -3 --> The matrix U overflows the array alu. 559 | !c ierr = -4 --> Illegal value for lfil. 560 | !c ierr = -5 --> zero row encountered. 561 | !c 562 | !c work arrays: 563 | !c============= 564 | !c jw = integer work array of length 2*n. 565 | !c w = real work array of length n+1. 566 | !c 567 | !c---------------------------------------------------------------------- 568 | !c w, ju (1:n) store the working array [1:ii-1 = L-part, ii:n = u] 569 | !c jw(n+1:2n) stores nonzero indicators 570 | !c 571 | !c Notes: 572 | !c ------ 573 | !c The diagonal elements of the input matrix must be nonzero (at least 574 | !c 'structurally'). 575 | !c 576 | !c----------------------------------------------------------------------* 577 | !c---- Dual drop strategy works as follows. * 578 | !c * 579 | !c 1) Theresholding in L and U as set by droptol. Any element whose * 580 | !c magnitude is less than some tolerance (relative to the abs * 581 | !c value of diagonal element in u) is dropped. * 582 | !c * 583 | !c 2) Keeping only the largest lfil elements in the i-th row of L * 584 | !c and the largest lfil elements in the i-th row of U (excluding * 585 | !c diagonal elements). * 586 | !c * 587 | !c Flexibility: one can use droptol=0 to get a strategy based on * 588 | !c keeping the largest elements in each row of L and U. Taking * 589 | !c droptol .ne. 0 but lfil=n will give the usual threshold strategy * 590 | !c (however, fill-in is then mpredictible). * 591 | !c----------------------------------------------------------------------* 592 | implicit none 593 | integer,intent(in) :: n,lfil,iwk,offset 594 | real*8,intent(in),dimension(:) :: iVals 595 | real*8,intent(inout),dimension(:) :: alu,w 596 | real*8,intent(in) :: droptol 597 | 598 | integer,intent(in),dimension(:) :: iColIdxs 599 | integer,intent(in),dimension(:,:) :: iBRowIdxs 600 | integer,intent(inout),dimension(:):: jlu,ju,jw 601 | integer,intent(inout) :: ierr 602 | 603 | !c locals 604 | integer ju0,k,j1,j2,j,ii,i,lenl,lenu,jj,jrow,jpos,length 605 | real*8 tnorm, t, abs, s, fact 606 | if (lfil .lt. 0) goto 998 607 | !c----------------------------------------------------------------------- 608 | !c initialize ju0 (points to next element to be added to alu,jlu) 609 | !c and pointer array. 610 | !c----------------------------------------------------------------------- 611 | ju0 = n+2 612 | jlu(1) = ju0 613 | !c 614 | !c initialize nonzero indicator array. 615 | !c 616 | do j=1,n 617 | jw(n+j) = 0 618 | end do 619 | !c----------------------------------------------------------------------- 620 | !c beginning of main loop. 621 | !c----------------------------------------------------------------------- 622 | do ii = 1, n 623 | j1 = iBRowIdxs(ii,1) 624 | j2 = iBRowIdxs(ii,2) 625 | !j1=iRowIdxs(ii) 626 | !j2=iRowIdxs(ii+1)-1 627 | tnorm = 0.0d0 628 | do k=j1,j2 629 | tnorm = tnorm+abs(iVals(k)) 630 | end do 631 | if (tnorm .eq. 0.0) goto 999 632 | tnorm = tnorm/real(j2-j1+1) 633 | !c 634 | !c unpack L-part and U-part of row of A in arrays w 635 | !c 636 | lenu = 1 637 | lenl = 0 638 | jw(ii) = ii 639 | w(ii) = 0.0 640 | jw(n+ii) = ii 641 | 642 | do j = j1, j2 643 | k = iColIdxs(j)-offset 644 | t = iVals(j) 645 | if (k .lt. ii) then 646 | lenl = lenl+1 647 | jw(lenl) = k 648 | w(lenl) = t 649 | jw(n+k) = lenl 650 | else if (k .eq. ii) then 651 | w(ii) = t 652 | else 653 | lenu = lenu+1 654 | jpos = ii+lenu-1 655 | jw(jpos) = k 656 | w(jpos) = t 657 | jw(n+k) = jpos 658 | endif 659 | end do 660 | jj = 0 661 | length = 0 662 | !c 663 | !c eliminate previous rows 664 | !c 665 | 150 jj = jj+1 666 | if (jj .gt. lenl) goto 160 667 | !c----------------------------------------------------------------------- 668 | !c in order to do the elimination in the correct order we must select 669 | !c the smallest column index among jw(k), k=jj+1, ..., lenl. 670 | !c----------------------------------------------------------------------- 671 | jrow = jw(jj) 672 | k = jj 673 | !c 674 | !c determine smallest column index 675 | !c 676 | do j=jj+1,lenl 677 | if (jw(j) .lt. jrow) then 678 | jrow = jw(j) 679 | k = j 680 | end if 681 | end do 682 | 683 | if (k .ne. jj) then 684 | !c exchange in jw 685 | j = jw(jj) 686 | jw(jj) = jw(k) 687 | jw(k) = j 688 | !c exchange in jr 689 | jw(n+jrow) = jj 690 | jw(n+j) = k 691 | !c exchange in w 692 | s = w(jj) 693 | w(jj) = w(k) 694 | w(k) = s 695 | endif 696 | !c 697 | !c zero out element in row by setting jw(n+jrow) to zero. 698 | !c 699 | jw(n+jrow) = 0 700 | !c 701 | !c get the multiplier for row to be eliminated (jrow). 702 | !c 703 | fact = w(jj)*alu(jrow) 704 | if (abs(fact) .le. droptol) goto 150 705 | !c 706 | !c combine current row and row jrow 707 | !c 708 | do k = ju(jrow), jlu(jrow+1)-1 709 | s = fact*alu(k) 710 | j = jlu(k) 711 | jpos = jw(n+j) 712 | if (j .ge. ii) then 713 | !c 714 | !c dealing with upper part. 715 | !c 716 | if (jpos .eq. 0) then 717 | !c 718 | !c this is a fill-in element 719 | !c 720 | lenu = lenu+1 721 | if (lenu .gt. n) goto 995 722 | i = ii+lenu-1 723 | jw(i) = j 724 | jw(n+j) = i 725 | w(i) = - s 726 | else 727 | !c 728 | !c this is not a fill-in element 729 | !c 730 | w(jpos) = w(jpos) - s 731 | 732 | endif 733 | else 734 | !c 735 | !c dealing with lower part. 736 | !c 737 | if (jpos .eq. 0) then 738 | !c 739 | !c this is a fill-in element 740 | !c 741 | lenl = lenl+1 742 | if (lenl .gt. n) goto 995 743 | jw(lenl) = j 744 | jw(n+j) = lenl 745 | w(lenl) = - s 746 | else 747 | !c 748 | !c this is not a fill-in element 749 | !c 750 | w(jpos) = w(jpos) - s 751 | endif 752 | endif 753 | end do 754 | !c 755 | !c store this pivot element -- (from left to right -- no danger of 756 | !c overlap with the working elements in L (pivots). 757 | !c 758 | length = length+1 759 | w(length) = fact 760 | jw(length) = jrow 761 | goto 150 762 | 763 | 160 continue 764 | !c 765 | !c reset double-pointer to zero (U-part) 766 | !c 767 | do k=1, lenu 768 | jw(n+jw(ii+k-1)) = 0 769 | end do 770 | !c 771 | !c update L-matrix 772 | !c 773 | lenl = length 774 | length = min0(lenl,lfil) 775 | !c 776 | !c sort by quick-split 777 | !c 778 | call qsplit (w,jw,lenl,length) 779 | !c 780 | !c store L-part 781 | !c 782 | do k=1, length 783 | if (ju0 .gt. iwk) goto 996 784 | alu(ju0) = w(k) 785 | jlu(ju0) = jw(k) 786 | ju0 = ju0+1 787 | end do 788 | !c 789 | !c save pointer to beginning of row ii of U 790 | !c 791 | ju(ii) = ju0 792 | !c 793 | !c update U-matrix -- first apply dropping strategy 794 | !c 795 | length = 0 796 | do k=1, lenu-1 797 | if (abs(w(ii+k)) .gt. droptol*tnorm) then 798 | length = length+1 799 | w(ii+length) = w(ii+k) 800 | jw(ii+length) = jw(ii+k) 801 | endif 802 | enddo 803 | lenu = length+1 804 | length = min0(lenu,lfil) 805 | 806 | call qsplit(w(ii+1:ii+lenu), jw(ii+1:ii+lenu), lenu-1,length) 807 | !c 808 | !c copy 809 | !c 810 | t = abs(w(ii)) 811 | if (length + ju0 .gt. iwk) goto 997 812 | do k=ii+1,ii+length-1 813 | jlu(ju0) = jw(k) 814 | alu(ju0) = w(k) 815 | t = t + abs(w(k) ) 816 | ju0 = ju0+1 817 | end do 818 | !c 819 | !c store inverse of diagonal element of u 820 | !c 821 | if (w(ii) .eq. 0.0) w(ii) = (0.0001 + droptol)*tnorm 822 | 823 | alu(ii) = 1.0d0/ w(ii) 824 | !c 825 | !c update pointer to beginning of next row of U. 826 | !c 827 | jlu(ii+1) = ju0 828 | !c----------------------------------------------------------------------- 829 | !c end main loop 830 | !c----------------------------------------------------------------------- 831 | end do 832 | ierr = 0 833 | return 834 | !c 835 | !c incomprehensible error. Matrix must be wrong. 836 | !c 837 | 995 ierr = -1 838 | return 839 | !c 840 | !c insufficient storage in L. 841 | !c 842 | 996 ierr = -2 843 | return 844 | !c 845 | !c insufficient storage in U. 846 | !c 847 | 997 ierr = -3 848 | return 849 | !c 850 | !c illegal lfil entered. 851 | !c 852 | 998 ierr = -4 853 | return 854 | !c 855 | !c zero row encountered 856 | !c 857 | 999 ierr = -5 858 | return 859 | !c----------------end-of-ilut-------------------------------------------- 860 | end subroutine BLUT_Factorization_real 861 | 862 | 863 | subroutine Apply_BPrecondition_BLUT(iBlocks, iRHS, oX, iLUVals, iLUIdxs, iLUDiag) 864 | !Tue November 2014 865 | !This routine does a forward and backward block solve with the Block LUT matrix 866 | ! 867 | !IO 868 | ! 869 | !iBlocks -> The blocksplitting of the Matrix 870 | !iRHS -> The righthandside we update our solution after 871 | !oX -> The output solution 872 | !iLUVals,iLUIdxs,iLUDiag -> The Block LU matrix where each block is a separate matrix stored in modfied sparse row format. 873 | integer,intent(in),dimension(:,:) :: iBlocks 874 | real*8,intent(in),dimension(:) :: iRHS 875 | real*8,intent(out),dimension(:) :: oX 876 | real*8,intent(in),dimension(:,:) :: iLUVals 877 | integer,intent(in),dimension(:,:) :: iLUIdxs, iLUDiag 878 | !local variables 879 | integer :: i,k,j,threads,m,tmp 880 | 881 | 882 | threads=size(iBlocks,1) 883 | !$OMP PARALLEL IF(StartOpenMP(2)) DEFAULT(NONE) SHARED(iBlocks,threads,iRHS,oX,iLUVals,iLUIdxs,iLUDiag) PRIVATE(k,i,j,m,tmp) 884 | !$OMP DO SCHEDULE(STATIC) 885 | do k=1,threads 886 | do i = iBlocks(k,1),iblocks(k,2) 887 | oX(i) = iRHS(i) 888 | end do 889 | ! 890 | ! forward solve (with U^T) 891 | ! 892 | tmp=iBlocks(k,1)-1 893 | m=0 894 | do i = iBlocks(k,1),iblocks(k,2) 895 | m=m+1 896 | oX(i) = oX(i) * iLUVals(m,k) 897 | do j=iLUDiag(m,k),iLUIdxs(m+1,k)-1 898 | oX(tmp+iLUIdxs(j,k)) = oX(tmp+iLUIdxs(j,k)) - iLUVals(j,k)* oX(i) 899 | end do 900 | end do 901 | ! 902 | ! backward solve (with L^T) 903 | ! 904 | do i = iBlocks(k,2),iBlocks(k,1),-1 905 | m=i-tmp 906 | do j=iLUIdxs(m,k),iLUDiag(m,k)-1 907 | oX(tmp+iLUIdxs(j,k)) = ox(tmp+iLUIdxs(j,k)) - iLUVals(j,k)*oX(i) 908 | end do 909 | end do 910 | end do 911 | !$OMP END DO 912 | !$OMP END PARALLEL 913 | end subroutine Apply_BPrecondition_BLUT 914 | 915 | subroutine Apply_BPrecondition_BSGS(iColIdxs,iRowIdxs,iVals,iBRowIdx,iDiagIdx,iInvDiag,iRhS,ioX,Blocks) 916 | !Tue, August 2014 917 | !This routine does a forward and backward block solve with the Block SOR preconditioner 918 | !This algorithm is described in 266-267 in the book "Iterative methods for sparse linear systems" in the normal non-block edition. 919 | !Note that the SGS preconditioner is not calculated beforehand, but rather calculated on the fly during the applying of it. 920 | !IO 921 | !ispA -> Input, Sparse matrix of the linear system 922 | !iDiagIdx -> Input, a vector containing indexes to the diagonal entries in the CSR matrix ispA. 923 | !iInvDiag -> Input, a vector containing the inverse values of the diagonal element in ispA. 924 | !iRhS -> Input, the right hand side of the linear system we try to solve. 925 | !ioX -> Input, Solution estimate to update iteratively. Output, our new updated solution 926 | !Blocks -> Input, contains the blocksplitting of ispA Blocks(i,1) contains the rowIdx of the first row in the i'th block, Blocks(i,2) contains the last. 927 | implicit none 928 | integer,intent(in),dimension(:) :: iColIdxs,iRowIdxs 929 | real*8,intent(in),dimension(:) :: iVals 930 | integer,intent(in),dimension(:,:) :: iBRowIdx 931 | integer,intent(in),dimension(:) :: iDiagIdx 932 | real*8,intent(in),dimension(:) :: iInvDiag 933 | real*8,intent(in),dimension(:) :: iRHS 934 | real*8,intent(inout),dimension(:) :: ioX 935 | integer,intent(in) :: Blocks(:,:) 936 | !Local vars 937 | integer :: i,j,k 938 | real*8 :: tmp 939 | real*8 :: Factor 940 | integer :: threads 941 | real*8 :: omega 942 | threads=size(Blocks,1) 943 | 944 | !$OMP PARALLEL IF(StartOpenMP(2)) DEFAULT(NONE) SHARED(omega,Blocks,iDiagIdx,iInvDiag,iColIdxs,iRowIdxs,iVals,ioX,irhs,threads,iBRowIdx) PRIVATE(i,k,j) 945 | !c$OMP CRITICAL 946 | !$OMP DO SCHEDULE(STATIC) 947 | do k=1,threads 948 | !We start by doing the forward step 949 | do i = Blocks(k,1),Blocks(k,2) 950 | ioX(i) = iRHS(i) 951 | do j = iBRowIdx(i,1) , iDiagIdx(i)-1 952 | ioX(i) = ioX(i) - iVals(j)*iInvDiag(iColIdxs(j))*ioX(iColIdxs(j)) 953 | end do 954 | end do 955 | 956 | !Now comes the backward step 957 | do i = Blocks(k,2),Blocks(k,1),-1 958 | do j = iBRowIdx(i,2) ,iDiagIdx(i)+1,-1 959 | ioX(i) = ioX(i) - iVals(j)*ioX(iColIdxs(j)) 960 | end do 961 | ioX(i) = ioX(i)*iInvDiag(i) 962 | end do 963 | end do 964 | !$OMP END DO 965 | !c$OMP END CRITICAL 966 | !$OMP END PARALLEL 967 | end subroutine Apply_BPrecondition_BSGS 968 | 969 | 970 | subroutine SetSolverSettings(SolverSettings,A,iUseRCM,iUseNormalization,iSolverType,iIter,iBlockSize,iFillingFactor,iStepLimit) 971 | !Tue Dec, 2014 972 | !This routine Sets the solversettings prior to a linear solve. 973 | !NOTE that this also calculates the precondition matrix, which the caller takes responsibility to free. The freeing should be done by calling the FreeSolverSettings subroutine. 974 | !any optional value you wish chosen by the subroutine should be set to -1 975 | !IO 976 | !SolverSettings -> This is where we save all the relevant settings about the solve we want to do - note if we have preconditioner this is also saved in here. 977 | !A -> input matrix, in CSR format, and of the type TSparseMat 978 | !(Optional) The rest are optional parameters which can be chosen manually, any parameter set to -1 will be chosen automatically. 979 | !iUseNormalization -> (optional) Use Normalization 980 | !iSolverType -> (optional) manually chooses the solvertype, if none is selected or set to -1, a fitting type will be chosen based on the problem. 981 | ! 1 SGS 982 | ! 2 BLUT 983 | ! 3 Pardiso (not implemented here) 984 | !iIter -> (optional) manually set the maximum iterations. (default is DEFAULT_MAXITERATIONS) 985 | !iBaseBlockSize -> (optional) The blocksize will be a multiplum of this number for all blocks if specified. 986 | ! this is used for blocksplitting to make sure we don't split in the middle of a block. 987 | !iFillingFactor -> (optional) Determines the number of elements allowed in the IC factorization. Factor is based on the average number of elements in iA; default value 1.5 988 | !iStepLimit -> (optional) sets the relative minimum difference an iterative step should make in the solution 989 | ! This could be important to set when using block solvers where convergence to the solution is not guaranteed 990 | use omp_lib 991 | use msMat 992 | use mRCM 993 | implicit none 994 | type (TSolverSettings),intent(inout) :: SolverSettings 995 | type (TSparseMat), intent(inout) :: A 996 | integer, intent(in),optional :: iUseRCM 997 | integer, intent(in),optional :: iUseNormalization 998 | integer, intent(in),optional :: iSolverType 999 | integer, intent(in),optional :: iIter 1000 | integer, intent(in),optional :: iBlockSize 1001 | real*8, intent(in),optional :: iFillingFactor 1002 | integer, intent(in),optional :: iStepLimit 1003 | character*256 :: S 1004 | real*8 :: NParFactor,DampCutOff,StartTolerance,EndTolerance,DampVal 1005 | real*8,allocatable :: Diag(:) 1006 | logical :: UseSlowButProvenSettings=.false. 1007 | logical :: IsLayeredInv 1008 | integer :: ErrorCode,Param,NModPar,NParSec,i,j,MaxWidth 1009 | integer :: OldThreads 1010 | 1011 | real*8 :: Elements_pr_row,DiagDom,RunTimeBegin,RunTimeEnd 1012 | !Tweakable factors 1013 | integer,parameter :: DEFAULT_USERCM = 0 1014 | integer,parameter :: DEFAULT_USENORMALIZATION = 1 1015 | integer,parameter :: DEFAULT_INFORMATIONLEVEL = 1 1016 | integer,parameter :: DEFAULT_NORHS = 1 1017 | integer,parameter :: DEFAULT_MAXITERATIONS = 500 1018 | 1019 | real*8,parameter :: DEFAULT_STEPLIMIT = 0 1020 | real*8,parameter :: DEFAULT_FILLINGFACTOR=2.5 1021 | real*8,parameter :: MAXDIAGDOM=1 !Not made to be adjusted just yet 1022 | real*8,parameter :: SGS_THRESHOLD=0.1 1023 | 1024 | logical,parameter :: DEBUG_MODE = .FALSE. 1025 | !Test 1026 | real*8 :: DiagDomTest,test 1027 | integer :: NumberOfBlocks 1028 | integer :: Error 1029 | real*8,dimension(:),allocatable :: DummyVal 1030 | logical :: Dummy 1031 | 1032 | 1033 | solverSettings%UseRCM=LoadOptionalParam(-1,iUseRCM) 1034 | if(solverSettings%UseRCM.eq.-1) solverSettings%UseRCM=DEFAULT_USERCM 1035 | if (solverSettings%UseRCM.eq.1) then 1036 | if(allocated(solverSettings%RCMIndices)) deallocate(solverSettings%RCMIndices) 1037 | allocate(solverSettings%RCMIndices(A%NoRows)) 1038 | call RCMSort(A,solverSettings%RCMIndices) 1039 | end if 1040 | 1041 | solverSettings%UseNormalization=LoadOptionalParam(-1,iUseNormalization) 1042 | if(solverSettings%UseNormalization.eq.-1) solverSettings%UseNormalization=DEFAULT_USENORMALIZATION 1043 | if(present(iSolverType)) then ! remove normalization for direct solver 1044 | if(iSolverType.eq.3) solverSettings%UseNormalization = 0 1045 | endif 1046 | 1047 | SolverSettings%FillingFactor=LoadOptionalParam(-1,iFillingFactor) 1048 | if(SolverSettings%FillingFactor.eq.-1) SolverSettings%FillingFactor=DEFAULT_FILLINGFACTOR 1049 | 1050 | if (solverSettings%UseNormalization) then 1051 | if(allocated(SolverSettings%ReNormalization)) deallocate(SolverSettings%ReNormalization) 1052 | if(allocated(SolverSettings%Normalization)) deallocate(SolverSettings%Normalization) 1053 | allocate(SolverSettings%ReNormalization(A%NoRows)) 1054 | allocate(SolverSettings%Normalization(A%NoRows)) 1055 | call SMatGetDiag(A,SolverSettings%ReNormalization,.TRUE.) 1056 | SolverSettings%ReNormalization = sqrt(SolverSettings%ReNormalization) 1057 | SolverSettings%Normalization = 1d0/SolverSettings%ReNormalization 1058 | call SMatDAD(A,SolverSettings%Normalization) 1059 | end if 1060 | 1061 | SolverSettings%IsPrepared = .False. 1062 | SolverSettings%output%Success = .False. 1063 | 1064 | SolverSettings%IsComplex = A%IsComplex 1065 | SolverSettings%BlockSize=LoadOptionalParam(-1,iBlockSize) 1066 | SolverSettings%RelTol=1e-12 1067 | SolverSettings%AbsTol=1e-9 1068 | 1069 | SolverSettings%PrecondTolerance=1e-6 1070 | 1071 | SolverSettings%Steplimit=LoadOptionalParam(-1,iSteplimit) 1072 | if(SolverSettings%Steplimit.eq.-1) SolverSettings%Steplimit=DEFAULT_STEPLIMIT 1073 | 1074 | 1075 | SolverSettings%MaxIter=LoadOptionalParam(-1,iIter) 1076 | if(SolverSettings%MaxIter.eq.-1) SolverSettings%MaxIter=DEFAULT_MAXITERATIONS 1077 | 1078 | if(.not.(iSolverType.eq.3)) call SMatGetDiagonalDominant(A,SolverSettings%DiagDom) 1079 | 1080 | !SolverType is determined 1081 | SolverSettings%SolverType=LoadOptionalParam(-1,iSolverType) 1082 | if (SolverSettings%SolverType.le.0) call SelectSolver(A,SolverSettings,SolverSettings%DiagDom,SGS_threshold) 1083 | 1084 | if(.not.(isolvertype.eq.3)) then 1085 | Elements_pr_row = (1d0*A%NoElements)/(1d0*A%NoRows) 1086 | !Maximum number of precondition elements is chosen 1087 | MaxWidth=SMatFindMaxColDif(A) 1088 | SolverSettings%PrecondMaxFill=NINT(sqrt(SolverSettings%DiagDom)*SolverSettings%FillingFactor*Elements_pr_row) 1089 | SolverSettings%PrecondMaxFill=min(A%NoRows,min(SolverSettings%PrecondMaxFill,2*MaxWidth)) 1090 | endif 1091 | 1092 | !DEBUG_MODE Only relevant in debug mode 1093 | if (DEBUG_MODE) then 1094 | print*,'IsComplex',SolverSettings%IsComplex 1095 | print*,'Blocksize',SolverSettings%BlockSize 1096 | print*,'Diagdom',SolverSettings%DiagDom 1097 | print*,'SolverType',SolverSettings%SolverType 1098 | print*,'SolverSettings%PrecondMaxFill',SolverSettings%PrecondMaxFill 1099 | print*,'Elements_pr_row',Elements_pr_row 1100 | end if 1101 | 1102 | !reset pointers 1103 | if (allocated(SolverSettings%PrecondVals)) deallocate(SolverSettings%PrecondVals,Stat=ErrorCode) 1104 | if (allocated(SolverSettings%PrecondCols)) deallocate(SolverSettings%PrecondCols,Stat=ErrorCode) 1105 | if (allocated(SolverSettings%PrecondRows)) deallocate(SolverSettings%PrecondRows,Stat=ErrorCode) 1106 | if (allocated(SolverSettings%BPrecondVals)) deallocate(SolverSettings%BPrecondVals,Stat=ErrorCode) 1107 | if (allocated(SolverSettings%BPrecondIdxs)) deallocate(SolverSettings%BPrecondIdxs,Stat=ErrorCode) 1108 | if (allocated(SolverSettings%BPrecondDiag)) deallocate(SolverSettings%BPrecondDiag,Stat=ErrorCode) 1109 | if (allocated(SolverSettings%BlockS)) deallocate(SolverSettings%BlockS,Stat=ErrorCode) 1110 | if (allocated(SolverSettings%PrecondcVals)) deallocate(SolverSettings%PrecondcVals,Stat=ErrorCode) 1111 | 1112 | Dummy=StartOpenMP(2) 1113 | NumberOfBlocks=omp_get_max_threads() 1114 | if(.not.(iSolvertype.eq.3)) call BlockSplitter(A%NoRows,NumberOfBlocks,SolverSettings%BlockSize,SolverSettings%BlockS) 1115 | SolverSettings%NBlocks=size(SolverSettings%BlockS,1) 1116 | !Factorization of the preconditioner is done 1117 | RuntimeBegin = omp_get_wtime() 1118 | SELECT CASE(SolverSettings%SolverType) 1119 | CASE(2) 1120 | !BLUT 1121 | call Compute_Precondition_BLUT_real(A,SolverSettings) 1122 | CASE(3) 1123 | !Pardiso (not implemented here) 1124 | END SELECT 1125 | RuntimeEnd = omp_get_wtime() 1126 | SolverSettings%output%FactorizationTime=RuntimeEnd-RunTimeBegin 1127 | print*,'Factorization Time:',SolverSettings%output%FactorizationTime 1128 | SolverSettings%IsPrepared=.true. 1129 | return 1130 | end subroutine SetSolverSettings 1131 | 1132 | subroutine SolverNormalizeMatrix(SolverSettings,ioA,Norm) 1133 | ! Kristoffer Andersen, Dec 2016 1134 | ! Normalizes the input matrix ioA using the normalization vectors in 1135 | ! solversettings. There is no check on the size of the vectors compared to the matrix 1136 | ! Norm controls whether normalization or the renormalization vector is used. 1137 | ! If SolverSettings%UseNormalization = .false. nothing is performed 1138 | ! IO 1139 | ! I - SolverSettings - settings with the normalization vectors. 1140 | ! If normalization is not used nothing happens to ioA 1141 | ! IO - ioA - the matrix which is transformed to D*A*D 1142 | ! I,optional - Norm - controls whether normalization or renormalization is used. 1143 | ! .true. (default) for normalization 1144 | ! .false. for renormalization 1145 | ! 1146 | implicit none 1147 | type(TSolverSettings),intent(in) :: SolverSettings 1148 | type(Tsparsemat),intent(inout) :: ioA 1149 | logical,optional,intent(in) :: Norm 1150 | !var 1151 | logical :: normalize 1152 | if(SolverSettings%USENORMALIZATION) then ! only if normalization is used 1153 | normalize = .true. 1154 | if(present(Norm)) normalize = norm 1155 | 1156 | if(normalize) then 1157 | call SMatDAD(ioA,SolverSettings%Normalization) 1158 | else 1159 | call SMatDAD(ioA,SolverSettings%ReNormalization) 1160 | endif 1161 | endif 1162 | 1163 | RETURN 1164 | END subroutine SolverNormalizeMatrix 1165 | 1166 | 1167 | subroutine SelectSolver(iA,SolverSettings,iDiagDom,iSGS_threshold) 1168 | !Tue Dec, 2014 1169 | !This routine selects the most appropiete solvertype based on the diagonal dominance and various thresholds. 1170 | !IO 1171 | !iA -> input matrix, in CSR format, and of the type TSparseMat 1172 | !SolverSettings -> This is where we save all the relevant settings about the solve we want to do. 1173 | !iDiagDom -> Maximum diagonal dominance factor for the matrix. 1174 | !iSGS_threshold -> threshold between BLUT and SGS preconditioner 1175 | implicit none 1176 | type (TSparseMat),intent(in) :: iA 1177 | type(TSolverSettings),intent(inout) :: SolverSettings 1178 | real*8,intent(in) :: iDiagDom 1179 | real*8,intent(in) :: iSGS_threshold 1180 | 1181 | if (SolverSettings%IsComplex) then 1182 | SolverSettings%SolverType = 12 !LU0 1183 | else 1184 | if (iDiagDom.gt.iSGS_threshold) then 1185 | !BLUT 1186 | SolverSettings%SolverType=2 1187 | else 1188 | !SGS 1189 | SolverSettings%SolverType=1 1190 | end if 1191 | end if 1192 | end subroutine SelectSolver 1193 | 1194 | 1195 | 1196 | subroutine CheckSolverSettings(iSolverSettings) 1197 | !Tue, dec 2014 1198 | !Checks whether the solversettings have been set and if informationlevel is sufficiently high it also reports the solver settings 1199 | ! 1200 | !use mError 1201 | implicit none 1202 | type(TSolverSettings),intent(in) :: iSolverSettings 1203 | character*200 :: S 1204 | integer :: Output 1205 | logical :: PrintToLog 1206 | 1207 | if(.NOT.iSolverSettings%IsPrepared) goto 901 1208 | 1209 | return 1210 | 901 continue 1211 | print*, 'Critical error! Solversettings not set, make sure SetSolverSettings is called before running the solver' 1212 | stop 1213 | end subroutine CheckSolverSettings 1214 | 1215 | 1216 | logical function isSolverNormalized(SolverSettings) 1217 | ! Kristoffer Andersen, Dec 2016 1218 | ! return the normalization state of the solver 1219 | implicit none 1220 | type(TSolverSettings),intent(in) :: SolverSettings 1221 | isSolverNormalized = SolverSettings%USENORMALIZATION 1222 | RETURN 1223 | END FUNCTION isSolverNormalized 1224 | 1225 | 1226 | subroutine FreeSolverSettings(SolverSettings) 1227 | !Tue June 2015 1228 | !This routine frees the memory allocated by SolverSettings. 1229 | !This routine should always be called whenever Solversettings is no longer needed, since caller takes resposibility to free TSolverSettings. 1230 | ! 1231 | implicit none 1232 | type(TSolverSettings),intent(inout) :: SolverSettings 1233 | integer :: ErrorCode 1234 | real*8 :: dummyVal(1) 1235 | integer :: dummyInt(1) 1236 | integer :: dummy 1237 | 1238 | !reset pointers 1239 | if (allocated(SolverSettings%PrecondVals)) deallocate(SolverSettings%PrecondVals,Stat=ErrorCode) 1240 | if (allocated(SolverSettings%PrecondCols)) deallocate(SolverSettings%PrecondCols,Stat=ErrorCode) 1241 | if (allocated(SolverSettings%PrecondRows)) deallocate(SolverSettings%PrecondRows,Stat=ErrorCode) 1242 | if (allocated(SolverSettings%BPrecondVals)) deallocate(SolverSettings%BPrecondVals,Stat=ErrorCode) 1243 | if (allocated(SolverSettings%BPrecondIdxs)) deallocate(SolverSettings%BPrecondIdxs,Stat=ErrorCode) 1244 | if (allocated(SolverSettings%BPrecondDiag)) deallocate(SolverSettings%BPrecondDiag,Stat=ErrorCode) 1245 | if (allocated(SolverSettings%BlockS)) deallocate(SolverSettings%BlockS,Stat=ErrorCode) 1246 | if (allocated(SolverSettings%PrecondcVals)) deallocate(SolverSettings%PrecondcVals,Stat=ErrorCode) 1247 | if (allocated(SolverSettings%Renormalization)) deallocate(SolverSettings%Renormalization,Stat=ErrorCode) 1248 | if (allocated(SolverSettings%normalization)) deallocate(SolverSettings%normalization,Stat=ErrorCode) 1249 | 1250 | end subroutine FreeSolverSettings 1251 | 1252 | 1253 | subroutine qsplit(a,ind,n,ncut) 1254 | implicit none 1255 | real*8 a(n) 1256 | integer ind(n), n, ncut,j,mid 1257 | !c----------------------------------------------------------------------- 1258 | !c does a quick-sort split of a real array. 1259 | !c on input a(1:n). is a real array 1260 | !c on output a(1:n) is permuted such that its elements satisfy: 1261 | !c 1262 | !c abs(a(i)) .ge. abs(a(ncut)) for i .lt. ncut and 1263 | !c abs(a(i)) .le. abs(a(ncut)) for i .gt. ncut 1264 | !c 1265 | !c ind(1:n) is an integer array which permuted in the same way as a(*). 1266 | !c----------------------------------------------------------------------- 1267 | real*8 tmp, abskey 1268 | integer itmp, first, last 1269 | !c----- 1270 | first = 1 1271 | last = n 1272 | if (ncut .lt. first .or. ncut .gt. last) return 1273 | !c 1274 | !c outer loop -- while mid .ne. ncut do 1275 | !c 1276 | 1 mid = first 1277 | abskey = abs(a(mid)) 1278 | do 2 j=first+1, last 1279 | if (abs(a(j)) .gt. abskey) then 1280 | mid = mid+1 1281 | !c interchange 1282 | tmp = a(mid) 1283 | itmp = ind(mid) 1284 | a(mid) = a(j) 1285 | ind(mid) = ind(j) 1286 | a(j) = tmp 1287 | ind(j) = itmp 1288 | endif 1289 | 2 continue 1290 | !c 1291 | !c interchange 1292 | !c 1293 | tmp = a(mid) 1294 | a(mid) = a(first) 1295 | a(first) = tmp 1296 | !c 1297 | itmp = ind(mid) 1298 | ind(mid) = ind(first) 1299 | ind(first) = itmp 1300 | !c 1301 | !c test for while loop 1302 | !c 1303 | if (mid .eq. ncut) return 1304 | if (mid .gt. ncut) then 1305 | last = mid-1 1306 | else 1307 | first = mid+1 1308 | endif 1309 | goto 1 1310 | !c----------------end-of-qsplit------------------------------------------ 1311 | !c----------------------------------------------------------------------- 1312 | end 1313 | 1314 | 1315 | end module mSolver 1316 | -------------------------------------------------------------------------------- /SparseMatrix.f90: -------------------------------------------------------------------------------- 1 | module mSMat 2 | use mParallel, only: StartOpenMP, ReturnOldThreadNum 3 | implicit none 4 | ! logical :: UseDenseAlgebra,VoxelInversion 5 | integer,private :: FInitNoElements=1000000 !Initial allocation size of the matrices 6 | logical :: FCheckDoubleEntries=.false. !Debug var. When .true. the code checks for double entries 7 | logical :: FShowMessages=.false. !when adding new values to the matrix and checks that the column indices of the 8 | logical,private :: ColIdxCorrected=.false. !Internal variable ensuring that column index correction is only applied once. VERY IMPORTANT 9 | !TO TOGGLE THIS VALUE TRUE IN THE BEGINNING OF ROUTINES THAT ALTER THE INDEX AND FALSE IN THE END!!!!! 10 | logical,private :: UseMKLSparseBLAS=.true. !If true the intel math kernel library is used for high performance sparse BLAS. If false, full source routines from SparseKit are used. 11 | logical :: SMatLogParameters=.true. 12 | !$OMP THREADPRIVATE(ColIdxCorrected) 13 | 14 | type TSparseMat 15 | !Public variables(General matrix information) 16 | character*(256) :: Label 17 | integer :: NoElements !Number of elements in the matrix 18 | integer :: NoRows !Number of rows of the matrix 19 | integer :: NoCols !Number of columns of the matrix 20 | 21 | logical :: IsComplex=.false. !True if the matrix is complex 22 | 23 | real*8 , dimension(:),pointer :: Vals=>null() !Values of the matrix elements if Real 24 | Complex*16, dimension(:),pointer :: cVals=>null() !Values of the matrix elements if Complex 25 | integer, dimension(:),pointer :: RowIdxs =>null() !Row indices of the elements 26 | integer, dimension(:),pointer :: ColIdxs=>null() !Column indices of the elements 27 | end type TSparseMat 28 | 29 | 30 | contains 31 | subroutine SMatCreate(ioSparseMat,iNoRows,iNoCols,iLabel,iIsComplex,iAllocSize) 32 | !Casper Kirkegaard, November 2009. 33 | !This routine creates a TSparseMat structure, ie. initializes 34 | !parameters allocates memory etc. 35 | ! 36 | !Updated 2014 Tue, Now it takes complex numbers as well 37 | ! 38 | ! CK, october 2015. 39 | ! Removed initialization of value and column index vectors. It takes a long time when creating small matrices and is not necessary. 40 | ! KRA, Dec 2016. Added optional allocation size. Has to be >= 1 41 | !IO 42 | ! oSparseMat -> Sparse matrix to create 43 | ! iNoRows -> Number of rows in the matrix 44 | ! iNoCols -> Number of cols in the matrix 45 | ! iLabel -> Label of the matrix. Setting this to eg. 'G' or 'B' is very useful 46 | ! when debugging or optimizing 47 | ! iIsComplex -> Default=.false., if true it means that the matrix is complex 48 | ! iAllocSize -> Default=FInitNoElements, is the allocation size. The default value is a 49 | ! model parameter. Has to >=1 50 | implicit none 51 | type (TSparseMat),intent(inout) :: ioSparseMat 52 | integer,intent(in) :: iNoRows 53 | integer,intent(in) :: iNoCols 54 | character*(*),intent(in) :: iLabel 55 | logical,intent(in),optional :: iIsComplex 56 | integer,intent(in),optional :: iAllocSize 57 | integer :: FInitNoElements=1000000 !Initial allocation size of the matrices 58 | ! var 59 | integer :: NoAlloc 60 | !Implementation 61 | !If the matrix already exists, destroy it and recreate. 62 | if (associated(ioSparseMat%Vals)) then 63 | call SMatDestroy(ioSparseMat) 64 | end if 65 | !First initialize the vars of the input matrix 66 | ioSparseMat%IsComplex=.false. 67 | NoAlloc = FInitNoElements 68 | 69 | if (present(iIsComplex)) ioSparseMat%IsComplex=iIsComplex 70 | if (present(iAllocSize)) then 71 | if(iAllocSize>0) NoAlloc = iAllocSize 72 | endif 73 | 74 | ioSparseMat%NoRows=iNoRows 75 | ioSparseMat%NoCols=iNoCols 76 | ioSparseMat%Label=iLabel 77 | ioSparseMat%NoElements=0 78 | !Now allocate an initial size for the arrays holding the matrix 79 | allocate(ioSparseMat%ColIdxs(NoAlloc)) 80 | if (ioSparseMat%IsComplex) then 81 | allocate(ioSparseMat%cVals(NoAlloc)) 82 | else 83 | allocate(ioSparseMat%Vals(NoAlloc)) 84 | end if 85 | allocate(ioSparseMat%RowIdxs(iNoRows+1)) 86 | ioSparseMat%RowIdxs(:)=0 87 | end subroutine SMatCreate 88 | 89 | 90 | subroutine SMatWriteSparse(iA,iV,iNDigits,iName,iLabel) 91 | !Gianluca Fiandaca, April 2013, AU 92 | !This routine writes the sparse matrix iA in 3 different files, one containing 93 | !the matrix row indices, one containing the matrix column indices and one containing 94 | !the matrix values. The names of the output files are built concatenating the file name 95 | !iName and the words "RowIdxs.txt", "ColIdxs.txt" & "Values.txt" 96 | !IO 97 | ! iA -> Sparse matrix to be written 98 | ! iV -> RHS vector 99 | ! iName -> the name of the 3 output files 100 | ! iLabel -> Label to be written before the matrix 101 | ! iNDigits -> Number of digits after comma in the output format (in scientific notation) 102 | implicit none 103 | type (TSparseMat),intent(inout) :: iA 104 | real*8,dimension(:) :: iV 105 | integer ,intent(in), optional :: iNDigits 106 | character*(*) ,intent(in), optional :: iName 107 | character*(*) ,intent(in), optional :: iLabel 108 | 109 | integer :: i,j,k,Error 110 | integer :: NIntegerDigits,NScientificDigits 111 | integer,parameter :: F=99 112 | real*8 :: Val 113 | character*256 :: RowName,ColName,ValName,S,DefName,RHSName 114 | integer(kind=1) :: CSR 115 | 116 | !digits to be used in the formats 117 | if(present(iNDigits)) then 118 | NScientificDigits=iNDigits 119 | else 120 | NScientificDigits=4 121 | end if 122 | NIntegerDigits=nint(log10(1.d0*iA%NoElements))+1 123 | !names and legend 124 | if(present(iName)) then 125 | DefName=trim(iName)//'_Definition.txt' 126 | RHSName=trim(iName)//'_RHS.txt' 127 | RowName=trim(iName)//'_RowIdxs.txt' 128 | ColName=trim(iName)//'_ColIdxs.txt' 129 | ValName=trim(iName)//'_Values.txt' 130 | else 131 | DefName='Definition.txt' 132 | RHSName='RHS.txt' 133 | RowName='RowIdxs.txt' 134 | ColName='ColIdxs.txt' 135 | ValName='Values.txt' 136 | end if 137 | 138 | !Definition file 139 | open(unit=F,file=DefName,status='unknown') 140 | if(present(iLabel)) then 141 | write(F,'(A)') trim(iLabel) 142 | else 143 | write(F,'(A)') 144 | end if 145 | write(F,30) iA%NoRows,iA%NoCols,iA%NoElements,CSR 146 | close(F) 147 | 148 | 149 | !RHS 150 | open(unit=F,file=RhsName,status='unknown') 151 | if(present(iLabel)) then 152 | write(F,'(A)') trim(iLabel) 153 | else 154 | write(F,'(A)') 155 | end if 156 | do i=1,iA%NoCols 157 | write(F,10) iV(i) 158 | end do 159 | close(F) 160 | 161 | 162 | !Row indices 163 | open(unit=F,file=RowName,status='unknown') 164 | if(present(iLabel)) then 165 | write(F,'(A)') trim(iLabel) 166 | else 167 | write(F,'(A)') 168 | end if 169 | do i=1,iA%NoRows+1 170 | write(F,20) iA%RowIdxs(i) 171 | end do 172 | close(F) 173 | 174 | 175 | 176 | !Col indices 177 | open(unit=F,file=ColName,status='unknown') 178 | if(present(iLabel)) then 179 | write(F,'(A)') trim(iLabel) 180 | else 181 | write(F,'(A)') 182 | end if 183 | do i=1,iA%NoElements 184 | write(F,20) iA%ColIdxs(i) 185 | end do 186 | close(F) 187 | 188 | !Values 189 | open(unit=F,file=ValName,status='unknown') 190 | if(present(iLabel)) then 191 | write(F,'(A)') trim(iLabel) 192 | else 193 | write(F,'(A)') 194 | end if 195 | do i=1,iA%NoElements 196 | write(F,10) iA%Vals(i) 197 | end do 198 | close(F) 199 | 200 | return 201 | 30 format(4(1xi)) 202 | 20 format(3(1xi)) 203 | 10 format(1x1pe<7+NScientificDigits>.) 204 | end subroutine SMatWriteSparse 205 | 206 | 207 | subroutine SMatReadSparse(oA,oV, iFileName,BlockSize) 208 | !Gianluca Fiandaca, April 2013, AU 209 | !This routine reads a sparse matrix iA in 3 different files, one containing 210 | !the matrix row indices, one containing the matrix column indices and one containing 211 | !the matrix values. The names of the output files are built concatenating the file name 212 | !iName and the words "RowIdxs.txt", "ColIdxs.txt" & "Values.txt" 213 | !IO 214 | ! iA -> Sparse matrix to be written 215 | ! iName -> the name of the 3 output files 216 | ! iLabel -> Label to be written before the matrix 217 | ! BlockSize -> The size of each block in the matrix (Used for blocksplitting) 218 | !------------------------------------ 219 | !Updated Tue & Casper, September 2014. 220 | !The reader can now read both CSR and dense format 221 | !Furthermore the reader now assumes the file starts with a label line at the very top 222 | !------------------------------------ 223 | use mArrays 224 | implicit none 225 | type (TSparseMat),intent(out) :: oA 226 | real*8,dimension(:),pointer :: oV 227 | character*(*) ,intent(in), optional :: iFileName 228 | integer, intent(out) :: Blocksize 229 | integer i,j,k,Error 230 | integer,parameter :: F=99 231 | real*8 :: Val 232 | character*256 :: RowFileName,ColFileName,ValFileName,S,DefFileName,RHSFileName 233 | integer :: NoCols,NoRows,NoElements,IsCSR,Pos 234 | logical :: ErrorCode 235 | character*256 :: str 236 | 237 | !names and legend 238 | if(present(iFileName)) then 239 | DefFileName=trim(iFileName)//'_definition.txt' 240 | RHSFileName=trim(iFileName)//'_RHS.txt' 241 | RowFileName=trim(iFileName)//'_RowIdxs.txt' 242 | ColFileName=trim(iFileName)//'_ColIdxs.txt' 243 | ValFileName=trim(iFileName)//'_Values.txt' 244 | else 245 | DefFileName='Definition.txt' 246 | RHSFileName='RHS.txt' 247 | RowFileName='RowIdxs.txt' 248 | ColFileName='ColIdxs.txt' 249 | ValFileName='Values.txt' 250 | end if 251 | 252 | open(unit=F,file=DefFileName,status='old') 253 | do i=1,4 254 | read(F,'(A)') str ! read line 255 | str = adjustl(str) ! remove left blanks 256 | pos = scan(str,'!') 257 | if(pos.gt.0) then 258 | str = str(1:pos-1) ! extract the parameter value 259 | end if 260 | select case(i) 261 | case(1) 262 | read(str,*) NoRows 263 | case(2) 264 | read(str,*) NoCols 265 | case(3) 266 | read(str,*) NoElements 267 | case(4) 268 | read(str,*) Blocksize 269 | end select 270 | end do 271 | close(F) 272 | 273 | call SMatCreate(oA,NoRows,NoCols,'from file') 274 | call mArraysReallocateReals(oA%Vals,NoElements) 275 | call mArraysReallocateInts(oA%ColIdxs,NoElements) 276 | oA%NoRows=NoRows 277 | oA%NoCols=NoCols 278 | oA%NoElements=NoElements 279 | 280 | !Row indices 281 | open(unit=F,file=RowFileName,status='unknown') 282 | read(F,'(A)') S 283 | do i=1,NoRows+1 284 | read(F,'(A)') S 285 | read(S,*) oA%RowIdxs(i) 286 | end do 287 | close(F) 288 | 289 | !Col indices 290 | open(unit=F,file=ColFileName,status='unknown') 291 | read(F,'(A)') S 292 | do i=1,NoElements 293 | read(F,'(A)') S 294 | read(S,*) oA%ColIdxs(i) 295 | end do 296 | close(F) 297 | 298 | !Vals indices 299 | open(unit=F,file=ValFileName,status='unknown') 300 | read(F,'(A)') S 301 | do i=1,NoElements 302 | read(F,'(A)') S 303 | read(S,*) oA%Vals(i) 304 | end do 305 | close(F) 306 | 307 | !RHS 308 | allocate(oV(NoCols)) 309 | open(unit=F,file=RHSFileName,status='unknown') 310 | read(F,'(A)') S 311 | do i=1,NoCols 312 | read(F,'(A)') S 313 | read(S,*) oV(i) 314 | end do 315 | close(F) 316 | return 317 | 318 | end subroutine SMatReadSparse 319 | 320 | 321 | subroutine SMatDestroy(ioSparseMat) 322 | !Casper Kirkegaard, November 2009. 323 | !This routine destroys a TSparseMat structures, ie. 324 | !deallocates memory etc. 325 | ! 326 | !Updated 2014 Tue, Now it takes complex numbers as well 327 | ! 328 | !Updated Dec 2016 Kristoffer Andersen. Now also resets IsSorted and IsNativeCSR 329 | ! 330 | !IO 331 | ! ioSparseMat -> Sparse matrix to destroy 332 | implicit none 333 | type (TSparseMat),intent(inout) :: ioSparseMat 334 | !Implementation 335 | !Deallocate the coordinate form arrays(Always allocated) 336 | if (associated(ioSparseMat%Vals)) then 337 | deallocate(ioSparseMat%Vals) 338 | deallocate(ioSparseMat%ColIdxs) 339 | deallocate(ioSparseMat%RowIdxs) 340 | end if 341 | if (associated(ioSparseMat%cVals)) then 342 | deallocate(ioSparseMat%cVals) 343 | deallocate(ioSparseMat%ColIdxs) 344 | deallocate(ioSparseMat%RowIdxs) 345 | end if 346 | nullify(ioSparseMat%Vals) 347 | nullify(ioSparseMat%CVals) 348 | nullify(ioSparseMat%ColIdxs) 349 | nullify(ioSparseMat%RowIdxs) 350 | !Reset vars 351 | ioSparseMat%NoRows=0 352 | ioSparseMat%NoCols=0 353 | ioSparseMat%NoElements=0 354 | ioSparseMat%IsComplex=.false. 355 | end subroutine SMatDestroy 356 | 357 | subroutine SMatGetDiag(iA,oDiag,iEnsurePositiveDefinite) 358 | !Tue June 2015 359 | !This function should be called through its interface SMatGetDiag 360 | !This function returns the value of the diagonal elements in a matrix saved in CSR format 361 | !IO 362 | !iA -> input sparse matrix in CSR format 363 | !oDiag -> output oDiag(i) contains the inverse value of the i'th diagonal element in the CSR matrix given in iA 364 | !iEnsurePositiveDefinite -> (default = false) can be used to check if the diagonal is positive - NOT FULL POSITIVE DEFINITENESS 365 | ! 366 | !CK, october 2015 367 | !Changed oDiag to no longer be allocatable - it not possible to have an output array alloctable. 368 | ! 369 | ! KRA, Dec 2016 370 | ! Added Positive diagonal check and check of the size of oDiag compared to iA%NoRows 371 | ! 372 | implicit none 373 | type (TSparseMat),intent(inout) :: iA 374 | real*8,intent(out) :: oDiag(:) 375 | logical,intent(in),optional :: iEnsurePositiveDefinite 376 | integer :: i,k 377 | integer :: oError 378 | real*8 :: tmp 379 | character*256 :: S 380 | logical :: Error = .False. 381 | integer :: OldThreads 382 | logical :: CheckPositiveDefinite 383 | !allocate(oDiag(iA%NoRows)) 384 | if(size(oDiag,1).lt.iA%NoRows) goto 903 385 | if(present(iEnsurePositiveDefinite)) then 386 | CheckPositiveDefinite=iEnsurePositiveDefinite 387 | else 388 | CheckPositiveDefinite=.False. 389 | end if 390 | 391 | 392 | if(CheckPositiveDefinite) then 393 | !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i,tmp,oError) 394 | !$OMP DO SCHEDULE(STATIC) 395 | do i=1,iA%NoRows 396 | tmp=SMatGetValue(iA,i,i,oError) 397 | if (tmp.gt.0d0) then 398 | oDiag(i)=tmp 399 | else 400 | Error=.true. 401 | k=i 402 | end if 403 | end do 404 | !$OMP END DO 405 | !$OMP END PARALLEL 406 | else 407 | !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i,tmp,oError) 408 | !$OMP DO SCHEDULE(STATIC) 409 | do i=1,iA%NoRows 410 | oDiag(i)=SMatGetValue(iA,i,i,oError) 411 | end do 412 | !$OMP END DO 413 | !$OMP END PARALLEL 414 | end if 415 | 416 | call ReturnOldThreadNum() 417 | if (Error) goto 901 418 | return 419 | 901 continue 420 | print*, 'Critical failure in SMatGetDiag_real, matrix-diagonal is ',SMatGetValue(iA,k,k,oError),' at ',k 421 | 903 continue 422 | print*, 'Output array allocation too small in SMatGetDiag_real' 423 | End subroutine SMatGetDiag 424 | 425 | 426 | real*8 function SMatGetValue(iA,iRowNo,iColNo,oError) 427 | !Casper Kirkegaard, November 2009. 428 | !This routine locates and returns the value of a given row,col entry. 429 | !NOTE THAT THE MATRIX IS FINALIZED WHEN CALLING THIS PROCEDURE!!! 430 | !IO 431 | ! iA -> Sparse matrix to search for value 432 | ! iRowNo -> Index of the row to search for 433 | ! iColNo -> Index of the column to search for 434 | ! oError -> -1 if value does not exist, ie. entry equal to zero 435 | ! Positive value if the value entry exists. 436 | !IO parameters 437 | implicit none 438 | type (TSparseMat),intent(inout) :: iA 439 | integer,intent(in) :: iColNo 440 | integer,intent(in) :: iRowNo 441 | integer,intent(out) :: oError 442 | !Variable declarations 443 | integer :: CorColNo 444 | !Implementation 445 | !Apply column index correction if necessary. Also make sure to switch back the ColIdxCorrected status flag 446 | !at the end of this routine if this is the routine applying the correction. 447 | CorColNo=iColNo 448 | !Return the value 449 | call SMatLocateCSRIdx(iA,iRowNo,CorColNo,oError) 450 | if (oError.eq.-1) then 451 | SMatGetValue=0d0 452 | else 453 | SMatGetValue=iA%Vals(oError) 454 | end if 455 | 456 | !Switch back the col index correction lock if necessary. 457 | end function SMatGetValue 458 | 459 | subroutine SMatDAD(A,iD) 460 | type (TSparseMat),intent(inout) :: A 461 | real*8,dimension(:),intent(in) :: iD 462 | integer :: i,j 463 | do i=1,A%NoRows 464 | do j=A%RowIdxs(i),A%RowIdxs(i+1)-1 465 | A%Vals(j)=A%Vals(j)*iD(i)*iD(A%colIdxs(j)) 466 | end do 467 | end do 468 | 469 | end subroutine SMatDAD 470 | 471 | subroutine SMatLocateCSRIdx(iA,iRowNo,iColNo,oIndex,iFindNearest) 472 | !Casper Kirkegaard, November 2009. 473 | !This routine locates and returns the CSR index of a given row,col entry. 474 | !------------------------------------- 475 | !Updated September 2014, by Tue 476 | !Added the optional parameter iFindNearest. 477 | !------------------------------------ 478 | !IO 479 | ! iA -> Sparse matrix to search 480 | ! iRowNo -> Index of the row to search for 481 | ! iColNo -> Index of the column to search for 482 | ! oIndex -> Index in iA%Vals of the entry iRow,iColNo 483 | ! Set to -1 if the entry does not exits in the matrix 484 | ! iFindNearest -> optional parameter, which tells whether to return the nearest match if iIndex does not exist. 485 | ! default is 0, which returns -1 if index is not found 486 | ! if set to 1, it returns the nearest index searching forward 487 | ! if set to -1, it returns the nearest index searching backwards 488 | !IO parameters 489 | implicit none 490 | type (TSparseMat),intent(inout) :: iA 491 | integer,intent(in) :: iColNo 492 | integer,intent(in) :: iRowNo 493 | integer,intent(out) :: oIndex 494 | integer,intent(in),optional :: iFindNearest 495 | !Variable declarations 496 | integer :: CorColNo 497 | integer :: StartIndex,EndIndex 498 | integer :: FindNearest 499 | !Implementation 500 | if (present(iFindNearest)) then 501 | FindNearest = iFindNearest 502 | else 503 | FindNearest = 0 504 | end if 505 | !Apply column index correction if necessary. Also make to switch back the ColIdxCorrected status flag 506 | !at the end of this routine if this is the routine applying the correction. 507 | CorColNo=iColNo 508 | 509 | !Do an n*log(n) search by exploiting that the column indices of each 510 | !row are sorted by increasing value. Chop up the entire interval in halfs 511 | !and determine in which of the intervals the index lies. Continue until the value 512 | !is found or the width of the search interval is 1 and the values has not been found. 513 | StartIndex=iA%RowIdxs(iRowNo) 514 | EndIndex=iA%RowIdxs(iRowNo+1)-1 515 | call SMatLocateIdx(iA%ColIdxs,StartIndex,EndIndex,CorColNo,oIndex,FindNearest) 516 | !Switch back the col index correction lock if necessary. 517 | end subroutine SMatLocateCSRIdx 518 | 519 | subroutine SMatLocateIdx(iArray,iBegin,iEnd,iIndex,oIndex,iFindNearest) 520 | !Casper Kirkegaard, November 2009. 521 | !This routine locates the position of a given value in a sorted integer array. 522 | !------------------------------------- 523 | !Updated September 2014, by Tue 524 | !Added the optional parameter iFindNearest. 525 | !------------------------------------ 526 | !IO 527 | ! iArray -> Array to search through 528 | ! iBegin -> Index in iArray to start from 529 | ! iEnd -> Index in iArray to end at 530 | ! iIndex -> Integer to locate index in iArray for 531 | ! oIndex -> index in iarray of the value iIndex 532 | ! Set to -1 if the entry does not exits in the 533 | ! iFindNearest -> optional parameter, which tells whether to return the nearest match if iIndex does not exist. 534 | ! default is 0, which returns -1 if index is not found 535 | ! if set to 1, it returns the nearest index searching forward 536 | ! if set to -1, it returns the nearest index searching backwards 537 | !IO parameters 538 | implicit none 539 | integer, dimension(:),intent(in) :: iArray 540 | integer,intent(in) :: iBegin 541 | integer,intent(in) :: iEnd 542 | integer,intent(in) :: iIndex 543 | integer,intent(out) :: oIndex 544 | integer,intent(in),optional :: iFindNearest 545 | !Variable declarations 546 | integer :: i,j 547 | integer :: StartIndex 548 | integer :: MidIndex 549 | integer :: EndIndex 550 | real*8 :: Dummy 551 | integer :: BruteIndex 552 | logical :: Searching 553 | integer :: Count 554 | integer :: SearchForNearest 555 | !Implementation 556 | !BruteIndex=-1 557 | !do i=iA%RowIdxs(iRowNo),iA%RowIdxs(iRowNo+1)-1 558 | ! if (iA%ColIdxs(i).eq.iColNo) then 559 | ! BruteIndex=i 560 | ! end if 561 | !end do 562 | !Do a log(n) search by exploiting that the column indices of each 563 | !row are sorted by increasing value. Chop up the entire interval in halfs 564 | !and determine in which of the intervals the index lies. Continue until the value 565 | !is found or the width of the search interval is 1 and the values has not been found. 566 | if (present(iFindNearest)) then 567 | SearchForNearest = iFindNearest 568 | else 569 | SearchForNearest = 0 570 | end if 571 | Searching=.true. 572 | oIndex=-1 573 | StartIndex=iBegin 574 | EndIndex=iEnd 575 | !Don't search if the row is empty ... 576 | if (EndIndex.lt.StartIndex) Searching=.False. 577 | Count=1 578 | do while(Searching) 579 | Count=Count+1 580 | if (EndIndex-StartIndex.lt.2) then 581 | Searching=.false. 582 | do i=StartIndex,EndIndex 583 | if (iArray(i).eq.iIndex) then 584 | oIndex=i 585 | exit 586 | end if 587 | end do 588 | end if 589 | Dummy=(StartIndex+EndIndex)/2 590 | MidIndex=AnInt(Dummy) 591 | 592 | if (iArray(MidIndex).eq.iIndex) then 593 | Searching=.false. 594 | oIndex=MidIndex 595 | else 596 | if (iArray(MidIndex).gt.iIndex) then 597 | EndIndex=MidIndex 598 | else 599 | StartIndex=MidIndex 600 | end if 601 | end if 602 | end do 603 | !If the exact index was not found, do we look for nearest index? 604 | if ((oIndex.eq.-1).and.(SearchForNearest.ne.0)) then 605 | if (SearchForNearest.eq.1) then 606 | if (iArray(StartIndex).gt.iIndex) then 607 | oIndex=StartIndex 608 | else 609 | oIndex=EndIndex 610 | end if 611 | else if(SearchForNearest.eq.-1) then 612 | if (iArray(EndIndex).lt.iIndex) then 613 | oIndex=EndIndex 614 | else 615 | oIndex=StartIndex 616 | end if 617 | end if 618 | end if 619 | end subroutine SMatLocateIdx 620 | 621 | subroutine SMatGetDiagonalDominant(iA,oDiagDom) 622 | !Tue Dec, 2014 623 | !Determines the 624 | ! 625 | !IO 626 | !iA -> matrix in CSR format 627 | !oDiagDom -> on output the diagonal dominance of the matrix, if DiagDom<1 it is strictly diagonal dominant 628 | implicit none 629 | type (TSparseMat),intent(inout) :: iA 630 | real*8,intent(out) :: oDiagDom 631 | !Local vars 632 | integer :: i, j,k 633 | integer,allocatable,dimension(:) :: DiagIdxs 634 | real*8 :: Diag,RowVal,MinVal,Maxval 635 | character*256 :: S 636 | logical :: Error =.False. 637 | integer :: OldThreads 638 | allocate(DiagIdxs(iA%NoRows)) 639 | Call SMatGetDiagIdx(iA,DiagIdxs) 640 | oDiagDom=0 641 | !$OMP PARALLEL IF(StartOpenMP(2,1)) PRIVATE(I,J,Diag,RowVal) SHARED(DiagIdxs,iA,k,error) DEFAULT(NONE) REDUCTION(+:oDiagDom) 642 | !$OMP DO SCHEDULE(STATIC) 643 | do i=1,iA%NoRows 644 | Diag=(abs(iA%Vals(DiagIdxs(i)))) 645 | if(Diag.eq.0) then 646 | Error=.true. 647 | k=i 648 | end if 649 | RowVal=0d0 650 | do j=iA%RowIdxs(i),iA%RowIdxs(i+1)-1 651 | RowVal=RowVal+abs(iA%Vals(j)) 652 | end do 653 | if (Error) then 654 | !Set oDiagDom to something very large if the diagelement is 0. 655 | oDiagDom=50d0 656 | else 657 | oDiagDom=oDiagDom+(RowVal-Diag)/Diag 658 | end if 659 | end do 660 | !$OMP END DO 661 | !$OMP END PARALLEL 662 | oDiagDom=oDiagDom/((iA%NoRows-2)*0.5d0) 663 | call ReturnOldThreadNum() 664 | deallocate(DiagIdxs) 665 | return 666 | end subroutine SMatGetDiagonalDominant 667 | 668 | 669 | subroutine SMatAmultV(A,iV,oV) 670 | !Casper Kirkegaard, November 2009. 671 | !This routine calculates the product of the input matrix A with a vector iV 672 | !IO 673 | ! A -> Sparse A matrix 674 | ! iV -> Dense vector to multiply with 675 | ! oV -> Product A*iV 676 | !IO parameters 677 | implicit none 678 | type (TSparseMat),intent(inout) :: A 679 | real*8,dimension(:),intent(in) :: iV 680 | real*8,dimension(:),intent(out) :: oV 681 | !Variable declarations 682 | integer,Dimension(:),allocatable :: TempVals 683 | integer :: i,j 684 | integer :: AFirst,ALast 685 | integer :: RowStart 686 | integer :: RowEnd 687 | real*8 :: Sum 688 | integer :: OldThreads 689 | logical :: Dummy 690 | !Implementation 691 | dummy = StartOpenMP(2,1) 692 | call amux (A%NoRows, iV, oV, A%Vals,A%ColIdxs,A%RowIdxs) 693 | call ReturnOldThreadNum() 694 | end subroutine SMatAmultV 695 | 696 | subroutine amux (n, x, y, a,ja,ia) 697 | real*8 x(*), y(*), a(*) 698 | integer n, ja(*), ia(*) 699 | !c----------------------------------------------------------------------- 700 | !c A times a vector 701 | !c----------------------------------------------------------------------- 702 | !c multiplies a matrix by a vector using the dot product form 703 | !c Matrix A is stored in compressed sparse row storage. 704 | !c 705 | !c on entry: 706 | !c---------- 707 | !c n = row dimension of A 708 | !c x = real array of length equal to the column dimension of 709 | !c the A matrix. 710 | !c a, ja, 711 | !c ia = input matrix in compressed sparse row format. 712 | !c 713 | !c on return: 714 | !c----------- 715 | !c y = real array of length n, containing the product y=Ax 716 | !c 717 | !c----------------------------------------------------------------------- 718 | !c local variables 719 | !c 720 | real*8 t 721 | integer i, k 722 | !c----------------------------------------------------------------------- 723 | !call mkl_dcsrgemv('n', n, a, ia, ja, x, y) 724 | !return 725 | !$OMP PARALLEL SHARED(n, x, y, a,ja,ia) PRIVATE(t,i,k) 726 | !$OMP DO 727 | do i = 1,n 728 | t = 0 729 | do k=ia(i), ia(i+1)-1 730 | t = t + a(k)*x(ja(k)) 731 | end do 732 | y(i) = t 733 | end do 734 | !$OMP END DO 735 | !$OMP END PARALLEL 736 | !c---------end-of-amux--------------------------------------------------- 737 | !c----------------------------------------------------------------------- 738 | end 739 | 740 | subroutine smatgetUpperTriangular (ioA) 741 | ! Kristoffer Andersen, Dec 2016 742 | ! extracts the upper trangular part of ioA. 743 | type(tsparsemat),intent(inout) :: ioA 744 | !real*8 a(*), ao(*), t 745 | !integer ia(*), ja(*), iao(*), jao(*) 746 | !----------------------------------------------------------------------- 747 | ! On entry 748 | !----------- 749 | ! nrow = dimension of the matrix a. 750 | ! a, ja, 751 | ! ia = matrix stored in compressed row sparse format 752 | ! 753 | ! nzmax = length of arrays ao, and jao. 754 | ! 755 | ! On return: 756 | !----------- 757 | ! ao, jao, 758 | ! iao = lower part of input matrix (a,ja,ia) stored in compressed sparse 759 | ! row format format. 760 | ! 761 | ! ierr = integer error indicator. 762 | ! ierr .eq. 0 means normal return 763 | ! ierr .eq. i means that the code has stopped when processing 764 | ! row number i, because there is not enough space in ao, jao 765 | ! (according to the value of nzmax) 766 | ! 767 | !----------------------------------------------------------------------- 768 | !var 769 | integer :: ierr,ko,i,kold,kdiag,k,RowIDold 770 | !prog 771 | ko = 0 772 | do i=1, ioA%NoRows 773 | RowIdOld = ioA%RowIdxs(i) 774 | ioA%RowIdxs(i) = ko+1 775 | do k = RowIdOld, ioA%RowIdxs(i+1) -1 776 | if (ioA%ColIdxs(k) .ge. i) then ! upper trangle 777 | ko = ko+1 ! next new index 778 | ioA%Vals(ko) = ioA%Vals(k) ! store value 779 | ioA%ColIdxs(ko) = ioA%ColIdxs(k) 780 | endif 781 | enddo 782 | 783 | enddo 784 | 785 | ioA%RowIDxs(ioA%NoRows+1) = ko+1 786 | ioA%NoElements = ko 787 | return 788 | end subroutine smatgetUpperTriangular 789 | 790 | 791 | 792 | subroutine SMatGetInvDiag(iA,oInvDiag) 793 | !Tue September 2014 794 | !This function should be called through its interface SMatGetInvDiag 795 | !This function returns the inverse value of the diagonal elements in a matrix saved in CSR format 796 | !IO 797 | !iA -> input sparse matrix in CSR format 798 | !oInvDiag -> output oInvDiag(i) contains the inverse value of the i'th diagonal element in the CSR matrix given in iA 799 | ! 800 | !CK, october 2015 801 | !Removed allocatable property on oInvDiag. Cannot allocate and pass an allocatable outside a subroutine. 802 | implicit none 803 | type (TSparseMat),intent(inout) :: iA 804 | real*8,intent(out) :: oInvDiag(:) 805 | !var 806 | integer :: i,k 807 | integer :: oError 808 | real*8 :: tmp 809 | character*256 :: S 810 | logical :: Error 811 | integer :: OldThreads 812 | Error = .False. 813 | !allocate(oInvDiag(iA%NoRows)) 814 | !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i,tmp,oError) 815 | !$OMP DO SCHEDULE(STATIC) 816 | do i=1,iA%NoRows 817 | tmp=SMatGetValue(iA,i,i,oError) 818 | if (abs(tmp).gt.0d0) then 819 | oInvDiag(i)=1/tmp 820 | else 821 | !!$OMP CRITICAL (Errorfound) ! might be neccesary 822 | Error=.true. 823 | k=i 824 | !!$OMP END CRITICAL (Errorfound) 825 | end if 826 | end do 827 | !$OMP END DO 828 | !$OMP END PARALLEL 829 | call ReturnOldThreadNum() 830 | return 831 | End subroutine SMatGetInvDiag 832 | 833 | double precision function ddot(n,dx,incx,dy,incy) 834 | double precision dx(*),dy(*),dtemp 835 | integer i,incx,incy,ix,iy,m,mp1,n 836 | ddot = 0.0d0 837 | dtemp = 0.0d0 838 | if(n<=0)return 839 | if(incx==1.AND.incy==1)go to 20 840 | ix = 1 841 | iy = 1 842 | if(incx<0)ix = (-n+1)*incx + 1 843 | if(incy<0)iy = (-n+1)*incy + 1 844 | do 10 i = 1,n 845 | dtemp = dtemp + dx(ix)*dy(iy) 846 | ix = ix + incx 847 | iy = iy + incy 848 | 10 continue 849 | ddot = dtemp 850 | return 851 | 852 | 20 m = mod(n,5) 853 | if( m == 0 ) go to 40 854 | do 30 i = 1,m 855 | dtemp = dtemp + dx(i)*dy(i) 856 | 30 continue 857 | if( n < 5 ) go to 60 858 | 40 mp1 = m + 1 859 | do 50 i = mp1,n,5 860 | dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) + dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4) 861 | 50 continue 862 | 60 ddot = dtemp 863 | return 864 | end 865 | 866 | subroutine SMatGetDiagIdx(iA,oDiagIdxs) 867 | !Tue September 2014 868 | !This function should be called through its interface SMatGetDiagIdx 869 | !This function returns the indices for the diagonal elements in a matrix saved in CSR format 870 | !CK, october 2015 871 | !Changed oDiagIdxs to no longer be allocatable - it not possible to have an output array alloctable. 872 | !IO 873 | !iA -> input sparse matrix in CSR format 874 | !SMatGetDiagIdx_type -> output SMatGetDiagIdx_type(i) contains the indices for the i'th diagonal element in the CSR matrix given in iA 875 | implicit none 876 | type (TSparseMat),intent(inout) :: iA 877 | integer,intent(out) :: oDiagIdxs(:) 878 | integer :: i 879 | integer :: OldThreads 880 | 881 | !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i) 882 | !$OMP DO SCHEDULE(STATIC) 883 | do i=1,iA%NoRows 884 | call SMatLocateCSRIdx(iA,i,i,oDiagIdxs(i)) 885 | end do 886 | !$OMP END DO 887 | !$OMP END PARALLEL 888 | call ReturnOldThreadNum() 889 | End subroutine SMatGetDiagIdx 890 | 891 | subroutine SMatGetBlockRowIdx(iA,iBlocks,oBRowIdxs) 892 | !Tue September 2014 893 | !This routine finds the first and last element in each row which are within the blocks given in iBlocks. 894 | !In other words, this routine is used to sort out the elements which are not used in a blocksplitting of the matrix. 895 | !CK, October 2015 896 | !Removed allocation of browidxs - the allocation hasto be performed from outside not to cause data corruption. 897 | !IO 898 | !iA -> input sparse matrix in CSR format 899 | !iBlocks -> The blocksplitting we wish to use on iA. 900 | !oBRowIdxs -> oBRowIdxs(i,1) is the first indices to first element in row i within the block, oBRowIdxs(i,2) is the last element. 901 | implicit none 902 | type (TSparseMat),intent(inout) :: iA 903 | integer,dimension(:,:),intent(in):: iBlocks 904 | integer, intent(out) :: oBRowIdxs(:,:) 905 | integer :: i,k 906 | integer :: threads 907 | integer :: OldThreads 908 | !allocate(oBRowIdxs(iA%NoRows,2)) 909 | threads = size(iBlocks,1) 910 | do k=1,threads 911 | !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i) 912 | !$OMP DO SCHEDULE(STATIC) 913 | do i = iBlocks(k,1),iBlocks(k,2) 914 | call SMatLocateCSRIdx(iA,i,iBlocks(k,1),oBRowIdxs(i,1),1) 915 | call SMatLocateCSRIdx(iA,i,iBlocks(k,2),oBRowIdxs(i,2),-1) 916 | end do 917 | !$OMP END DO 918 | !$OMP END PARALLEL 919 | end do 920 | call ReturnOldThreadNum() 921 | End subroutine SMatGetBlockRowIdx 922 | 923 | 924 | function SMatFindMaxColDif(iA) 925 | !Tue September 2014 926 | !This function finds the maximum distance two elements can be apart in the same row in the matrix 927 | !IO 928 | !iA -> input sparse matrix in CSR format 929 | !FindMaxColDif -> output FindMaxColDif contains the maximum difference between the first and last element in each row across all rows. 930 | implicit none 931 | type (TSparseMat),intent(in) :: iA 932 | integer :: SMatFindMaxColDif 933 | integer :: i,j 934 | SMatFindMaxColDif=0 935 | do i=1,iA%NoRows 936 | SMatFindMaxColDif=max(SMatFindMaxColDif,(iA%ColIdxs(iA%RowIdxs(i+1)-1)-iA%ColIdxs(iA%RowIdxs(i))+1)) 937 | end do 938 | end function SMatFindMaxColDif 939 | 940 | 941 | 942 | end module mSMat 943 | 944 | -------------------------------------------------------------------------------- /Technical_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tueboesen/Sparse-iterative-parallel-linear-solver/fe82bd08e2050716c2ed43996337d5e8f4723c44/Technical_report.pdf --------------------------------------------------------------------------------