├── Example.f90
├── LICENSE
├── Load.f90
├── Matrix
    ├── 001_ColIdxs.txt
    ├── 001_Definition.txt
    ├── 001_RHS.txt
    ├── 001_RowIdxs.txt
    ├── 001_Values.txt
    ├── Matrix_Spy_Map.png
    └── Settings.ini
├── Misc.f90
├── Parallelization.f90
├── Quicksort.f90
├── RCM.f90
├── Readme.MD
├── Settings.ini
├── Solver.f90
├── SparseMatrix.f90
└── Technical_report.pdf


/Example.f90:
--------------------------------------------------------------------------------
 1 |     !MIT License
 2 |     !
 3 |     !Copyright (c) [2017] [HydroGeophysics Group, Aarhus University, Denmark]
 4 |     !
 5 |     !Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     !of this software and associated documentation files (the "Software"), to deal
 7 |     !in the Software without restriction, including without limitation the rights
 8 |     !to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     !copies of the Software, and to permit persons to whom the Software is
10 |     !furnished to do so, subject to the following conditions:
11 |     !
12 |     !The above copyright notice and this permission notice shall be included in all
13 |     !copies or substantial portions of the Software.
14 |     !
15 |     !THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     !IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     !FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     !AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     !LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     !OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     !SOFTWARE.
22 | 
23 |     !This is a Simple example that shows the sparse Iterative Solver from Aarhus Hydrogeogphysical Group in action.
24 |     !This example loads a linear system from txt files in CSR format, it loads an ini-file with various settings, and then it solves the linear system
25 |     program Solver_example
26 |     use mParallel
27 |     use mSolver
28 |     use mSMat
29 |     use mInput
30 |     implicit none
31 |     integer :: AffinityMode,NCPUs,UseNested,StartThread,NCPUsLow,NCPUInner,NCPUOuter,NCPUInnerLow,AffinityPattern
32 |     integer :: n,i,nCols,nRows,UseRCM
33 |     real*8, pointer :: rhs(:),x(:),rhs_backup(:),Residual(:)
34 |     character*256 :: SettingFileName,SparseMatrixFiles,OutFile
35 |     CHARACTER(len=256) :: arg
36 |     type(TSparseMat) :: A
37 |     type(Tsettings)  :: Set
38 |     !Solver vars
39 |     integer :: MaxIter, NmodPar, NoRhs, Solverinformation, solvertype,UseNormalization
40 |     real*8  :: FillingFactor,error,t1,t2
41 |     type(TSolverSettings) :: SolverSettings
42 | 
43 |     !First we start by loading the additional arguments, which should have been provided with the call. 
44 |     CALL get_command_argument(1, arg) !Argument 1 is the name of the settingsfile
45 |     IF (LEN_TRIM(arg) == 0) then
46 |       print*,'ERROR not enough arguments given'  
47 |     end if
48 |     read(arg,*) SettingFileName
49 |   
50 |     CALL get_command_argument(2, arg) !Argument 2 is the name of the SparseMatrix files.
51 |     IF (LEN_TRIM(arg) == 0) then
52 |       print*,'ERROR not enough arguments given'  
53 |     end if
54 |     read(arg,*) SparseMatrixFiles
55 | 
56 |     CALL get_command_argument(3, arg) !Argument 3 is the name of the output file.
57 |     IF (LEN_TRIM(arg) == 0) then
58 |       print*,'ERROR not enough arguments given'  
59 |     end if
60 |     read(arg,*) OutFile
61 |     
62 |     print*,'SettingFileName:',SettingFileName
63 |     print*,'SparseMatrixFiles:',SparseMatrixFiles
64 |     print*,'OutFile',OutFile
65 |     call LoadSettings(SettingFileName,set)
66 |     
67 |     !Before we do anything else openMp related, we call initOpenMp()
68 |     call InitOpenMP(set%AffinityMode,set%AffinityPattern,set%StartThread,set%UseNested,set%NCPUs,set%NCPUsLow,set%NCPUOuter)
69 | 
70 |     !Now lets create a sparse linear system to work on
71 |     call SMatReadSparse(A,rhs, SparseMatrixFiles,Set%BlockSize) !Read the sparsematrix files into memory
72 |     allocate(x(A%norows))
73 |     allocate(rhs_backup(A%noRows))
74 |     allocate(Residual(A%noRows))
75 |     rhs_backup=rhs
76 |     UseRCM=0
77 |     x=0
78 |     t1=omp_get_wtime() !Start a timer
79 |     call SetSolverSettings(SolverSettings,A,UseRCM,set%UseNormalization,set%SolverType,set%MaxIter,set%blocksize,set%FillingFactor) !Set the solver and create the preconditioner factorization
80 |     call SingleRHS_wrapper(A,rhs,x,SolverSettings) !Propagate the actual linear system
81 |     t2=omp_get_wtime() 
82 |     call SolverNormalizeMatrix(solversettings,A,.false.) !Renormalize matrix A (This is only needed because we want to validate that the solution did indeed solve the initial matrix).
83 |     
84 |     call SMatAmultV(A,x,rhs) !Show that the solution we found is indeed a solution to the linear system by multiplying the solution on the matrix
85 |     Residual=rhs-rhs_backup
86 |     Error=sqrt(ddot(A%NoRows,Residual,1,Residual,1))
87 |     print*,'Error',Error
88 |     print*,'Time for solve',t2-t1
89 |     open(78,file=trim(OutFile),position='append')
90 |     write(78,*) t2-t1
91 |     write(78,*) solversettings%output%status
92 |     write(78,*) Error
93 |     write(78,*) omp_get_max_threads()
94 |     write(78,*) set%SolverType
95 |     close(78)
96 |     
97 |     end program Solver_example
98 | 
99 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Tue Boesen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Load.f90:
--------------------------------------------------------------------------------
 1 | module mInput
 2 |     implicit none 
 3 |     !Loads the settings used by the parallelization framework and the iterative solver      
 4 |   type Tsettings
 5 |     integer :: AffinityMode
 6 |     integer :: AffinityPattern
 7 |     integer :: NCPUs
 8 |     integer :: UseNested
 9 |     integer :: StartThread
10 |     integer :: NCPUsLow
11 |     integer :: NCPUOuter
12 |     integer :: NCPUInner
13 |     integer :: NCPUInnerLow
14 |     integer :: SolverType
15 |     integer :: MaxIter
16 |     integer :: BlockSize
17 |     integer :: UseNormalization
18 |     real*8  :: FillingFactor
19 |   end type Tsettings
20 |     contains
21 |     
22 |     subroutine LoadSettings(FName,Settings)
23 |     character*256,intent(in)           :: Fname 
24 |     type(Tsettings),intent(inout)      :: Settings
25 |     integer :: Nsettings,i,j,f,pos    
26 |     character*256 :: str,str2
27 |     F=55
28 |     Nsettings=13
29 |     open(unit=F,file=FName,status='old')
30 |     do i=1,Nsettings  
31 |       read(F,'(A)') str ! read line
32 |       str = adjustl(str) ! remove left blanks
33 |       pos = scan(str,'!') 
34 |       if(pos.gt.0) then
35 |         str = str(1:pos-1)  ! extract the parameter value
36 |       end if
37 |       select case(i)    
38 |       case(2)  
39 |         read(str,*) Settings%AffinityMode 
40 |       case(3)  
41 |         read(str,*) Settings%AffinityPattern
42 |       case(4)  
43 |         read(str,*) Settings%UseNested
44 |       case(5)  
45 |         read(str,*) Settings%StartThread 
46 |       case(6)  
47 |         read(str,*) Settings%NCPUs 
48 |       case(7)  
49 |         read(str,*) Settings%NCPUsLow 
50 |       case(8)  
51 |         read(str,*) Settings%NCPUOuter 
52 |       case(9)  
53 |         read(str,*) Settings%SolverType 
54 |       case(10)  
55 |         read(str,*) Settings%MaxIter 
56 |       case(11)  
57 |         read(str,*) Settings%BlockSize 
58 |       case(12)  
59 |         read(str,*) Settings%FillingFactor 
60 |       case(13)  
61 |         read(str,*) Settings%UseNormalization 
62 |       end select
63 |     end do
64 |     close(F)  
65 |     end subroutine LoadSettings
66 |     
67 |     end module mInput


--------------------------------------------------------------------------------
/Matrix/001_Definition.txt:
--------------------------------------------------------------------------------
1 | 42332    !Number of rows       
2 | 42332    !Number of columns    
3 | 545828   !Number of elements   
4 | 38       !Sounding size         
5 | 


--------------------------------------------------------------------------------
/Matrix/Matrix_Spy_Map.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tueboesen/Sparse-iterative-parallel-linear-solver/fe82bd08e2050716c2ed43996337d5e8f4723c44/Matrix/Matrix_Spy_Map.png


--------------------------------------------------------------------------------
/Matrix/Settings.ini:
--------------------------------------------------------------------------------
 1 | Example of an ini file - this is skipped during the load and can contain anything
 2 | -99      !AffinityMode
 3 | -99      !AffinityPattern
 4 | 0        !UseNested
 5 | -99      !StartThread
 6 | 0        !NCPUs
 7 | -99      !NCPUsLow
 8 | -99      !NCPUOuter
 9 | -1       !SolverType
10 | -1       !Max number of iterations to run during the propagation
11 | 1        !Blocksize, Each block will be a modulo of this size 
12 | -1       !FillingFactor, A factor that scales the number of elements saved in each row during the IC factorization
13 | 1        !Determines whether to normalize the matrix before solving it.


--------------------------------------------------------------------------------
/Misc.f90:
--------------------------------------------------------------------------------
  1 | Module mMisc
  2 |   !Tue a module for small usefull routines that doesn't fit in anywhere else, but is used through different modules
  3 |   implicit none
  4 |   interface LoadOptionalParam
  5 |     module procedure LoadOptionalParam_int, LoadOptionalParam_real, LoadOptionalParam_real2
  6 |   end interface
  7 |   contains
  8 | 
  9 |   Function LoadOptionalParam_int(DefaultValue,iParam)
 10 |     !Tue Dec, 2014
 11 |     !This function is used to load optional parameters
 12 |     !IO
 13 |     !DefaultValue -> A default value which we load if the optional parameter does not exist.
 14 |     !iParam       -> the optional parameter that we want to load
 15 |     implicit none
 16 |     integer, intent(in)          :: DefaultValue
 17 |     integer, intent(in),optional :: iParam
 18 |     integer                      :: LoadOptionalParam_int
 19 |     if(present(iParam)) then
 20 |         LoadOptionalParam_int=iParam
 21 |       else 
 22 |         LoadOptionalParam_int=DefaultValue
 23 |       end if
 24 |   end function LoadOptionalParam_int
 25 |   Function LoadOptionalParam_real(DefaultValue,iParam)
 26 |     !Tue Dec, 2014
 27 |     !This function is used to load optional parameters
 28 |     !IO
 29 |     !DefaultValue -> A default value which we load if the optional parameter does not exist.
 30 |     !iParam       -> the optional parameter that we want to load
 31 |     implicit none
 32 |     real*8, intent(in)          :: DefaultValue
 33 |     real*8, intent(in),optional :: iParam
 34 |     real*8                      :: LoadOptionalParam_real
 35 |     if(present(iParam)) then
 36 |         LoadOptionalParam_real=iParam
 37 |       else 
 38 |         LoadOptionalParam_real=DefaultValue
 39 |       end if
 40 |   end function LoadOptionalParam_real
 41 |   Function LoadOptionalParam_real2(DefaultValue,iParam)
 42 |     !Tue Dec, 2014
 43 |     !This function is used to load optional parameters
 44 |     !IO
 45 |     !DefaultValue -> A default value which we load if the optional parameter does not exist.
 46 |     !iParam       -> the optional parameter that we want to load
 47 |     implicit none
 48 |     integer, intent(in)          :: DefaultValue
 49 |     real*8, intent(in),optional :: iParam
 50 |     real*8                      :: LoadOptionalParam_real2
 51 |     if(present(iParam)) then
 52 |         LoadOptionalParam_real2=iParam
 53 |       else 
 54 |         LoadOptionalParam_real2=DefaultValue
 55 |       end if
 56 |   end function LoadOptionalParam_real2
 57 |     end module mMisc
 58 | 
 59 |     
 60 |     module mArrays
 61 |    interface ArraysCheckAndIncAlloc
 62 |     module procedure mArraysCheckAndIncAllocReal, mArraysCheckAndIncAllocInt
 63 |    end interface
 64 |    
 65 |   contains  
 66 |     subroutine mArraysReallocateInts(iArray,iNewSize)
 67 |     implicit none
 68 |     integer,dimension(:),pointer,intent(inout) :: iArray
 69 |     integer,intent(in)                         :: iNewSize
 70 |     integer,dimension(:),pointer               :: TempArray 
 71 |     integer                                    :: CurSize 
 72 |     integer                                    :: TransferSize 
 73 |     integer                                    :: Error
 74 |     character*256                              :: S
 75 |     !Find the size to transfer to the resized array to be allocated.
 76 |     CurSize=Size(iArray)
 77 | 
 78 |     if(CurSize==0) then
 79 |       !goto 902
 80 |       !CS: Below: in case we don't want an error message
 81 |       allocate(iArray(iNewSize),Stat=Error)
 82 |       if (Error.ne.0)&
 83 |         goto 900
 84 |       return
 85 |     end if  
 86 | 
 87 |     if (CurSize.ne.iNewsize) then
 88 |       TransferSize=CurSize
 89 |       if (TransferSize.gt.iNewSize)&
 90 |         TransferSize=iNewSize      
 91 |       !Allocate temporary arrays.
 92 |       allocate(TempArray(TransferSize),Stat=Error) 
 93 |       if (Error.ne.0)&
 94 |         goto 900
 95 |       !Transfer values to temporary arrays - first the real*8 array holding the values.    
 96 |       TempArray(1:TransferSize)=iArray(1:TransferSize)
 97 |       !Deallocate old array
 98 |       if (associated(iArray))&
 99 |         deallocate(iArray)
100 |       !Allocate new array of updated length and transfer the old values
101 |       allocate(iArray(iNewSize),Stat=Error) 
102 |       if (Error.ne.0)&
103 |         goto 900
104 |       if (TransferSize+1.le.iNewSize) then
105 |         iArray(TransferSize+1:iNewSize)=0
106 |       end if
107 |       iArray(1:TransferSize)=TempArray(1:TransferSize)  
108 |       deallocate(TempArray)  
109 |     end if
110 |     return
111 | 900 continue
112 |     S='Error re-allocating memory. Please increase the free physical memory or decrease the number of data.'
113 |      go to 999
114 | 901 continue
115 |     S='Error de-allocating memory. Please increase the free physical memory or decrease the number of data.'
116 |      go to 999
117 | 902 continue
118 |     S='Error in mArraysReallocateInts iArray array has a 0 size at the entry of the subroutine.'
119 |      go to 999
120 | 999 print*, S    
121 |     end subroutine mArraysReallocateInts
122 |       
123 | !******************************************************************************      
124 |     subroutine mArraysReallocateReals(iArray,iNewSize)
125 |     implicit none
126 |     real*8,dimension(:),pointer,intent(inout) :: iArray
127 |     integer,intent(in)                        :: iNewSize
128 |     real*8,dimension(:),pointer               :: TempArray 
129 |     integer                                   :: CurSize 
130 |     integer                                   :: TransferSize 
131 |     integer                                   :: Error
132 |     character*256                             :: S
133 |     !Find the size to transfer to the resized array to be allocated.
134 |     CurSize=Size(iArray)
135 | 
136 |     if(CurSize==0) then
137 |       !goto 902
138 |       !CS: Below: in case we don't want an error message
139 |       allocate(iArray(iNewSize),Stat=Error)
140 |       iArray(:)=0.d0
141 |       if (Error.ne.0)&
142 |         goto 900
143 |       return
144 |     end if  
145 | 
146 |     if (CurSize.ne.iNewsize) then
147 |       TransferSize=CurSize
148 |       if (TransferSize.gt.iNewSize)&
149 |         TransferSize=iNewSize      
150 |       !Allocate temporary arrays.
151 |       allocate(TempArray(TransferSize),Stat=Error) 
152 |       if (Error.ne.0)&
153 |         goto 900
154 |       !Transfer values to temporary arrays - first the real*8 array holding the values.    
155 |       TempArray(1:TransferSize)=iArray(1:TransferSize)
156 |       !Deallocate old array
157 |       if (associated(iArray)) then
158 |         deallocate(iArray,Stat=Error)
159 |         if (Error.ne.0)&
160 |           goto 901
161 |       end if  
162 |       !Allocate new array of updated length and transfer the old values
163 |       allocate(iArray(iNewSize),Stat=Error) 
164 |       iArray(:)=0.d0
165 |       if (Error.ne.0)&
166 |         goto 900
167 |       iArray(1:TransferSize)=TempArray(1:TransferSize) 
168 |       deallocate(TempArray,Stat=Error) 
169 |       if (Error.ne.0)&
170 |         goto 901  
171 |     end if
172 |     return
173 | 900 continue
174 |     S='Error re-allocating memory. Please increase the free physical memory or decrease the number of data.'
175 |      go to 999
176 | 901 continue
177 |     S='Error de-allocating memory. Please increase the free physical memory or decrease the number of data.'
178 |      go to 999
179 | 902 continue
180 |     S='Error in mArraysReallocateReals iArray array has a 0 size at the entry of the subroutine.'
181 |      go to 999
182 | 999 print*, S    
183 |     end subroutine mArraysReallocateReals
184 | 
185 |         subroutine mArraysCheckAndIncAllocReal(Nnew,ioAR)
186 | ! This routine increases the allocation of ioAR if
187 | ! Nnew is larger than the allocation of ioAR.
188 | ! The Nnew allocation will be 2*Nnew to avoid future allocations
189 | ! No data are transferred !    
190 |     integer,intent(in) :: Nnew
191 |     real*8,allocatable,intent(inout) :: ioAR(:)
192 |     integer :: k
193 |     if (allocated(ioAR)) then
194 |         K=ubound(ioAR,dim=1)
195 |         if (k.lt.Nnew) then
196 |             deallocate(ioAR)  
197 |             allocate(ioAR(2*Nnew+1))
198 |         end if    
199 |     else
200 |         allocate(ioAR(2*Nnew))
201 |     end if
202 |     end subroutine mArraysCheckAndIncAllocReal
203 |     
204 | 	!dir$ attributes offload : mic :: marrayscheckAnsIncAllocInt
205 |     subroutine mArraysCheckAndIncAllocInt(Nnew,ioAR)
206 | ! This routine increases the allocation of ioAR if
207 | ! Nnew is larger than the allocation of ioAR.
208 | ! The Nnew allocation will be 2*Nnew to avoid future allocations
209 | ! No data are transferred !    
210 |     integer,intent(in) :: Nnew
211 |     integer,allocatable,intent(inout) :: ioAR(:)
212 |     integer :: k
213 |     if (allocated(ioAR)) then
214 |         K=ubound(ioAR,dim=1)
215 |         if (k.lt.Nnew) then
216 |             deallocate(ioAR)  
217 |             allocate(ioAR(2*Nnew+1))
218 |         end if    
219 |     else
220 |         allocate(ioAR(2*Nnew))
221 |     end if
222 |     end subroutine mArraysCheckAndIncAllocInt    
223 | end module mArrays    


--------------------------------------------------------------------------------
/Parallelization.f90:
--------------------------------------------------------------------------------
  1 | Module mParallel
  2 |   !Tue Boesen 2017  
  3 |   !This module is meant to work as a general parallelization framework, and is meant to replace all other parallelization existing in a code.
  4 |   !The general structure of the code is as follows:
  5 |   !On initialization the subroutine InitOpenMP() should be called, it is imperative that this routine is called before any OpenMP routines are called - even omp_get_wtime()
  6 |   !When creating parallel regions anywhere in the code, these regions should in general always be created with an if call to StartOpenMP 
  7 |   use omp_lib
  8 |   use IFPORT, only: SETENVQQ
  9 |   implicit none
 10 | 
 11 |   !Module variables - These variables changes dynamically during a run  
 12 |   logical,private :: RunningSingleParallelRegion !Used to lock against further parallelization while running a nonnested region.
 13 |   integer,private :: OldThreadNumber             !Saves the previous threadnumber so we can return to this number afterwards if desired.
 14 |   !Initial settings - These variables get set in InitOpenMP, and then remain static after that
 15 |   integer,private :: UseNested                   !Use nested parallelization?   
 16 |   integer,private :: NCPUs                       !Total number of threads
 17 |   integer,private :: NCPUsLow                    !Total number of threads in a memory bandwidth intense region  
 18 |   integer,private :: NCPUOuter                   !Number of CPUs in the outer region of a nested parallelization
 19 |   integer,private :: NCPUInner                   !Number of CPUs in the inner region of a nested parallelization
 20 |   integer,private :: NCPUInnerLow                !Number of CPUs in the inner region of a nested parallelization with memory bandwith limits
 21 |   integer,private :: AffinityMode                !Dedicated or shared server?
 22 |   integer,private :: AffinityPattern             !Scattered or compact affinity
 23 |   integer,private :: StartThread                 !ThreadID of first affinity binding
 24 |   contains
 25 | 
 26 |   subroutine InitOpenMP(iAffinityMode,iAffinityPattern,iStartThread,iUseNested, iNCPUs,iNCPUsLow,iNCPUOuter)
 27 |     !This is the initialization routine for OpenMP. 
 28 |     !It NEEDS to be run before any OpenMP routines are called - even omp_get_wtime().
 29 |     !This routine setup the entire parallel framework. 
 30 |     !Based on the input variables it determines the number of threads to use, the affinity which they should be used with, the parallelization structure 
 31 |     !On output this routine will have set all the following private module variables: NCPUs, NCPUInner, NCPUOuter, NCPUInnerLow, NCPUsLow, AffinityMode, UseNested, StartThread.
 32 |     !All these variables will be used by the subroutine StartOpenMP, which should be called everytime a parallel region is potentially about to be created.
 33 |     
 34 |     !IO
 35 |       !Note any variable can be set to -99 for automatic determination. 
 36 |       !AffinityMode    -> Specifies different modes thread binding can occur. 
 37 |       !  0=shared mode. 
 38 |       !                 Threads bind to a socket, but are free to migrate between the cores of a socket.
 39 |       !                 This mode is usefull when running on a server with multiple users, however,
 40 |       !                 performance can also be lower than the dedicated mode. Shared mode does not use nested parallelization in any way.
 41 |       !                 Note if all CPU's are used, AffinityMode will automatically be changed from shared to dedicated
 42 |       !  1=Dedicated mode. 
 43 |       !                 Threads bind to individual cores, and hence are locked in place. This gives better performance, but only if the cores are not used for anything else.  
 44 |       !  2=manual mode. No affinity settings are made inside the program. It is left to the user to set proper environment variables, before starting
 45 |       !                 the program.
 46 |       !iAffinityPattern -> Sets the affinity pattern for the threadbinding. This is mostly relevant on NUMA systems, or when using nested parallelization.
 47 |       !  0=Scatter threads.
 48 |       !                 Threads are distributed as far apart as possible.
 49 |       !  1=Compact threads.
 50 |       !                 Threads are clustered together.  
 51 |       !iStartThread    -> The threadID the first thread is bound to. 
 52 |       !iUseNested      -> Determines whether to enable nested parallelization or not                   (0=no nested parallelization allowed,1=Nested parallelization enabled)
 53 |       !iNCPUs          -> Total number of CPUs to use                                                  (0=all, -1=all-1, -2=all-2, ect.)  
 54 |       !iNCPUsLow       -> Total number of CPUs to use in a bandwidth intensive regions                 (0=all, -1=all-1, -2=all-2, ect.)  
 55 |     !The following are only relevant if nested parallelization is used.                                
 56 |       !iNCPUOuter      -> Number of CPUs to use in outer parallel region.                              (Should be set between 2..and Number of numanodes)                               
 57 |     use ifwin
 58 |     implicit none
 59 |     integer,intent(in) :: iAffinityMode,iAffinityPattern,iStartThread,iUseNested,iNCPUs,iNCPUsLow,iNCPUOuter
 60 |     !Local vars
 61 |     logical(4)           :: Success
 62 |     integer              :: i,j
 63 |     integer              :: numaNodeCount,processorPackageCount,processorCoreCount,logicalProcessorCount
 64 |     integer,dimension(3) :: processorCacheCount
 65 |     real*8               :: rand
 66 |     logical              :: Debug = .false. !Only used for debugging, to see how the threads are bound and what was detected.
 67 |     character(1024)      :: KmpAff          
 68 |     !Vars for manual binding
 69 |     integer(HANDLE) Process
 70 |     integer(DWORD_PTR) AffinityMask
 71 |     integer Core
 72 |     integer(BOOL) Retval
 73 |     integer(DWORD) iError
 74 |     
 75 |       call GetMachineArch(numaNodeCount,processorPackageCount,processorCoreCount, logicalProcessorCount,processorCacheCount ) !Get machine architecture
 76 |       if(Debug) then
 77 |         print*, "GetLogicalProcessorInformation results:"
 78 |         print*, "  Number of NUMA nodes: ", numaNodeCount
 79 |         print*, "  Number of physical processor packages: ", processorPackageCount
 80 |         print*, "  Number of processor cores: ", processorCoreCount
 81 |         print*, "  Number of logical processors: ", logicalProcessorCount
 82 |         print*, "  Number of processor L1/L2/L3 caches: ",processorCacheCount    
 83 |       end if
 84 |       
 85 |       !We set StartThread
 86 |       StartThread = iStartThread
 87 |       if (StartThread.le.-1) then
 88 |          call Random_seed()
 89 |          call Random_Number(rand) !Random number between 0-1
 90 |          StartThread=floor(rand*logicalProcessorCount) !scaled to 0-(logicalprocessorcount-1)
 91 |       end if   
 92 |       StartThread=max(0,StartThread) !Sanity check
 93 |       StartThread=min(logicalProcessorCount-1,StartThread) !Sanity check
 94 |       
 95 |       !We set NCPUs
 96 |       NCPUs=iNCPUs
 97 |       if (NCPUs.eq.-99) then
 98 |         NCPUs=processorCoreCount/2 !We use approximately half the cores if nothing else is specified.   
 99 |       end if      
100 |       if (NCpus.le.0) then
101 |         NCPUs = max(processorCoreCount+NCpus,1)
102 |       end if
103 |       NCPUs=min(logicalProcessorCount,NCPUs) !Sanity check
104 |       NCPUs=max(1,NCPUs) !Sanity check
105 |       
106 |       !We set NCPUsLow
107 |       NCPUsLow=iNCPUsLow
108 |       if (NCPUsLow.eq.-99) then
109 |         NCPUsLow=NCPUs*2/3 !We use approximately 2/3 of the threads specified for a bandwith intensive region, if nothing else is specified.   
110 |       end if      
111 |       if (NCpusLow.le.0) then
112 |         NCPUsLow = max(processorCoreCount+NCpusLow,1)
113 |       end if
114 |       NCPUsLow=min(NCPUs,NCPUsLow) !Sanity check
115 |       NCPUsLow=max(1,NCPUsLow) !Sanity check
116 |             
117 |       !We set AffinityMode
118 |       AffinityMode=iAffinityMode
119 |       if (AffinityMode.eq.-99) then
120 |         AffinityMode=0
121 |       end if
122 |       if(NCpus.ge.logicalProcessorCount.AND.AffinityMode.EQ.0) then !We use all cores, no reason to run shared mode in that case.
123 |         AffinityMode=1 
124 |       end if
125 |       if (AffinityMode.lt.0.OR.AffinityMode.gt.2) then !Sanity check
126 |         print*, 'Warning! Affinity mode not set correctly. Setting shared AffinityMode'        
127 |         AffinityMode=0
128 |       end if
129 |       
130 |       !We set UseNested
131 |       UseNested=iUseNested
132 |       if ((AffinityMode.eq.0).OR.(NCPUs.lt.2).OR.(UseNested.ne.1)) then
133 |         UseNested=0
134 |       end if
135 |       
136 |       !We set AffinityPattern
137 |       AffinityPattern=iAffinityPattern
138 |       if((AffinityPattern.ne.0).OR.(AffinityPattern.ne.1)) then
139 |         if(UseNested) then
140 |           AffinityPattern=0 !Scatter
141 |         else
142 |           AffinityPattern=1 !Compact
143 |         end if
144 |       end if
145 |       
146 |       if(UseNested.eq.1) then
147 |         !We set NCPUOuter
148 |         if (NCPUOuter.eq.-99) then
149 |           NCPUOuter=numaNodeCount 
150 |         end if
151 |         NCPUOuter=min(max(NCPUOuter,2),numaNodeCount) !Sanity check
152 |         NCPUOuter=min(NCPUOuter,NCPUs) !Sanity check
153 |         
154 |         !We set NCPUInner
155 |         NCPUInner=NCPUs/NCPUOuter 
156 |  
157 |         !We set NCPUInnerLow
158 |         NCPUInnerLow=NCPUsLow/NCPUOuter 
159 |         
160 |         if(NCPUs.ne.NCPUOuter*NCPUInner) then
161 |           !The original CPU numbers do not match a multiplum of the detected NumaNodes
162 |           NCPUs=NCPUOuter*NCPUInner
163 |           if(iNCPUs.ne.-99) then
164 |             print*,'NCPUs was originally chosen as',iNCPUs
165 |             print*,'In order to preserve even load across the NUMAnodes NCPUs is changed to:', NCPUs
166 |           end if
167 |         end if
168 |         if(NCPUsLow.ne.NCPUOuter*NCPUInnerLow) then
169 |           !The original Low CPU numbers do not match a multiplum of the detected NumaNodes
170 |           NCPUsLow=NCPUOuter*NCPUInnerLow
171 |           if(iNCPUsLow.ne.-99) then
172 |             print*,'NCPUsLow was originally chosen as',iNCPUsLow
173 |             print*,'In order to preserve even load across the NUMAnodes NCPUs is changed to:', NCPUsLow
174 |           end if
175 |         end if
176 |       else
177 |         NCPUOuter = NCPUs
178 |         NCPUInner = 1
179 |       end if
180 |       
181 |       !All parameters determined and set, now we set all the actual environment variables
182 |       if(NCPUs.ge.1.AND.AffinityMode.ne.2) then       
183 |         call GetKmpAffinityStr(KmpAff,NCPUs,numaNodeCount,processorCoreCount, logicalProcessorCount,AffinityMode,AffinityPattern,StartThread)
184 |         Success = SETENVQQ(KmpAff) 
185 |       else
186 |         Process = GetCurrentProcess()
187 |         Core = StartThread 
188 |         AffinityMask = ishft(1_DWORD_PTR, Core)
189 |         Retval = SetProcessAffinityMask(Process,AffinityMask)
190 |         if(Retval == 0) then
191 |           iError = GetLastError()
192 |           write(*,*) 'Failed to set the affinity properly',iError
193 |           !stop
194 |         end if  
195 |       end if
196 |       if(Debug) then
197 |         print*,'omp_bind_string:',KmpAff
198 |         print*,'NCPU',NCPUs
199 |         print*,'NCPUsLow',NCPUsLow
200 |         print*,'NCPUouter',NCPUouter
201 |         print*,'NCPUInner',NCPUinner
202 |         print*,'NCPUInnerLow',NCPUinnerLow
203 |         print*,'UseNested',UseNested
204 |         print*,'AffinityMode',AffinityMode
205 |         print*,'AffinityPattern',AffinityPattern
206 |         print*,'StartThread',StartThread
207 |       end if
208 |       call KMP_SET_STACKSIZE_S(8000000) !In our code this is essential otherwise we get very subtle stack memory corruption in some of our routines
209 |       if (UseNested) then
210 |         call omp_set_nested(1) 
211 |         call omp_set_max_active_levels(2)        
212 |       else 
213 |         call omp_set_nested(0)
214 |       end if
215 |       call omp_set_num_threads(NCPUs)     
216 |       !OpenMP settings are applied when the first parrallel loop is found. We do a dummy loop here to get it done now..
217 |       !$OMP PARALLEL DEFAULT(PRIVATE) SHARED(NCpus) REDUCTION(+:J)
218 |         J=0
219 |       !$OMP DO
220 |         do i=1,NCpus
221 |           J=J+1      
222 |         end do   
223 |       !$OMP END DO
224 |       !$OMP END PARALLEL
225 |       if(debug) then
226 |         print*,'InitOpenMP complete'
227 |       end if            
228 |       return
229 |   end subroutine InitOpenMP
230 | 
231 |   subroutine GetMachineArch(numaNodeCount,processorPackageCount,processorCoreCount, logicalProcessorCount,processorCacheCount )
232 |     !This routine gets all the current hardware system information.
233 |     !IO
234 |     !numaNodeCount         -> Number of NUMA nodes
235 |     !processorPackageCount -> Number of physical processor packages
236 |     !processorCoreCount    -> Number of processor cores
237 |     !logicalProcessorCount -> Number of logical processors
238 |     !processorCacheCount   -> Number of processor L1/L2/L3 caches
239 |     use, intrinsic :: ISO_C_BINDING
240 |     use kernel32
241 |     implicit none   
242 |     integer,intent(out)              :: numaNodeCount,processorPackageCount,processorCoreCount,logicalProcessorCount
243 |     integer,dimension(3),intent(out) :: processorCacheCount     
244 |     !Local vars
245 |     procedure(GetLogicalProcessorInformation), pointer :: glpi
246 |     type(T_SYSTEM_LOGICAL_PROCESSOR_INFORMATION), allocatable, dimension(:) :: buffer
247 |     integer(DWORD) :: returnLength = 0
248 |     integer(DWORD) :: ret
249 |     integer :: nlpi, lpi_element_length, i      
250 |       processorCacheCount=0
251 |       numaNodeCount = 0
252 |       processorCoreCount = 0
253 |       logicalProcessorCount = 0
254 |       processorPackageCount = 0
255 |       call c_f_procpointer( & !Get kernel information
256 |         transfer( &
257 |         GetProcAddress( &
258 |         GetModuleHandle("kernel32"//C_NULL_CHAR), &
259 |         "GetLogicalProcessorInformation"//C_NULL_CHAR &
260 |         ), &
261 |         C_NULL_FUNPTR &
262 |         ), &
263 |         glpi)
264 |     
265 |       if (.not. associated(glpi)) then
266 |         print*,  "GetLogicalProcessorInformation not supported"
267 |         error stop
268 |       end if
269 |       !We don't know in advance the size of the buffer we need. We'll pick a number, allocate it,
270 |       !and see if that's sufficient.  If not, we'll use the returned size information and reallocate
271 |       !the buffer to the required size.
272 |       allocate (buffer(20))
273 |       lpi_element_length = C_SIZEOF(buffer(1))
274 |       returnLength = C_SIZEOF(buffer)
275 |       ret = glpi(buffer, returnLength)
276 |       if (ret == FALSE) then ! Failed
277 |         if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) then
278 |           deallocate (buffer)
279 |           allocate (buffer(returnLength/lpi_element_length))
280 |           ret = glpi(buffer, returnLength)
281 |           if (ret == FALSE) then
282 |             print*,  "GetLogicalProcessorInformation call failed with error code ", GetLastError()
283 |             error stop
284 |           end if
285 |         else
286 |           print*, "GetLogicalProcessorInformation call failed with error code ", GetLastError()
287 |           error stop
288 |         end if
289 |       end if
290 |       !Now we can iterate through the elements of buffer and see what we can see
291 |       do i=1, returnLength / lpi_element_length ! Number of elements in buffer
292 |         select case (buffer(i)%Relationship)
293 |         case(RelationNumaNode)
294 |           numaNodeCount = numaNodeCount + 1
295 |         case(RelationProcessorCore)
296 |           processorCoreCount = processorCoreCount + 1
297 |           !A Hyperthreaded core supplies more than one logical processor
298 |           logicalProcessorCount = logicalProcessorCount + popcnt(buffer(i)%processorMask)
299 |         case(RelationCache)
300 |           !One cache descriptor for each cache
301 |           if (buffer(i)%Cache%Level > 0 .and. buffer(i)%Cache%Level <= 3) then
302 |             processorCacheCount(buffer(i)%Cache%Level) = processorCacheCount(buffer(i)%Cache%Level) + 1
303 |           else
304 |             print*, "Invalid processor cache level ", buffer(i)%Cache%Level
305 |           end if
306 |         case(RelationProcessorPackage)
307 |           !Logical processors share a physical package (socket)
308 |           processorPackageCount = processorPackageCount + 1
309 |         case default
310 |           print*, "Unrecognized relationship code ", buffer(i)%Relationship
311 |         end select
312 |       end do
313 |       if(allocated(buffer)) deallocate(buffer)
314 |   end subroutine GetMachineArch
315 |     
316 |     
317 |   subroutine GetKmpAffinityStr(Str,NCPUs,numaNodeCount,processorCoreCount, logicalProcessorCount,AffinityMode,IsClose,StartThread)   
318 |     !This routine builds a custom AFFINITY string, that binds each thread.
319 |     !The actual binding depends on whether AffinityMode is 0 or 1. 
320 |     !  AffinityMode=0 
321 |     !                The binding will be loose and each thread will only be bound to a processor package.
322 |     !                Furthermore all threads will be distributed among the NUMA nodes round-robin style (ie. like dealing cards). (IsClose is not respected in this case) 
323 |     !  AffinityMode=1
324 |     !                Each thread will be bound to a logical processor. 
325 |     !IO
326 |     !Str           -> On output the string will contain a KMP_AFFINITY enviroment variable string.
327 |     !NCPUs         -> Number of threads we wish to use
328 |     !numaNodeCount -> Number of numa nodes in the system
329 |     !LogicalProcessorCount -> Number of logical processors in the system (ie. including hyperthreads).
330 |     !processorCoreCount    -> Number of physical processors in the system (ie. excluding hyperthreads).
331 |     !AffinityMode          -> Affinitymode to generate string for. 0->shared server, 1->dedicated server
332 |     !IsClose               -> If 1, then threads will be distributed on the same NUMA node first until full, before moving on to the next, if not then threads will be spread across all NUMA nodes
333 |     !StartThread           -> which threadID to start the affinity binding with
334 |     implicit none
335 |     character*(*),intent(inout) :: Str      
336 |     integer,intent(in)          :: numaNodeCount,logicalProcessorCount,AffinityMode,processorCoreCount,NCPUs,StartThread,IsClose
337 |     !Local vars
338 |     integer                     :: FirstIdx,LastIdx,i,j,ij,ProcPrNode,thread,bound,inc,CoresPrNuma,NumaID,NInc,reminder,k,numa
339 |     character(len=4)            :: TmpStr
340 |     integer,allocatable         :: usedthreads(:)
341 |     logical                     :: ThreadAccepted, Matchfound
342 |     character*256               :: String
343 |       if (affinityMode.eq.0) then !Shared server
344 |         str="KMP_AFFINITY=verbose,granularity=fine,proclist=["   !48 characters in string
345 |         FirstIdx=49
346 |         LastIdx=49
347 |         ProcPrNode=LogicalProcessorCount/numaNodeCount
348 |         numa=StartThread/ProcPrNode+1
349 |         do i=1,numaNodeCount
350 |           if (i.ne.1) then
351 |             Str(FirstIdx:FirstIdx)=','          
352 |             FirstIdx=FirstIdx+1
353 |           end if      
354 |           Str(FirstIdx:FirstIdx)='{'
355 |            FirstIdx=FirstIdx+1
356 |           do j=1,ProcPrNode
357 |             ij=(numa-1)*ProcPrNode+j-1
358 |             if (j.ne.1) then
359 |               Str(FirstIdx:FirstIdx)=','          
360 |               FirstIdx=FirstIdx+1
361 |             end if    
362 |             LastIdx=FirstIdx
363 |             if (ij.gt.9) then
364 |               LastIdx=FirstIdx+1
365 |             end if
366 |             if (ij.gt.99) then
367 |               LastIdx=FirstIdx+2
368 |             end if
369 |             write (TmpStr, '(I3)') ij
370 |             str(FirstIdx:LastIdx)=TmpStr(3-(LastIdx-FirstIdx):3)
371 |             FirstIdx=LastIdx+1
372 |           end do
373 |           Str(FirstIdx:FirstIdx)='}'
374 |           FirstIdx=FirstIdx+1
375 |           numa=numa+1
376 |           if (numa.gt.numaNodeCount) then
377 |             numa=1
378 |           end if
379 |         end do    
380 |         Str(FirstIdx:FirstIdx+10)="],explicit" 
381 |         print *,Str
382 |       else if (affinityMode.eq.1) then !Dedicated server
383 |         inc=LogicalProcessorCount/processorCoreCount    !Takes care of hyperthreading
384 |         CoresPrNuma=LogicalProcessorCount/NumaNodeCount 
385 |         str="OMP_PLACES="   !12 characters in string
386 |         FirstIdx=12
387 |         LastIdx=12
388 |         bound=min(NCPUs,LogicalProcessorCount)
389 |         thread=StartThread
390 |         allocate(usedthreads(bound))
391 |         usedthreads=-1
392 |         do i=1,bound
393 |           if(i.eq.1) then
394 |             str(FirstIdx:LastIdx)="{"  
395 |           else
396 |             LastIdx=FirstIdx+1
397 |             str(FirstIdx:LastIdx)=",{"
398 |           end if
399 |           FirstIdx=LastIdx+1
400 |           LastIdx=FirstIdx
401 |           do k=1,bound          !We keep looping until we find a thread that is acceptable to all our criterias, or until we run out of threads 
402 |             threadaccepted=.true. 
403 |             matchfound=.false.    !First we check whether we already have bound this particular thread
404 |             do j=1,bound
405 |               if(usedthreads(j).eq.thread) then
406 |                 matchfound=.true.
407 |                 threadaccepted=.false.  !we take another round
408 |                 exit
409 |               end if
410 |             end do
411 |             if (matchfound) then  !If we have bound this before we jump to the next thread
412 |               thread=thread+inc    
413 |             end if
414 |             if(thread.ge.LogicalProcessorCount) then   !If we are trying to bound to a thread higher than the number of threads available, we loop around
415 |               thread=mod(thread,LogicalProcessorCount)
416 |               threadaccepted=.false. !we take another round
417 |             end if
418 |             if(threadaccepted) then
419 |               exit !Finally we found a thread that fits our criteria, so we can exit
420 |             end if
421 |           end do
422 |           if (.not.threadaccepted) then
423 |             !exit   !If you really insist on binding more threads than cpu's just enable this Exit.
424 |             goto 11 !Safety check, we could not find a thread to bind to. This can for instance happen if you try to bind more threads than you have on your system, (hyperthreads do not count)
425 |           else
426 |             usedthreads(i)=thread 
427 |           end if
428 |           !Finally we bind the thread
429 |           if(thread.gt.9) then
430 |             LastIdx=FirstIdx+1
431 |           end if
432 |           if (thread.gt.99) then
433 |             LastIdx=FirstIdx+2
434 |           end if
435 |           write (TmpStr, '(I3)') thread
436 |           str(FirstIdx:LastIdx)=TmpStr(3-(LastIdx-FirstIdx):3)
437 |           FirstIdx=LastIdx+1
438 |           LastIdx=FirstIdx
439 |           str(FirstIdx:LastIdx)="}"
440 |           FirstIdx=LastIdx+1
441 |           !Next up we increase the thread number, since this is distributed, we increase it by jumping up one numa node.
442 |           if(IsClose) then
443 |             thread=thread+inc         !we increase to next non-hyperthread
444 |           else
445 |             thread=thread+CoresPrNuma !we increase to next numa node                 
446 |           end if
447 |         end do  
448 |       end if
449 |       return
450 |   11  print*,'No suitable thread to bind the process to was found, make sure your hardware settings are correct. If they are report this issue.'
451 |   end subroutine GetKmpAffinityStr
452 |     
453 |   function StartOpenMP(iType,iSaveOldThreadNumber) result(UseOpenMP)
454 |     use mMisc
455 |     !This function will return a logical parameter which tells whether further parallelization should occur.
456 |     !Furthermore it will set the number of threads depending on: iType, the level of parallelization set in InitOpenMP and the level of parallelization already enabled.
457 |     !
458 |     !IO
459 |     !iType -> !Determine what kind of parallel region we wish to make
460 |               !iType = 1 : (Default) Adaptive parallelization. 
461 |                   !If no previous parallelization is detected it set the number of threads to NCPUs and locks against further parallelization. 
462 |                   !If 1 layer of previous parallelization is detected it sets the number of threads to NCPUInner.
463 |                   !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false.
464 |               !iType = 2 : Adaptive parallelization for memorybandwidth intensive parallelization. Locks against further parallelization. 
465 |                   !If no previous parallelization is detected it set the number of threads to NCPUsLow and locks against further parallelization.  
466 |                   !If 1 layer of previous parallelization is detected it sets the number of threads to NCPUInnerLow.
467 |                   !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false.
468 |               !iType = 3 : NUMA spreading. Only Locks against further parallelization if previous parallelization is detected.
469 |                   !If no previous parallelization is detected it set the number of threads to NCPUOuter. 
470 |                   !If 1 layer of previous parallelization is detected it set the number of threads to NCPUInner. 
471 |                   !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false.
472 |               !iType = 4 : NUMA spreading for memorybandwidth intensive parallelization. Only Locks against further parallelization if previous parallelization is detected.
473 |                   !If no previous parallelization is detected it spawns NCPUOuter threads  in the parallelization spread across the different NUMA-nodes. 
474 |                   !If 1 layer of previous parallelization is detected it set the number of threads to NCPUInnerLow. 
475 |                   !If 2 or more layers of parallelization is detected it sets the number of threads to 1 and returns UseOpenMP=false.
476 |               !iType = 5 : XeonPhi Not implemented
477 |               !Note only Use iType=3/4 if you are sure further parallelization will occur before the CPU intensive tasks 
478 |     !iSaveOldThreadNumber -> if set to 1 it saves the old threadnumber such that it can be restored after the parallel region is done, if desired. In order to recall it call, the subroutine ReturnOldThreadNumber()
479 |     integer,intent(in)             :: iType 
480 |     integer,intent(in),optional    :: iSaveOldThreadNumber 
481 |     !Local vars
482 |     logical  :: UseOpenMP 
483 |     integer  :: i,saveOldThreadNumber,CPUs   
484 |       !Do we wish to save the old threadnumber before changing it?
485 |       SaveOldThreadNumber=LoadOptionalParam(0,iSaveOldThreadNumber)
486 |       if(SaveOldThreadNumber.eq.1) then
487 |           OldThreadNumber = omp_get_max_threads()
488 |       end if
489 |       i=omp_get_active_level() !Get previous level of parallelization
490 |       if (i.eq.0) then !No previous parallelization detected
491 |         RunningSingleParallelRegion = .false. !We unlock any potential previous parallelization locks
492 |         UseOpenMP=.true.
493 |         SELECT CASE (iType) 
494 |         CASE(1)
495 |           RunningSingleParallelRegion = .true.
496 |           CPUs=NCPUs
497 |           call omp_set_num_threads(NCPUs)
498 |         CASE(2)
499 |           RunningSingleParallelRegion = .true.
500 |           call omp_set_num_threads(NCPUsLow)
501 |         CASE(3,4)
502 |           call omp_set_num_threads(NCPUOuter)
503 |           RunningSingleParallelRegion=.false.
504 |         END SELECT
505 |       elseif(i.eq.1.AND.UseNested.AND.(.not.RunningSingleParallelRegion)) then !Outer parallelization detected, but room for inner parallelization
506 |         UseOpenMP=.true.
507 |         SELECT CASE (iType) 
508 |         CASE(1,3)
509 |           call omp_set_num_threads(NCPUInner)
510 |         CASE(2,4)
511 |           call omp_set_num_threads(NCPUInnerLow)     
512 |         END SELECT
513 |       else !Full parallelization already detected
514 |         UseOpenMP=.false.
515 |         call omp_set_num_threads(1)
516 |       end if
517 |   end function StartOpenMP
518 | 
519 |   subroutine ReturnOldThreadNum()
520 |     !This returns the thread number to the value saved in the module variable OldThreadNumber
521 |       call omp_set_num_threads(OldThreadNumber)
522 |   end subroutine ReturnOldThreadNum
523 | 
524 | end module mParallel
525 |     
526 |     
527 | 


--------------------------------------------------------------------------------
/Quicksort.f90:
--------------------------------------------------------------------------------
  1 | ! Recursive Fortran 95 quicksort routine
  2 | ! sorts real numbers into ascending numerical order
  3 | ! Author: Juli Rew, SCD Consulting (juliana@ucar.edu), 9/03
  4 | ! Based on algorithm from Cormen et al., Introduction to Algorithms,
  5 | ! 1997 printing
  6 | 
  7 | ! Made F conformant by Walt Brainerd
  8 | !
  9 | !Updated 2016 Tue
 10 | !The RealQsort now uses real*8 instead of real.
 11 | !
 12 | ! Dec 2016 KRA, added integer version
 13 | !
 14 | 
 15 | module mQuickSort
 16 | 
 17 | implicit none
 18 | public :: RealQsortC
 19 | private :: RealPartition
 20 | public :: IntQsortC
 21 | private :: IntPartition
 22 | 
 23 | interface  IntPartition
 24 |   module procedure IntPartition_real,IntPartition_complex,IntPartition_int
 25 | end interface
 26 | interface IntQsortC
 27 |   module procedure IntQsortC_complex, IntQsortC_real, IntQsortC_int
 28 | end interface
 29 | 
 30 | contains
 31 | recursive subroutine RealQsortC(A)
 32 |   real*8, intent(in out), dimension(:) :: A
 33 |   integer :: iq
 34 | 
 35 |   if(size(A) > 1) then
 36 |      call RealPartition(A, iq)
 37 |      call RealQsortC(A(:iq-1))
 38 |      call RealQsortC(A(iq:))
 39 |   endif
 40 | end subroutine RealQsortC
 41 | 
 42 | subroutine RealPartition(A, marker)
 43 |   real*8, intent(in out), dimension(:) :: A
 44 |   integer, intent(out) :: marker
 45 |   integer :: i, j
 46 |   real*8 :: temp
 47 |   real*8 :: x      ! pivot point
 48 |   x = A(1)
 49 |   i= 0
 50 |   j= size(A) + 1
 51 | 
 52 |   do
 53 |      j = j-1
 54 |      do
 55 |         if (A(j) <= x) exit
 56 |         j = j-1
 57 |      end do
 58 |      i = i+1
 59 |      do
 60 |         if (A(i) >= x) exit
 61 |         i = i+1
 62 |      end do
 63 |      if (i < j) then
 64 |         ! exchange A(i) and A(j)
 65 |         temp = A(i)
 66 |         A(i) = A(j)
 67 |         A(j) = temp
 68 |      elseif (i == j) then
 69 |         marker = i+1
 70 |         return
 71 |      else
 72 |         marker = i
 73 |         return
 74 |      endif
 75 |   end do
 76 | 
 77 | end subroutine RealPartition
 78 | 
 79 | recursive subroutine IntQsortC_int(SortInts,Secondarys)
 80 |   integer, intent(inout), dimension(:) :: SortInts
 81 |   integer, intent(inout), dimension(:) :: Secondarys
 82 |   integer                              :: iq
 83 | 
 84 |   if(size(SortInts) > 1) then
 85 |      call IntPartition(SortInts,Secondarys, iq)
 86 |      call IntQsortC(SortInts(:iq-1),Secondarys(:iq-1))
 87 |      call IntQsortC(SortInts(iq:),Secondarys(iq:))
 88 |   endif
 89 | end subroutine IntQsortC_int
 90 | recursive subroutine IntQsortC_real(SortInts,Secondarys)
 91 |   integer, intent(inout), dimension(:) :: SortInts
 92 |   real*8, intent(inout), dimension(:) :: Secondarys
 93 |   integer                              :: iq
 94 | 
 95 |   if(size(SortInts) > 1) then
 96 |      call IntPartition(SortInts,Secondarys, iq)
 97 |      call IntQsortC(SortInts(:iq-1),Secondarys(:iq-1))
 98 |      call IntQsortC(SortInts(iq:),Secondarys(iq:))
 99 |   endif
100 | end subroutine IntQsortC_real
101 | 
102 | 
103 | recursive subroutine IntQsortC_complex(SortInts,Secondarys)
104 |   integer, intent(inout), dimension(:) :: SortInts
105 |   complex*16, intent(inout), dimension(:) :: Secondarys
106 |   integer                              :: iq
107 | 
108 |   if(size(SortInts) > 1) then
109 |      call IntPartition(SortInts,Secondarys, iq)
110 |      call IntQsortC(SortInts(:iq-1),Secondarys(:iq-1))
111 |      call IntQsortC(SortInts(iq:),Secondarys(iq:))
112 |   endif
113 | end subroutine IntQsortC_complex
114 | 
115 | subroutine IntPartition_int(SortInts, Secondarys,marker)
116 |   integer, intent(inout), dimension(:) :: SortInts
117 |   integer, intent(inout), dimension(:)  :: Secondarys
118 |   integer, intent(out) :: marker
119 |   integer :: i, j
120 |   integer :: temp
121 |   real*8  :: rtemp
122 |   integer :: x      ! pivot point
123 |   x = SortInts(1)
124 |   i= 0
125 |   j= size(SortInts) + 1
126 |   do
127 |      j = j-1
128 |      do
129 |         if (SortInts(j) <= x) exit
130 |         j = j-1
131 |      end do
132 |      i = i+1
133 |      do
134 |         if (SortInts(i) >= x) exit
135 |         i = i+1
136 |      end do
137 |      if (i < j) then
138 |         ! exchange A(i) and A(j)
139 |         temp = SortInts(i)
140 |         SortInts(i) = SortInts(j)
141 |         SortInts(j) = temp
142 |         rtemp=Secondarys(i)
143 |         Secondarys(i)=Secondarys(j)
144 |         Secondarys(j)=rtemp
145 |      elseif (i == j) then
146 |         marker = i+1
147 |         return
148 |      else
149 |         marker = i
150 |         return
151 |      endif
152 |   end do
153 | 
154 | end subroutine IntPartition_int
155 | subroutine IntPartition_real(SortInts, Secondarys,marker)
156 |   integer, intent(inout), dimension(:) :: SortInts
157 |   real*8, intent(inout), dimension(:)  :: Secondarys
158 |   integer, intent(out) :: marker
159 |   integer :: i, j
160 |   integer :: temp
161 |   real*8  :: rtemp
162 |   integer :: x      ! pivot point
163 |   x = SortInts(1)
164 |   i= 0
165 |   j= size(SortInts) + 1
166 |   do
167 |      j = j-1
168 |      do
169 |         if (SortInts(j) <= x) exit
170 |         j = j-1
171 |      end do
172 |      i = i+1
173 |      do
174 |         if (SortInts(i) >= x) exit
175 |         i = i+1
176 |      end do
177 |      if (i < j) then
178 |         ! exchange A(i) and A(j)
179 |         temp = SortInts(i)
180 |         SortInts(i) = SortInts(j)
181 |         SortInts(j) = temp
182 |         rtemp=Secondarys(i)
183 |         Secondarys(i)=Secondarys(j)
184 |         Secondarys(j)=rtemp
185 |      elseif (i == j) then
186 |         marker = i+1
187 |         return
188 |      else
189 |         marker = i
190 |         return
191 |      endif
192 |   end do
193 | 
194 | end subroutine IntPartition_real
195 | 
196 | 
197 | subroutine IntPartition_complex(SortInts, Secondarys,marker)
198 |   integer, intent(inout), dimension(:) :: SortInts
199 |   complex*16, intent(inout), dimension(:)  :: Secondarys
200 |   integer, intent(out) :: marker
201 |   integer :: i, j
202 |   integer :: temp
203 |   complex*16  :: rtemp
204 |   integer :: x      ! pivot point
205 |   x = SortInts(1)
206 |   i= 0
207 |   j= size(SortInts) + 1
208 |   do
209 |      j = j-1
210 |      do
211 |         if (SortInts(j) <= x) exit
212 |         j = j-1
213 |      end do
214 |      i = i+1
215 |      do
216 |         if (SortInts(i) >= x) exit
217 |         i = i+1
218 |      end do
219 |      if (i < j) then
220 |         ! exchange A(i) and A(j)
221 |         temp = SortInts(i)
222 |         SortInts(i) = SortInts(j)
223 |         SortInts(j) = temp
224 |         rtemp=Secondarys(i)
225 |         Secondarys(i)=Secondarys(j)
226 |         Secondarys(j)=rtemp
227 |      elseif (i == j) then
228 |         marker = i+1
229 |         return
230 |      else
231 |         marker = i
232 |         return
233 |      endif
234 |   end do
235 | 
236 | end subroutine IntPartition_complex
237 | 
238 | end module mQuickSort
239 | 
240 | 
241 | 
242 | 


--------------------------------------------------------------------------------
/RCM.f90:
--------------------------------------------------------------------------------
  1 | module mRCM
  2 | ! By Kristoffer Andersen, Dec 2016
  3 | ! Updated by Tue Boesen, August 2017
  4 | ! These routines can be used to reorder a matrix.     
  5 | ! The only dependecies are to mSmat, mError, mArrays and mQuicksort
  6 | !
  7 | !   The main routines are>
  8 | !       RCMSort - performs a reverse Cuthill-McKee ordering of a matrix
  9 | !       SymPerm - performs a symmetric permutation of a matrix: PAP^t
 10 | !       HalfPermOnlyTranspose - by using permutation operations this 
 11 | !               routine finds the transpose of a matrix    
 12 |     
 13 |     use mSMat
 14 |     implicit none
 15 |     
 16 |     contains
 17 |     subroutine RCMSort(ioA,oPerm)
 18 |     ! Kristoffer Andersen, Dec 2016
 19 |     ! this routine finds the reverse Cuthill-McKee ordering of matrix ioA
 20 |     ! and makes the symmetric permutation A = PAP^t
 21 |     !
 22 |     ! IO
 23 |     ! IO - ioA - on input the unpermuted matrix
 24 |     !            on output the permuted matrix
 25 |     ! O - oPerm - the permuatation vector - this can ie be used for the vectors used together with the matrix
 26 |     !
 27 |     implicit none
 28 |     type(Tsparsemat),intent(inout)  :: ioA
 29 |     integer,intent(out)             :: oPerm(:)
 30 |     ! prog
 31 |     call RCMOrdering(ioA,oPerm)
 32 |     call symperm(ioA,oPerm)
 33 |     end subroutine
 34 |     
 35 |     
 36 |     subroutine RCMOrdering(iA,oRCMind,opStartIndex)
 37 |     ! Kristoffer Andersen, Dec 2016
 38 |     ! this routine finds the reverse Cuthill-McKee ordering for a 
 39 |     ! symmetric sparse matrix iA.
 40 |     ! IO
 41 |     ! IO - iA - the unpermuted matrix
 42 |     ! O - RCMind - the permutation vector - this can ie be used for the vectors used together with the matrix
 43 |     ! I (optional) - opStartIndex - optional start index (default = 1)
 44 |     !
 45 |     use mArrays
 46 |     use mQuickSort
 47 |     implicit none
 48 |     type(Tsparsemat),intent(inout)  :: iA
 49 |     integer,intent(out)             :: oRCMind(:)
 50 |     integer,intent(in),optional     :: opStartIndex
 51 |     !var
 52 |     logical, allocatable        :: inRCMind(:)
 53 |     integer :: i,j,k,ni,ni1,nneighbours,jn,ad,itot,inArray,MinEle,n,MaxEle
 54 |     integer,allocatable :: Neighbours(:),Adj(:),R(:),ele(:)
 55 |     !prog
 56 |     
 57 |     allocate(inRCMind(iA%NoRows))
 58 |     allocate(Ele(iA%NoRows))
 59 |     allocate(R(iA%NoRows))
 60 |  
 61 |     inRCMInd(:) = .false. ! non sorted
 62 |     
 63 |     MaxEle=-1
 64 |     Ele=-1
 65 |     do j=1,iA%NoRows
 66 |        n=iA%RowIdxs(j+1)-iA%RowIdxs(j)
 67 |        MaxEle=max(MaxEle,n)
 68 |        Ele(j)=n
 69 |     end do
 70 |     
 71 |     allocate(Neighbours(MaxEle))
 72 |     allocate(Adj(MaxEle))    
 73 |         
 74 |     
 75 |     R=-1
 76 |     itot = 1
 77 |     inArray = 0
 78 |     do 
 79 |         if(itot.gt.iA%NoRows) exit ! termination
 80 |         if(inArray.ge.iA%NoRows ) exit
 81 |         
 82 |         if(R(itot).eq.-1) then
 83 |           MinEle=999999
 84 |           inArray=inArray+1
 85 |           do j=1,iA%NoRows
 86 |             if (inRCMInd(j)) then
 87 |               cycle    
 88 |             end if            
 89 |             n=iA%RowIdxs(j+1)-iA%RowIdxs(j)  
 90 |             if (n.lt.MinEle) then
 91 |               MinEle=n
 92 |               R(itot)=j
 93 |             end if
 94 |           end do
 95 |         end if
 96 |         i = R(itot)  !present index
 97 |         inRCMInd(i) = .true. !Mark it 
 98 |         
 99 |          
100 |         
101 |         ! get neighbours
102 |         ni = iA%Rowidxs(I)
103 |         ni1 = iA%Rowidxs(I+1)
104 |         Nneighbours = ni1-ni
105 |         Neighbours(1:Nneighbours) = ia%Colidxs(ni:(ni1-1))
106 |         ! remove already present neighbours - this will also remove the self reference
107 |         jn = 1
108 |         do j = 1,Nneighbours
109 |             if(.not.inRCMInd(Neighbours(J))) then 
110 |                 Neighbours(Jn) = Neighbours(J)
111 |                 jn = jn+1
112 |             endif
113 |         enddo
114 |         Nneighbours = jn-1
115 |         
116 |         Adj(1:Nneighbours)=Ele(Neighbours(1:Nneighbours))        
117 |         ! sorting
118 |         call IntQSortC(adj(1:Nneighbours),Neighbours(1:Nneighbours))  ! we need a real type routine for this. 
119 |         
120 |         !Add new nodes to the queue
121 |         do j = 1,Nneighbours
122 |             inArray = inArray+1
123 |             R(inArray) = Neighbours(j) ! add neighbours
124 |             inRCMInd(Neighbours(J)) = .true. ! mark as included
125 |         enddo
126 |         itot = itot+1        
127 |     enddo
128 |     
129 |     ! finally reverse the order
130 |     do i=1,(iA%NoRows)
131 |         oRCMind(i) = R(iA%NoRows-i+1)
132 |     enddo
133 |     
134 |     return
135 |     ! error messages
136 |     end subroutine RCMOrdering
137 |     
138 |     
139 |     subroutine halfperm(iA,oB,iPerm,opColCounts)
140 |     ! by Kristoffer Andersen, Dec 2016
141 |     ! this routine performs the operation B = (P*A)^t, where P is er 
142 |     ! permutation vector of A
143 |     ! The out is sorted independent on the status of the input - nice feature!
144 |     !
145 |     ! IO
146 |     ! I iA - unpermuted matrix
147 |     ! O oB - permuted and sorted matrix
148 |     ! I iPerm - permutation vector
149 |     ! I opColCounts (optional) - used to speed up repeated calls of the routine. 
150 |     implicit none
151 |     !io
152 |     type(Tsparsemat),intent(in) :: iA
153 |     type(Tsparsemat),intent(out) :: oB
154 |     integer,intent(in) :: iPerm(:)
155 |     integer,intent(in),optional :: opColCounts(:)
156 |     !var
157 |     integer :: pi,i,j,jp,p,pm1,pm2,jpt
158 |     !prog
159 |     
160 |     call SMatCreate(oB,ia%NoCols,iA%NoRows,'(PA)^T',.false.,iA%NoElements)
161 |     
162 |     ! create column structure in permuted matrix - the column count is unchanged by row permutations
163 |     if(present(opColCounts)) then ! when halfperm is called twice in symperm, one already knows this
164 |         oB%RowIdxs(1:iA%NoRows) = opColCounts(1:iA%NoRows)
165 |     else
166 |         oB%RowIdxs(:) = 0
167 |         do i=1,iA%NoRows
168 |             do j=iA%RowIDxs(i),iA%RowIdxs(i+1)-1
169 |                 oB%RowIdxs(iA%ColIDxs(j)) = oB%RowIdxs(iA%ColIDxs(j))+1  ! find coumn counts
170 |             enddo
171 |         enddo
172 |     endif
173 |     ! create column pointers
174 |     pm2 = 0
175 |     pm1 = 1
176 |     do i=3,iA%NoCols+3
177 |         p = pm1+ob%RowIdxs(i-2) 
178 |         ob%RowIdxs(i-2) = pm2
179 |         pm2 =pm1
180 |         pm1 = p
181 |     enddo
182 |     
183 |     ! fill matrix
184 |     do i=1,iA%NoRows
185 |         pi = iPerm(i) ! get the permuted row
186 |         do jp=iA%RowIDxs(pi),iA%RowIdxs(pi+1)-1
187 |             j = iA%ColIdxs(jp)
188 |             jpt = oB%RowIdxs(j+1)
189 |             ob%ColIdxs(jpt) = i
190 |             oB%Vals(jpt) = iA%Vals(jp)
191 |             oB%RowIdxs(j+1) = jpt+1
192 |         enddo
193 |     enddo
194 |     oB%RowIdxs(1) = 1
195 |     oB%NoElements = iA%NoElements
196 |     
197 |     return
198 |     ! error messages
199 | 
200 |     end subroutine halfperm
201 |     
202 |     subroutine halfpermOnlyTranspose(iA,oB)
203 |     ! by Kristoffer Andersen, Dec 2016
204 |     ! this routine performs the operation B = A^t
205 |     ! The out is sorted independent on the status of the input - nice feature!
206 |     !
207 |     ! IO
208 |     ! I iA - unpermuted matrix
209 |     ! O oB - permuted and sorted matrix
210 |     !
211 |     implicit none
212 |     !io
213 |     type(Tsparsemat),intent(in) :: iA
214 |     type(Tsparsemat),intent(out) :: oB
215 |     !var
216 |     integer :: pi,i,j,jp,p,pm1,pm2,jpt
217 |     !prog
218 |     call SMatCreate(oB,ia%NoCols,iA%NoRows,'(A)^T',.false.,iA%NoElements)
219 |     
220 |     ! create column structure in permuted matrix - the column count is unchanged by row permutations
221 |     oB%RowIdxs(:) = 0
222 |     do i=1,iA%NoRows
223 |         do j=iA%RowIDxs(i),iA%RowIdxs(i+1)-1
224 |             oB%RowIdxs(iA%ColIDxs(j)) = oB%RowIdxs(iA%ColIDxs(j))+1  ! find coumn counts
225 |         enddo
226 |     enddo
227 |     
228 |     ! create column pointers
229 |     pm2 = 0
230 |     pm1 = 1
231 |     do i=3,iA%NoCols+3
232 |         p = pm1+ob%RowIdxs(i-2) 
233 |         ob%RowIdxs(i-2) = pm2
234 |         pm2 =pm1
235 |         pm1 = p
236 |     enddo
237 |     
238 |     ! fill matrix
239 |     do i=1,iA%NoRows
240 |         pi = i ! get row
241 |         do jp=iA%RowIDxs(pi),iA%RowIdxs(pi+1)-1
242 |             j = iA%ColIdxs(jp)
243 |             jpt = oB%RowIdxs(j+1)
244 |             ob%ColIdxs(jpt) = i
245 |             oB%Vals(jpt) = iA%Vals(jp)
246 |             oB%RowIdxs(j+1) = jpt+1
247 |         enddo
248 |     enddo
249 |     oB%RowIdxs(1) = 1
250 |     oB%NoElements = iA%NoElements
251 |     
252 |     return
253 |     ! error messages
254 | 
255 |     end subroutine halfpermOnlyTranspose
256 |     
257 |     subroutine symperm(ioA,iPerm)
258 |     ! by Kristoffer Andersen, Dec 2016
259 |     ! performs the halfperm twice in order to get B = PAP^t = (P(PA)^t)^t
260 |     !
261 |     ! IO
262 |     ! IO ioA - in input: unpermuted matrix
263 |     !          on output: permuted and sorted matrix
264 |     ! I iPerm- permutation vector
265 |     implicit none
266 |     !io
267 |     type(Tsparsemat),intent(inout) :: ioA
268 |     integer,intent(in) :: iPerm(:)
269 |     !var
270 |     type(tsparsemat) :: temp
271 |     integer,allocatable :: ColCounts(:)
272 |     integer :: i,pi,N
273 |     !prog
274 |     call halfperm(ioA,temp,iPerm)
275 |     N = ioA%NoRows
276 |     allocate(ColCounts(N))
277 |     do i = 1,N
278 |         pi = iPerm(i)
279 |         ColCounts(i) = ioA%RowIdxs(pi+1)-ioA%RowIdxs(pi)
280 |     enddo
281 |     call halfperm(temp,ioA,iPerm,ColCounts)
282 |     return
283 |     end subroutine symperm
284 |     
285 |     subroutine PermuteVector(ioV,iPerm)
286 |     ! by Kristoffer Andersen, Dec 2016
287 |     ! this routine permutes a vector according to iPerm
288 |     !
289 |     ! IO
290 |     ! IO ioV - in input: unpermuted vector
291 |     !          on output: permuted vector
292 |     ! I iPerm- permutation vector
293 |     real*8,intent(inout) :: ioV(:)
294 |     integer,intent(in) :: iPerm(:)
295 |     ! var
296 |     real*8,allocatable :: temp(:)
297 |     integer :: i,N
298 |     !prog
299 |     N = ubound(ioV,1)
300 |     allocate(temp(N))
301 |     temp(1:N) = ioV(1:N)
302 |     do i=1,N
303 |         ioV(iPerm(i)) = temp(i)
304 |     enddo
305 |     return
306 |     end subroutine PermuteVector
307 |     
308 | end module mRCM
309 |     
310 |     
311 |     


--------------------------------------------------------------------------------
/Readme.MD:
--------------------------------------------------------------------------------
 1 | # Sparse Iterative parallel linear solver
 2 | 
 3 | ## Introduction
 4 | 
 5 | This is a sparse iterative linear solver I developed while doing my Phd at the HydroGeophysics Group, Aarhus University, Denmark.
 6 | The code is developed in Fortran/OpenMP, and is designed with focus on speed especially in parallel.
 7 | The code uses a block-splitting of the linear system without any overlap. 
 8 | It is intended to be used on matrices which have already been reordered using a reverse Cuthill-Mckee reordering.
 9 | 
10 | 
11 | ## What is in this pack
12 | 
13 | | Filename    | Description |
14 | | ----------- | ----------- |
15 | | Example.f90 | Contains a small example, that initialize the parallelization, loads a matrix and solves the linear system.       |
16 | | Solver.f90  | Contains the iterative sparse solver. |
17 | | SparseMatrix.f90  | Contains various sparse matrix functions needed by the iterative solver. |
18 | | Paralllelization.f90  | Contains the parallelization framework. |
19 | | Misc.f90  | Contains a few helper routines needed by the other modules. |
20 | | Load.f90  | Contains a minimalistic structure for loading and setting the various settings for the example. |
21 | | Matrix folder  | Contains the matrix used in the example saved in compressed sparse row format.|
22 | | Technical_Report.pdf  | A technical report which introduces both the parallelization framework and the iterative solver.|
23 | | Settings.ini  | Settings for the parallelization framework and the iterative solver.|
24 | 
25 | ## How to set it up
26 | 
27 | Open either the parallelization solution file or project file, or build your own project by using Example.f90 as the main program and the rest as dependencies.
28 | 
29 | Once the project has been built, place all files in the same folder and run it like this:
30 | 
31 | -> solver.exe settings.ini 001 outputfilename        (where 001 is the name of the sparseMatrix in the matrix folder)
32 | 


--------------------------------------------------------------------------------
/Settings.ini:
--------------------------------------------------------------------------------
 1 | Example of an ini file - this is skipped during the load and can contain anything
 2 | -99      !AffinityMode
 3 | -99      !AffinityPattern
 4 | 0        !UseNested
 5 | -99      !StartThread
 6 | 0        !NCPUs
 7 | -99      !NCPUsLow
 8 | -99      !NCPUOuter
 9 | -1       !SolverType
10 | -1       !Max number of iterations to run during the propagation
11 | 1        !Blocksize, Each block will be a modulo of this size 
12 | -1       !FillingFactor, A factor that scales the number of elements saved in each row during the IC factorization
13 | 1        !Determines whether to normalize the matrix before solving it.


--------------------------------------------------------------------------------
/Solver.f90:
--------------------------------------------------------------------------------
   1 |     !MIT License
   2 |     !
   3 |     !Copyright (c) [2017] [HydroGeophysics Group, Aarhus University, Denmark]
   4 |     !
   5 |     !Permission is hereby granted, free of charge, to any person obtaining a copy
   6 |     !of this software and associated documentation files (the "Software"), to deal
   7 |     !in the Software without restriction, including without limitation the rights
   8 |     !to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   9 |     !copies of the Software, and to permit persons to whom the Software is
  10 |     !furnished to do so, subject to the following conditions:
  11 |     !
  12 |     !The above copyright notice and this permission notice shall be included in all
  13 |     !copies or substantial portions of the Software.
  14 |     !
  15 |     !THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16 |     !IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17 |     !FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18 |     !AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19 |     !LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20 |     !OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  21 |     !SOFTWARE.
  22 |     
  23 | !Tue jan, 2015
  24 | !This module contains iterative solvers for linear systems
  25 | 
  26 | module mSolver
  27 |   use mMisc
  28 |   use mSMat
  29 |   implicit none 
  30 | 
  31 |   type TSolverStatus
  32 |     !This type is a subtype in TSolverSettings, meant for output from the solve
  33 |     logical, private :: Success=.false.    !True if system converged
  34 |     integer          :: Status             !Status is the variable that gets reported to the surrounding code, on a succesfull solve, status=nint(iter) but otherwise it can contain error messages or -1 if no convergence was reached
  35 |     real*8, private  :: AbsError           !Average absolute error
  36 |     real*8, private  :: RelError           !Average relative error
  37 |     real*8, private  :: FactorizationTime  !Time spent factorizing
  38 |     real*8, private  :: SolverTime         !Time spent solving
  39 |     
  40 |     !MultiRHS variables    
  41 |     real*8, private   :: Iter              !Average number of iterations used    
  42 |     integer, private  :: Iter_min          !Minimum number of iterations
  43 |     integer, private  :: Iter_max          !Maximum number of iterations
  44 |     real*8, private   :: AbsError_min      !Minimum absolute error 
  45 |     real*8, private   :: AbsError_max      !Maximum absolute error
  46 |     real*8, private   :: RelError_min      !Minimum relative error
  47 |     real*8, private   :: RelError_max      !Maximum relative error
  48 |   end type TSolverStatus
  49 |  
  50 |   type TSolverSettings
  51 |     !This type is meant to contain all settings needed for the solver to run, in addition it will also contain the output from the solve.
  52 |     logical, private :: IsPrepared=.false. !Before solver runs it checks this to see whether solversettings have been prepared.
  53 |     logical, private :: IsComplex          !True if system is complex
  54 |     integer, private :: UseNormalization   !If True, the matrix is normalized as: D^-1*A*D^-1 (D*x) = D^-1*b, where D=sqrt(diag(A))
  55 |     integer, private :: UseRCM             !If True, the matrix is reordered using Reverse Cuthill-Mckee reordering    
  56 |     integer, private :: NoRHS              !Number of right hands sides
  57 |     integer, private :: PrecondMaxFill     !Max number of elements each row in LUT factorization can contain (excluding the diagonal)
  58 |     integer, private :: NoElements         !Actual number of elements used in LUT factorization (not sure this is set)
  59 |     integer, private :: MaxIter            !Max number of Iterations
  60 |     integer, private :: SolverType         !Which solver to use
  61 |     integer, private :: Inform             !Set how much information solver should print, 0 = silent, 1 = basic statistics, 2 = solversettings+basic statistics, 3=full information
  62 |     integer, private :: Blocksize          !The minimum blocksize 
  63 |     integer, private :: NBlocks            !The number of blocks used in the parallel preconditioner computation
  64 |     integer, private :: FillingFactor      !The filling factor used for computing PrecondMaxFill
  65 |     real*8, private  :: PrecondTolerance   !DropTolerance of ILUT filling
  66 |     real*8, private  :: StepLimit          !When to stop if convergence is not reached
  67 |     real*8, private  :: AbsTol             !Convergence criteria 
  68 |     real*8, private  :: RelTol             !Convergence criteria 
  69 |     real*8, private  :: DiagDom            !Diagonal dominance, an empirical parameter that gives a rough measure for how difficult this system will be to solve, and hence how many elements we need in LUT factorization
  70 |     type(TSolverStatus) :: Output
  71 | 
  72 |     !RCM ordering indices
  73 |     integer,dimension(:),allocatable,private    :: RCMIndices   !Indices returned from RCM reordering, in case it should be reversed at the end
  74 |     !Normalization factor
  75 |     real*8,dimension(:),allocatable,private    :: Normalization   !Normalization is used if the logical Variable UseNormalization is true
  76 |     real*8,dimension(:),allocatable,private    :: ReNormalization !Renormalization is used if the logical variable UseNormalization is true, this brings back the original Matrix afterwards.
  77 |     !Block splitting
  78 |     integer,dimension(:,:),allocatable, private :: Blocks         !The actual blocks the matrix will be split up in during a parallel run
  79 |     !LU0/LUT preconditioner
  80 |     integer,dimension(:),allocatable, private   :: PrecondCols    !Used for LUT precondition to store the ColIdx in CSR format    
  81 |     integer,dimension(:),allocatable, private   :: PrecondRows     !Used for LUT precondition to store the RowIdx in CSR format
  82 |     real*8,dimension(:),allocatable, private    :: PrecondVals     !Used in real LU0/LUT preconditioning to store the values.
  83 |     !LU0 - complex values
  84 |     complex*16,dimension(:),allocatable, private  :: PrecondcVals  !Used in complex LU0 preconditioning to store the values
  85 |     !BLUT preconditioner
  86 |     integer,dimension(:,:),allocatable, private   :: BPrecondIdxs  !Used for BLUT precondition to store the ColIdx in block CSR format  
  87 |     integer,dimension(:,:),allocatable, private   :: BPrecondDiag  !Used for BLUT precondition to store the RowIdx in block CSR format
  88 |     real*8,dimension(:,:),allocatable, private    :: BPrecondVals  !Used in BLUT preconditioning to store the values of each block.
  89 |     !settings for PARDISO solver
  90 |     
  91 |   end type TSolverSettings
  92 | 
  93 |   interface SolverAxB
  94 |     module procedure SingleRHS_wrapper   
  95 |   end interface    
  96 |   
  97 |   contains
  98 | 
  99 | 
 100 | !****************************************************************************************    
 101 |    
 102 |       
 103 |     subroutine SingleRHS_wrapper(iA,iRHS,ioX,SolverSettings)
 104 |     !Tue, Dec 2014
 105 |     !This routine is a wrapper routine for all the our solvers which handle systems with a single righthandside.
 106 |     !IO
 107 |     !iA -> input matrix, in CSR format, and of the type TSparseMat
 108 |     !iRHS                    -> a system of right hand sides, RHS(:,1) is one right hand side vector
 109 |     !ioX                     -> on input this matrix contains the solution guess for each vector, on output it contains the found solutions
 110 |     !SolverSettings          -> contains all the relevant settings the solvers need, it is also where stats about the solve is saved. 
 111 |     !                           This needs to be set before calling this routine.
 112 |     !
 113 |     use omp_lib
 114 |     use msMat
 115 |     implicit none
 116 |     type (TSparseMat),intent(inout)    :: iA
 117 |     type(TSolverSettings),intent(inout):: SolverSettings 
 118 |     real*8,intent(inout),dimension(:)  :: iRHS
 119 |     real*8,intent(inout),dimension(:)  :: ioX
 120 |     character*200                      :: S
 121 |     integer                            :: i,j
 122 |     integer                            :: NoNumaNodes,Threads,ostatus,error
 123 |     real*8                             :: RunTimeBegin,RunTimeEnd
 124 |     real*8,dimension(:),allocatable    :: InvDiag,tmp
 125 |     integer,parameter                  :: NoRowsDenseLimit=35000 !with 35000 elements the dense solver takes about 150 seconds to solve the problem
 126 |     integer                            :: OldThreads
 127 |     logical                            :: dummy
 128 |     logical(4)                        :: Success
 129 |     allocate(tmp(iA%NoRows))
 130 |     dummy = startOpenMP(2,1)
 131 |     
 132 |     RuntimeBegin =  omp_get_wtime()
 133 |     Call checkSolverSettings(SolverSettings)
 134 |  
 135 |     if(SolverSettings%UseRCM) then
 136 |       tmp(:)=iRHS(SolverSettings%RCMIndices)
 137 |       iRHS=tmp
 138 |       tmp(:)=ioX(SolverSettings%RCMIndices)
 139 |       ioX=tmp
 140 |     end if
 141 |     if(SolverSettings%UseNormalization) then
 142 |       do i=1,iA%NoRows
 143 |         iRHS(i)=iRHS(i)*SolverSettings%Normalization(i)
 144 |         ioX(i)=ioX(i)*SolverSettings%ReNormalization(i) 
 145 |       end do       
 146 |     end if
 147 |     
 148 |     SELECT CASE (SolverSettings%SolverType)
 149 |     CASE(1,2)
 150 |       call SingleRHS(iA,iRHS,ioX,SolverSettings)
 151 |     CASE(3)
 152 |      !Pardiso (Not implemented here)   
 153 |     END SELECT
 154 |     
 155 |     !If we used Normalization, we now need to renormalize our solution vector x and our input iRHS.
 156 |     if(SolverSettings%UseNormalization) then
 157 |       do i=1,iA%NoRows
 158 |         ioX(i)=ioX(i)*SolverSettings%Normalization(i)
 159 |         iRHS(i)=iRHS(i)*SolverSettings%ReNormalization(i)
 160 |       end do
 161 |     end if
 162 |     if(SolverSettings%UseRCM) then !This is not right, it needs to be the inverse here, but should be easily fixed
 163 |       do i=1,iA%NoRows  
 164 |         tmp(SolverSettings%RCMIndices(i)) = iRHS(i)
 165 |       end do      
 166 |       iRHS=tmp
 167 |       do i=1,iA%NoRows  
 168 |         tmp(SolverSettings%RCMIndices(i)) = ioX(i)
 169 |       end do      
 170 |       ioX=tmp
 171 |     end if
 172 | 
 173 |     RuntimeEnd=omp_get_wtime()
 174 |     SolverSettings%Output%SolverTime=RunTimeEnd-RunTimeBegin
 175 |     if (.not.SolverSettings%Output%success) SolverSettings%Output%Status = -SolverSettings%Output%Status
 176 |     call ReturnOldThreadNum()
 177 |     return
 178 |     end subroutine SingleRHS_wrapper
 179 |      
 180 | !****************************************************************************************    
 181 |     
 182 | 
 183 |     
 184 |     subroutine SingleRHS(iA,iRHS,ioX,SolverSettings)
 185 |     !Tue, November 2014
 186 |     !Block Preconditioned Conjugate Gradient, with uma.
 187 |     !This routine solves linear systems using the Preconditioned Conjugate gradient method, as a preconditioner the method uses either SOR or BLUT depending on SolverSettings 
 188 |     !More information on the method can be found in the book "Iterative methods for sparse linear systems".
 189 |     !IO
 190 |     !iA -> input matrix, in CSR format, and of the type TSparseMat
 191 |     !iRHS                    -> a system of right hand sides, RHS(:,1) is one right hand side vector
 192 |     !ioX                     -> on input this matrix contains the solution guess for each vector, on output it contains the found solutions
 193 |     !SolverSettings          -> contains all the relevant settings the solvers need, it is also where stats about the solve is saved. 
 194 |     !                           This needs to be set before calling this routine.
 195 |     !
 196 |     !
 197 |     use omp_lib
 198 |     implicit none
 199 | 
 200 |     type (TSparseMat),intent(inout)   :: iA
 201 |     type(TSolverSettings),intent(inout)  :: SolverSettings 
 202 | 
 203 |     real*8,intent(in),dimension(:)       :: iRHS
 204 |     real*8,intent(inout),dimension(:)    :: ioX
 205 |     
 206 |     !Parallel vars
 207 |     Integer             :: Threads
 208 |     !Local vars
 209 |     integer            :: i
 210 |     integer,save       :: j
 211 |     integer            :: MaxIter,NoRows,iNuma
 212 |     real*8             :: rzDotProd,eps
 213 |     real*8             :: alfa, beta
 214 |     real*8             :: error,norm
 215 |     real*8,allocatable :: r(:),z(:),p(:),Ap(:),OldX(:),StartX(:)
 216 |     logical            :: KeepLooping 
 217 | 
 218 |     
 219 |     !SOR vars    
 220 |     integer,dimension(:),allocatable           :: DiagIdx
 221 |     integer,dimension(:,:),allocatable         :: BRowIdx
 222 |     real*8,dimension(:),allocatable            :: InvDiag
 223 | 
 224 |         
 225 |     Eps=1e-6
 226 |     iNuma=1
 227 |     NoRows=size(iRHS)
 228 |     allocate(r(NoRows))
 229 |     allocate(z(NoRows))
 230 |     allocate(p(NoRows))
 231 |     allocate(Ap(NoRows))
 232 |     allocate(OldX(NoRows))
 233 |     allocate(StartX(NoRows))
 234 |     OldX=ioX         
 235 |     StartX=ioX
 236 |     !First the block parallisation
 237 |     Threads=OMP_GET_MAX_THREADS ()
 238 |     
 239 |     call SmatAmultV(iA,ioX(:),r(:))
 240 |     call VectorAddition(1,iNuma,NoRows,r,iRHS) !r(:)=iRHS(:)-r(:)
 241 |     
 242 |     SELECT CASE (SolverSettings%SolverType)
 243 |     CASE(1)
 244 |       allocate(DiagIdx(iA%NoRows))  
 245 |       call SMatGetDiagIdx(iA,DiagIdx) 
 246 |       allocate(InvDiag(iA%NoRows))  
 247 |       call SMatGetInvDiag(iA,InvDiag)
 248 |       allocate(BRowIdx(iA%NoRows,2))
 249 |       call SMatGetBlockRowIdx(iA,SolverSettings%Blocks,BRowIdx)        
 250 |       call Apply_BPrecondition_BSGS(iA%ColIdxs,iA%RowIdxs,iA%Vals,BRowIdx,DiagIdx,InvDiag,r,z,SolverSettings%Blocks)
 251 |     CASE(2)
 252 |       call Apply_BPrecondition_BLUT(SolverSettings%Blocks, r, z, SolverSettings%BPrecondVals, SolverSettings%BPrecondIdxs, SolverSettings%BPrecondDiag)
 253 |     END SELECT
 254 |     
 255 |     call VectorAddition(0,iNuma,NoRows,p,z) !p(:)=z(:)
 256 |     
 257 |     do j = 1 , SolverSettings%MaxIter
 258 |       call SMatAMultV(iA,p,Ap)
 259 |     
 260 |       rzDotProd=ddot(NoRows,r,1,z,1)
 261 |       alfa=rzDotProd/ddot(NoRows,Ap(:),1,p,1)
 262 |       call VectorAddition(4,iNuma,NoRows,ioX,p,alfa,r,Ap) !ioX(:) = ioX(:) + alfa*p(:) , r(:) = r(:) - alfa*Ap(:)
 263 |     
 264 |       SELECT CASE (SolverSettings%SolverType)
 265 |       CASE(1)
 266 |        call Apply_BPrecondition_BSGS(iA%ColIdxs,iA%RowIdxs,iA%Vals,BRowIdx,DiagIdx,InvDiag,r,z,SolverSettings%Blocks)
 267 |       CASE(2)
 268 |         call Apply_BPrecondition_BLUT(SolverSettings%Blocks, r, z, SolverSettings%BPrecondVals, SolverSettings%BPrecondIdxs, SolverSettings%BPrecondDiag)
 269 |       END SELECT
 270 |       beta = ddot(NoRows,r,1,z,1)/rzDotProd
 271 |       call VectorAddition(3,iNuma,NoRows,p,z,beta) !p(:) = z(:) + beta * p(:)
 272 |       
 273 |       !Did we converge yet?
 274 |       error=0d0
 275 |       norm =0d0
 276 |       Error=sqrt(ddot(NoRows,r,1,r,1))
 277 |       Norm=sqrt(ddot(NoRows,ioX,1,ioX,1))
 278 |             
 279 |       if (((error/norm).le.SolverSettings%RelTol).or.(error).le.SolverSettings%AbsTol) then
 280 |         SolverSettings%Output%Success=.TRUE.
 281 |         SolverSettings%Output%Status = min(j,SolverSettings%MaxIter)
 282 |         exit
 283 |       end if
 284 |       if (SolverSettings%StepLimit.gt.0) then
 285 |           Keeplooping=.False.
 286 |           do i=1,NoRows
 287 |             if (abs(ioX(i)-OldX(i))/Norm.gt.SolverSettings%StepLimit) then
 288 |               Keeplooping=.True.
 289 |               exit
 290 |             end if
 291 |           end do
 292 |           if (.NOT.keeplooping) then 
 293 |             exit
 294 |           else
 295 |             OldX(:) = ioX(:)
 296 |           end if
 297 |       end if
 298 |     end do
 299 |     SolverSettings%Output%Status=min(j,SolverSettings%MaxIter)
 300 |     SolverSettings%Output%AbsError=error
 301 |     SolverSettings%Output%RelError=error/norm   
 302 |     if (allocated(DiagIdx)) then
 303 |       deallocate(DiagIdx)
 304 |     end if
 305 |     if (allocated(InvDiag)) then
 306 |       deallocate(InvDiag)
 307 |     end if
 308 |     if (allocated(BRowIdx)) then
 309 |       deallocate(BRowIdx)
 310 |     end if
 311 |     if(.NOT.SolverSettings%Output%Success) ioX=StartX
 312 |     if(allocated(r))      deallocate(r)
 313 |     if(allocated(z))      deallocate(z)
 314 |     if(allocated(p))      deallocate(p)
 315 |     if(allocated(Ap))     deallocate(Ap)
 316 |     if(allocated(OldX))   deallocate(OldX)
 317 |     if(allocated(StartX)) deallocate(StartX)
 318 |     
 319 |     end subroutine SingleRHS
 320 | 
 321 |     subroutine VectorAddition(imode,iNuma,iNoRows,ioX,iY,alfa,ioZ,iV)
 322 |     !Tue, Dec 2014
 323 |     !This routine does different kinds of vector additions in parallel depending on the mode selected.
 324 |     !mode = 0 : ioX = iY
 325 |     !mode = 1 : ioX = iY - ioX
 326 |     !mode = 2 : ioX = ioX + alfa*iY
 327 |     !mode = 3 : ioX = alfa*ioX + iY  
 328 |     !mode = 4 : ioX(:) = ioX(:) + alfa*iY(:)    
 329 |     !           ioZ(:) = ioZ(:) - alfa*iV(:)
 330 |     !IO
 331 |     !imode -> selects the type of vector addition used
 332 |     !iNuma, -> Number of Numa Nodes
 333 |     !iNoRows -> Number of Rows
 334 |     !ioX,iY,ioZ,iV     -> vectors
 335 |     !alfa              -> scalar
 336 |     implicit none
 337 |     integer,intent(in)                          :: imode,iNuma,iNoRows
 338 |     real*8,intent(inout),dimension(:)           :: ioX
 339 |     real*8,intent(in),dimension(:)              :: iY
 340 |     real*8,intent(in),optional                  :: alfa
 341 |     real*8,intent(inout),dimension(:),optional  :: ioZ
 342 |     real*8,intent(in),dimension(:),optional     :: iV
 343 |     
 344 |     integer i
 345 |     if (iNuma.gt.1) then
 346 |       if (imode.eq.0) then
 347 |     !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY) PRIVATE(I)
 348 |     !$OMP DO        
 349 |           do i=1,iNoRows
 350 |             ioX(i) = iY(i)    
 351 |           end do
 352 |     !$OMP END DO 
 353 |     !$OMP END PARALLEL               
 354 |       elseif (imode.eq.1) then
 355 |     !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY) PRIVATE(I)
 356 |     !$OMP DO        
 357 |           do i=1,iNoRows
 358 |             ioX(i) = iY(i) - ioX(i)    
 359 |           end do
 360 |     !$OMP END DO 
 361 |     !$OMP END PARALLEL               
 362 |       elseif(imode.eq.2) then
 363 |     !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY,Alfa) PRIVATE(I)
 364 |     !$OMP DO        
 365 |           do i=1,iNoRows
 366 |             ioX(i) = ioX(i) + alfa*iY(i)    
 367 |           end do
 368 |     !$OMP END DO 
 369 |     !$OMP END PARALLEL               
 370 |       elseif(imode.eq.3) then
 371 |     !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY,Alfa) PRIVATE(I)
 372 |     !$OMP DO        
 373 |           do i=1,iNoRows
 374 |             ioX(i) = alfa*ioX(i) +iY(i)    
 375 |           end do
 376 |     !$OMP END DO 
 377 |     !$OMP END PARALLEL               
 378 |       elseif(imode.eq.4) then
 379 |     !$OMP PARALLEL DEFAULT(NONE) SHARED(iNoRows,ioX,iY,alfa,ioZ,iV) PRIVATE(I)
 380 |     !$OMP DO        
 381 |           do i=1,iNoRows
 382 |             ioX(i) = ioX(i) + alfa*iY(i) 
 383 |             ioZ(i) = ioZ(i) - alfa*iV(i)
 384 |           end do
 385 |     !$OMP END DO 
 386 |     !$OMP END PARALLEL  
 387 |       end if
 388 |       
 389 |     else
 390 |         if (imode.eq.0) then
 391 |           ioX(:) = iY(:)    
 392 |         elseif (imode.eq.1) then
 393 |           ioX(:) = iY(:) - ioX(:)    
 394 |         elseif(imode.eq.2) then
 395 |           ioX(:) = ioX(:) + alfa*iY(:)    
 396 |         elseif(imode.eq.3) then
 397 |           ioX(:) = alfa*ioX(:) + iY(:)    
 398 |        elseif(imode.eq.4) then
 399 |           ioX(:) = ioX(:) + alfa*iY(:)    
 400 |           ioZ(:) = ioZ(:) - alfa*iV(:)    
 401 |        end if   
 402 |     end if
 403 |     end subroutine VectorAddition
 404 |    
 405 |     subroutine Compute_Precondition_BLUT_real(iA,SolverSettings)
 406 |     !Tue November 2014
 407 |     !This routine sets up all the parameters needed for the Block LUT matrix factorization in parallel.
 408 |     !After everything has been setup, then the factorization algorithm is called.
 409 |     !IO
 410 |     !iA -> input matrix, in CSR format, and of the type TSparseMat
 411 |     !SolverSettings          -> contains all the relevant settings the solvers need, it is also where stats about the solve is saved. 
 412 |     !                           This needs to be set before calling this routine.
 413 |     use omp_lib
 414 |     implicit none
 415 |     type (TSolverSettings),intent(inout)           :: SolverSettings
 416 |     type (TSparseMat),     intent(inout)           :: iA
 417 |     
 418 |     !Local Vars
 419 |     integer                                :: i,Threads,NoRows,MaxRows,NoPrecondElements,SplitNoRows,ierr
 420 |     integer,dimension(:),allocatable           :: jw   
 421 |     integer,dimension(:,:),allocatable     :: BRowIdx
 422 |     real*8                                 :: Timer1, Timer2    
 423 |     real*8,dimension(:),allocatable        :: w
 424 |     Timer1=omp_get_wtime()     
 425 |     Threads=SolverSettings%NBlocks
 426 |     allocate(BRowIdx(iA%NoRows,2))
 427 |     call SMatGetBlockRowIdx(iA,SolverSettings%BlockS,BRowIdx)
 428 |     MaxRows = 0
 429 |     do i = 1 , threads
 430 |         MaxRows = Max(MaxRows,SolverSettings%BlockS(i,2)-SolverSettings%BlockS(i,1)+1)
 431 |     end do             
 432 |     NoPrecondElements=(2*SolverSettings%PrecondMaxFill+1)*MaxRows
 433 |     Allocate(SolverSettings%BPrecondVals(NoPrecondElements,Threads))
 434 |     Allocate(SolverSettings%BPrecondIdxs(NoPrecondElements,Threads))
 435 |     Allocate(SolverSettings%BPrecondDiag(MaxRows,Threads))
 436 | !$OMP PARALLEL IF(StartOpenMP(2)) DEFAULT(SHARED) PRIVATE(i,SplitNoRows,w,jw,ierr,NoPrecondElements)
 437 | !$OMP DO SCHEDULE(STATIC)
 438 |     do i = 1 , Threads
 439 |       SplitNoRows=SolverSettings%BlockS(i,2)-SolverSettings%BlockS(i,1)+1
 440 |       Allocate(w(SplitNoRows+1))
 441 |       Allocate(jw(2*SplitNoRows))
 442 |       NoPrecondElements=(2*SolverSettings%PrecondMaxFill)*SplitNoRows
 443 |       
 444 |       call BLUT_Factorization_real(SplitNoRows,iA%Vals,iA%ColIdxs,BRowIdx(SolverSettings%BlockS(i,1):SolverSettings%BlockS(i,2),:),&
 445 |           SolverSettings%PrecondMaxFill,SolverSettings%PrecondTolerance,SolverSettings%BPrecondVals(:,i),SolverSettings%BPrecondIdxs(:,i),&
 446 |           SolverSettings%BPrecondDiag(:,i),NoPrecondElements,w,jw,SolverSettings%BlockS(i,1)-1,ierr)
 447 |       !On Numa architechture we may now need to gather LU into one matrix and copy it around, depending whether we use PCG og BPCG routines.    
 448 |       deAllocate(w)
 449 |       deAllocate(jw)
 450 |     end do
 451 | !$OMP END DO
 452 | !$OMP END PARALLEL  
 453 |     Timer2=omp_get_wtime() 
 454 |     SolverSettings%Output%FactorizationTime=Timer2-Timer1 
 455 |     if (allocated(BRowIdx)) then
 456 |       deallocate(BRowIdx)
 457 |     end if  
 458 |     end subroutine Compute_Precondition_BLUT_real
 459 |     
 460 |    
 461 | !*************************************************************            
 462 |     
 463 |   subroutine BlockSplitter(iNoRows,iThreads,iParam,oBlocks)
 464 |     !Tue, September 2014
 465 |     !Splits iNoRows into iThreads Intervals.
 466 |     !Each interval will have a length of iParam*n, where n is a natural number.
 467 |     !The result will be saved in oBlocks, where oBlocks(i,1) is the start of the i'th interval and oBlocks(i,2) is the end of the interval
 468 |     !IO
 469 |     !Update 2015 Tue
 470 |     !Fixed the case if iNoRows < iThreads
 471 |     implicit none
 472 |     Integer,intent(in)                :: iNoRows,iThreads
 473 |     Integer,intent(in)                :: iParam
 474 |     Integer,intent(out),allocatable       :: oBlocks(:,:)
 475 |     !Local vars
 476 |     integer                           :: i
 477 |     integer                           :: BlockSize
 478 |     
 479 |     BlockSize=max(iNoRows/iThreads,1) 
 480 |     BlockSize =mod(IParam-mod(BlockSize,iParam),iParam)+BlockSize
 481 |     Allocate(oBlocks(iThreads,2))
 482 |     do i=1,iThreads
 483 |         oBlocks(i,1) = (i-1)*(BlockSize)+1
 484 |         oBlocks(i,2) = min(i*BlockSize,iNoRows)
 485 |     end do     
 486 |     print*,'Blocks',oBlocks
 487 |   end subroutine BlockSplitter  
 488 |   
 489 |   
 490 | subroutine BLUT_Factorization_real(n,iVals,iColIdxs,iBRowIdxs,lfil,droptol,alu,jlu,ju,iwk,w,jw,Offset,ierr)
 491 | !Tue November 2014
 492 | !This routine is designed to compute an Incomplete Cholesky factorization, of a sparse linear system, limited by  on a system.
 493 | !It has been made by modifying an old ILUT preconditioner made by Saad.
 494 | !A slightly modfified description of the ILUT preconditioner is found below. 
 495 | !
 496 | !IO
 497 | !
 498 | !n                          -> Number of rows in input matrix
 499 | !iVals,iColIdxs             -> Is the values and Column indexes of the input matrix stored in CSR format
 500 | !iBRowIdxs                  -> Is the cutoff row indices for the input matrix. iBRowIdxs(i,1) is the first element in row i we use, iBRowIdxs(i,2) is the last element in row i we use
 501 | !lfil                       -> The fill in parameter, each row of L and each row of U will have a maximum of lfil elements (excluding the diagonal element)
 502 | !droptol                    -> The minimum threshold before we drop a term.
 503 | !alu,jlu,ju                 -> the output LU matrix stored in modified sparse row format
 504 | !iwk                        -> the length of arrays alu,jlu
 505 | !w                          -> work array
 506 | !jw                         -> work array
 507 | !Offset                     -> The offset between the input matrix idx and the LU idx we want to save them in.
 508 | !ierr                       -> status of subroutine
 509 | !
 510 | !----------------------------------------------------------------------*
 511 | !                      *** ILUT preconditioner ***                     *
 512 | !      incomplete LU factorization with dual truncation mechanism      *
 513 | !----------------------------------------------------------------------*
 514 | !c     Author: Yousef Saad *May, 5, 1990, Latest revision, August 1996  *
 515 | !c----------------------------------------------------------------------*
 516 | !c PARAMETERS                                                           
 517 | !c-----------                                                           
 518 | !c
 519 | !c on entry:
 520 | !c========== 
 521 | !c n       = integer. The row dimension of the matrix A. The matrix 
 522 | !c
 523 | !c iVals,iColIdxs, = matrix stored in Compressed Sparse Row format.              
 524 | !c iBRowIdxs is special!
 525 | !c lfil    = integer. The fill-in parameter. Each row of L and each row
 526 | !c           of U will have a maximum of lfil elements (excluding the 
 527 | !c           diagonal element). lfil must be .ge. 0.
 528 | !c           ** WARNING: THE MEANING OF LFIL HAS CHANGED WITH RESPECT TO
 529 | !c           EARLIER VERSIONS. 
 530 | !c
 531 | !c droptol = real*8. Sets the threshold for dropping small terms in the
 532 | !c           factorization. See below for details on dropping strategy.
 533 | !c
 534 | !c  
 535 | !c iwk     = integer. The lengths of arrays alu and jlu. If the arrays
 536 | !c           are not big enough to store the ILU factorizations, ilut
 537 | !c           will stop with an error message. 
 538 | !c
 539 | !c On return:
 540 | !c===========
 541 | !c
 542 | !c alu,jlu = matrix stored in Modified Sparse Row (MSR) format containing
 543 | !c           the L and U factors together. The diagonal (stored in
 544 | !c           alu(1:n) ) is inverted. Each i-th row of the alu,jlu matrix
 545 | !c           contains the i-th row of L (excluding the diagonal entry=1)
 546 | !c           followed by the i-th row of U.
 547 | !c
 548 | !c ju      = integer array of length n containing the pointers to
 549 | !c           the beginning of each row of U in the matrix alu,jlu.
 550 | !c
 551 | !c ierr    = integer. Error message with the following meaning.
 552 | !c           ierr  = 0    --> successful return.
 553 | !c           ierr .gt. 0  --> zero pivot encountered at step number ierr.
 554 | !c           ierr  = -1   --> Error. input matrix may be wrong.
 555 | !c                            (The elimination process has generated a
 556 | !c                            row in L or U whose length is .gt.  n.)
 557 | !c           ierr  = -2   --> The matrix L overflows the array al.
 558 | !c           ierr  = -3   --> The matrix U overflows the array alu.
 559 | !c           ierr  = -4   --> Illegal value for lfil.
 560 | !c           ierr  = -5   --> zero row encountered.
 561 | !c
 562 | !c work arrays:
 563 | !c=============
 564 | !c jw      = integer work array of length 2*n.
 565 | !c w       = real work array of length n+1.
 566 | !c  
 567 | !c----------------------------------------------------------------------
 568 | !c w, ju (1:n) store the working array [1:ii-1 = L-part, ii:n = u] 
 569 | !c jw(n+1:2n)  stores nonzero indicators
 570 | !c 
 571 | !c Notes:
 572 | !c ------
 573 | !c The diagonal elements of the input matrix must be  nonzero (at least
 574 | !c 'structurally'). 
 575 | !c
 576 | !c----------------------------------------------------------------------* 
 577 | !c---- Dual drop strategy works as follows.                             *
 578 | !c                                                                      *
 579 | !c     1) Theresholding in L and U as set by droptol. Any element whose *
 580 | !c        magnitude is less than some tolerance (relative to the abs    *
 581 | !c        value of diagonal element in u) is dropped.                   *
 582 | !c                                                                      *
 583 | !c     2) Keeping only the largest lfil elements in the i-th row of L   * 
 584 | !c        and the largest lfil elements in the i-th row of U (excluding *
 585 | !c        diagonal elements).                                           *
 586 | !c                                                                      *
 587 | !c Flexibility: one  can use  droptol=0  to get  a strategy  based on   *
 588 | !c keeping  the largest  elements in  each row  of L  and U.   Taking   *
 589 | !c droptol .ne.  0 but lfil=n will give  the usual threshold strategy   *
 590 | !c (however, fill-in is then mpredictible).                             *
 591 | !c----------------------------------------------------------------------*      
 592 |       implicit none  
 593 |       integer,intent(in)                :: n,lfil,iwk,offset
 594 |       real*8,intent(in),dimension(:)    :: iVals
 595 |       real*8,intent(inout),dimension(:) :: alu,w
 596 |       real*8,intent(in)                 :: droptol
 597 |       
 598 |       integer,intent(in),dimension(:)   :: iColIdxs
 599 |       integer,intent(in),dimension(:,:) :: iBRowIdxs
 600 |       integer,intent(inout),dimension(:):: jlu,ju,jw
 601 |       integer,intent(inout)             :: ierr
 602 | 
 603 | !c     locals
 604 |       integer ju0,k,j1,j2,j,ii,i,lenl,lenu,jj,jrow,jpos,length 
 605 |       real*8 tnorm, t, abs, s, fact 
 606 |       if (lfil .lt. 0) goto 998
 607 | !c-----------------------------------------------------------------------
 608 | !c     initialize ju0 (points to next element to be added to alu,jlu)
 609 | !c     and pointer array.
 610 | !c-----------------------------------------------------------------------
 611 |       ju0 = n+2
 612 |       jlu(1) = ju0
 613 | !c
 614 | !c     initialize nonzero indicator array. 
 615 | !c
 616 |       do j=1,n
 617 |          jw(n+j)  = 0
 618 |       end do
 619 | !c-----------------------------------------------------------------------
 620 | !c     beginning of main loop.
 621 | !c-----------------------------------------------------------------------
 622 | do ii = 1, n
 623 |          j1 = iBRowIdxs(ii,1)
 624 |          j2 = iBRowIdxs(ii,2)
 625 |         !j1=iRowIdxs(ii)
 626 |         !j2=iRowIdxs(ii+1)-1
 627 |          tnorm = 0.0d0
 628 |          do k=j1,j2
 629 |             tnorm = tnorm+abs(iVals(k))
 630 |          end do
 631 |          if (tnorm .eq. 0.0) goto 999
 632 |          tnorm = tnorm/real(j2-j1+1)
 633 | !c     
 634 | !c     unpack L-part and U-part of row of A in arrays w 
 635 | !c     
 636 |          lenu = 1
 637 |          lenl = 0
 638 |          jw(ii) = ii
 639 |          w(ii) = 0.0
 640 |          jw(n+ii) = ii
 641 | 
 642 |          do j = j1, j2
 643 |             k = iColIdxs(j)-offset
 644 |             t = iVals(j)
 645 |             if (k .lt. ii) then
 646 |                lenl = lenl+1
 647 |                jw(lenl) = k
 648 |                w(lenl) = t
 649 |                jw(n+k) = lenl
 650 |             else if (k .eq. ii) then
 651 |                w(ii) = t
 652 |             else
 653 |                lenu = lenu+1
 654 |                jpos = ii+lenu-1 
 655 |                jw(jpos) = k
 656 |                w(jpos) = t
 657 |                jw(n+k) = jpos
 658 |             endif
 659 |          end do
 660 |          jj = 0
 661 |          length = 0 
 662 | !c     
 663 | !c     eliminate previous rows
 664 | !c     
 665 |  150     jj = jj+1
 666 |          if (jj .gt. lenl) goto 160
 667 | !c-----------------------------------------------------------------------
 668 | !c     in order to do the elimination in the correct order we must select
 669 | !c     the smallest column index among jw(k), k=jj+1, ..., lenl.
 670 | !c-----------------------------------------------------------------------
 671 |          jrow = jw(jj)
 672 |          k = jj
 673 | !c     
 674 | !c     determine smallest column index
 675 | !c     
 676 |          do j=jj+1,lenl
 677 |             if (jw(j) .lt. jrow) then
 678 |                jrow = jw(j)
 679 |                k = j
 680 |             end if
 681 |         end do
 682 |          
 683 |          if (k .ne. jj) then
 684 | !c     exchange in jw
 685 |             j = jw(jj)
 686 |             jw(jj) = jw(k)
 687 |             jw(k) = j
 688 | !c     exchange in jr
 689 |             jw(n+jrow) = jj
 690 |             jw(n+j) = k
 691 | !c     exchange in w
 692 |             s = w(jj)
 693 |             w(jj) = w(k)
 694 |             w(k) = s
 695 |          endif
 696 | !c
 697 | !c     zero out element in row by setting jw(n+jrow) to zero.
 698 | !c     
 699 |          jw(n+jrow) = 0
 700 | !c
 701 | !c     get the multiplier for row to be eliminated (jrow).
 702 | !c     
 703 |          fact = w(jj)*alu(jrow)
 704 |          if (abs(fact) .le. droptol) goto 150
 705 | !c     
 706 | !c     combine current row and row jrow
 707 | !c
 708 |          do k = ju(jrow), jlu(jrow+1)-1
 709 |             s = fact*alu(k)
 710 |             j = jlu(k)
 711 |             jpos = jw(n+j)
 712 |             if (j .ge. ii) then
 713 | !c     
 714 | !c     dealing with upper part.
 715 | !c     
 716 |                if (jpos .eq. 0) then
 717 | !c
 718 | !c     this is a fill-in element
 719 | !c     
 720 |                   lenu = lenu+1
 721 |                   if (lenu .gt. n) goto 995
 722 |                   i = ii+lenu-1
 723 |                   jw(i) = j
 724 |                   jw(n+j) = i
 725 |                   w(i) = - s
 726 |                else
 727 | !c
 728 | !c     this is not a fill-in element 
 729 | !c
 730 |                   w(jpos) = w(jpos) - s
 731 | 
 732 |                endif
 733 |             else
 734 | !c     
 735 | !c     dealing  with lower part.
 736 | !c     
 737 |                if (jpos .eq. 0) then
 738 | !c
 739 | !c     this is a fill-in element
 740 | !c     
 741 |                   lenl = lenl+1
 742 |                   if (lenl .gt. n) goto 995
 743 |                   jw(lenl) = j
 744 |                   jw(n+j) = lenl
 745 |                   w(lenl) = - s
 746 |                else
 747 | !c     
 748 | !c     this is not a fill-in element 
 749 | !c     
 750 |                   w(jpos) = w(jpos) - s
 751 |                endif
 752 |             endif
 753 |         end do
 754 | !c     
 755 | !c     store this pivot element -- (from left to right -- no danger of
 756 | !c     overlap with the working elements in L (pivots). 
 757 | !c     
 758 |          length = length+1 
 759 |          w(length) = fact
 760 |          jw(length)  = jrow
 761 |          goto 150
 762 |          
 763 |  160     continue
 764 | !c     
 765 | !c     reset double-pointer to zero (U-part)
 766 | !c     
 767 |          do k=1, lenu
 768 |             jw(n+jw(ii+k-1)) = 0
 769 |          end do
 770 | !c     
 771 | !c     update L-matrix
 772 | !c     
 773 |          lenl = length 
 774 |          length = min0(lenl,lfil)
 775 | !c     
 776 | !c     sort by quick-split
 777 | !c
 778 |          call qsplit (w,jw,lenl,length)
 779 | !c
 780 | !c     store L-part
 781 | !c 
 782 |          do k=1, length 
 783 |             if (ju0 .gt. iwk) goto 996
 784 |             alu(ju0) =  w(k)
 785 |             jlu(ju0) =  jw(k)
 786 |             ju0 = ju0+1
 787 |         end do
 788 | !c     
 789 | !c     save pointer to beginning of row ii of U
 790 | !c     
 791 |          ju(ii) = ju0
 792 | !c
 793 | !c     update U-matrix -- first apply dropping strategy 
 794 | !c
 795 |          length = 0
 796 |          do k=1, lenu-1
 797 |             if (abs(w(ii+k)) .gt. droptol*tnorm) then 
 798 |                length = length+1
 799 |                w(ii+length) = w(ii+k) 
 800 |                jw(ii+length) = jw(ii+k) 
 801 |             endif
 802 |          enddo
 803 |          lenu = length+1
 804 |          length = min0(lenu,lfil)
 805 | 
 806 |          call qsplit(w(ii+1:ii+lenu), jw(ii+1:ii+lenu), lenu-1,length)
 807 | !c
 808 | !c     copy
 809 | !c 
 810 |          t = abs(w(ii))
 811 |          if (length + ju0 .gt. iwk) goto 997
 812 |          do k=ii+1,ii+length-1 
 813 |             jlu(ju0) = jw(k)
 814 |             alu(ju0) = w(k)
 815 |             t = t + abs(w(k) )
 816 |             ju0 = ju0+1
 817 |          end do
 818 | !c     
 819 | !c     store inverse of diagonal element of u
 820 | !c     
 821 |          if (w(ii) .eq. 0.0) w(ii) = (0.0001 + droptol)*tnorm
 822 |      
 823 |          alu(ii) = 1.0d0/ w(ii) 
 824 | !c     
 825 | !c     update pointer to beginning of next row of U.
 826 | !c     
 827 |          jlu(ii+1) = ju0
 828 | !c-----------------------------------------------------------------------
 829 | !c     end main loop
 830 | !c-----------------------------------------------------------------------
 831 |  end do
 832 |       ierr = 0
 833 |       return
 834 | !c
 835 | !c     incomprehensible error. Matrix must be wrong.
 836 | !c     
 837 |  995  ierr = -1
 838 |       return
 839 | !c     
 840 | !c     insufficient storage in L.
 841 | !c     
 842 |  996  ierr = -2
 843 |       return
 844 | !c     
 845 | !c     insufficient storage in U.
 846 | !c     
 847 |  997  ierr = -3
 848 |       return
 849 | !c     
 850 | !c     illegal lfil entered.
 851 | !c     
 852 |  998  ierr = -4
 853 |       return
 854 | !c     
 855 | !c     zero row encountered
 856 | !c     
 857 |  999  ierr = -5
 858 |       return
 859 | !c----------------end-of-ilut--------------------------------------------
 860 | end subroutine BLUT_Factorization_real
 861 | 
 862 |   
 863 | subroutine Apply_BPrecondition_BLUT(iBlocks, iRHS, oX, iLUVals, iLUIdxs, iLUDiag) 
 864 |     !Tue November 2014
 865 |     !This routine does a forward and backward block solve with the Block LUT matrix
 866 |     !
 867 |     !IO
 868 |     !
 869 |     !iBlocks                 -> The blocksplitting of the Matrix
 870 |     !iRHS                    -> The righthandside we update our solution after
 871 |     !oX                      -> The output solution
 872 |     !iLUVals,iLUIdxs,iLUDiag -> The Block LU matrix where each block is a separate matrix stored in modfied sparse row format.
 873 | 	integer,intent(in),dimension(:,:) :: iBlocks
 874 |     real*8,intent(in),dimension(:)    :: iRHS
 875 |     real*8,intent(out),dimension(:)   ::  oX
 876 |     real*8,intent(in),dimension(:,:)  :: iLUVals
 877 |     integer,intent(in),dimension(:,:) :: iLUIdxs, iLUDiag    
 878 |     !local variables
 879 |     integer             :: i,k,j,threads,m,tmp
 880 |     
 881 |     
 882 |     threads=size(iBlocks,1)
 883 | !$OMP PARALLEL IF(StartOpenMP(2)) DEFAULT(NONE)  SHARED(iBlocks,threads,iRHS,oX,iLUVals,iLUIdxs,iLUDiag) PRIVATE(k,i,j,m,tmp)
 884 | !$OMP DO SCHEDULE(STATIC)
 885 |     do k=1,threads
 886 |         do i = iBlocks(k,1),iblocks(k,2)
 887 |            oX(i) = iRHS(i)
 888 |         end do
 889 | !
 890 | ! forward solve (with U^T)
 891 | !
 892 |        tmp=iBlocks(k,1)-1
 893 |        m=0
 894 |         do i = iBlocks(k,1),iblocks(k,2)
 895 |            m=m+1
 896 |            oX(i) = oX(i) * iLUVals(m,k)
 897 |            do j=iLUDiag(m,k),iLUIdxs(m+1,k)-1
 898 |               oX(tmp+iLUIdxs(j,k)) = oX(tmp+iLUIdxs(j,k)) - iLUVals(j,k)* oX(i)
 899 |            end do
 900 |         end do
 901 | !     
 902 | !     backward solve (with L^T)
 903 | !    
 904 |       do i = iBlocks(k,2),iBlocks(k,1),-1
 905 |        m=i-tmp
 906 | 	   do j=iLUIdxs(m,k),iLUDiag(m,k)-1
 907 |               oX(tmp+iLUIdxs(j,k)) = ox(tmp+iLUIdxs(j,k)) - iLUVals(j,k)*oX(i)
 908 |        end do
 909 |       end do
 910 |     end do
 911 | !$OMP END DO
 912 | !$OMP END PARALLEL      
 913 | end subroutine Apply_BPrecondition_BLUT  
 914 |   
 915 |    subroutine Apply_BPrecondition_BSGS(iColIdxs,iRowIdxs,iVals,iBRowIdx,iDiagIdx,iInvDiag,iRhS,ioX,Blocks)
 916 |     !Tue, August 2014
 917 |     !This routine does a forward and backward block solve with the Block SOR preconditioner
 918 |     !This algorithm is described in 266-267 in the book "Iterative methods for sparse linear systems" in the normal non-block edition.
 919 |     !Note that the SGS preconditioner is not calculated beforehand, but rather calculated on the fly during the applying of it.
 920 |     !IO
 921 |     !ispA     -> Input, Sparse matrix of the linear system
 922 |     !iDiagIdx -> Input, a vector containing indexes to the diagonal entries in the CSR matrix ispA.
 923 |     !iInvDiag -> Input, a vector containing the inverse values of the diagonal element in ispA.   
 924 |     !iRhS     -> Input, the right hand side of the linear system we try to solve.
 925 |     !ioX      -> Input, Solution estimate to update iteratively. Output, our new updated solution
 926 |     !Blocks   -> Input, contains the blocksplitting of ispA Blocks(i,1) contains the rowIdx of the first row in the i'th block, Blocks(i,2) contains the last.
 927 |     implicit none
 928 |     integer,intent(in),dimension(:)   :: iColIdxs,iRowIdxs
 929 |     real*8,intent(in),dimension(:)    :: iVals
 930 |     integer,intent(in),dimension(:,:) :: iBRowIdx
 931 |     integer,intent(in),dimension(:)   :: iDiagIdx
 932 |     real*8,intent(in),dimension(:)    :: iInvDiag
 933 |     real*8,intent(in),dimension(:)    :: iRHS
 934 |     real*8,intent(inout),dimension(:) :: ioX
 935 |     integer,intent(in)                :: Blocks(:,:)
 936 |     !Local vars
 937 |     integer             :: i,j,k
 938 |     real*8              :: tmp
 939 |     real*8              :: Factor
 940 |     integer             :: threads
 941 |     real*8              :: omega
 942 |     threads=size(Blocks,1)
 943 |     
 944 | !$OMP PARALLEL IF(StartOpenMP(2)) DEFAULT(NONE) SHARED(omega,Blocks,iDiagIdx,iInvDiag,iColIdxs,iRowIdxs,iVals,ioX,irhs,threads,iBRowIdx) PRIVATE(i,k,j)
 945 | !c$OMP CRITICAL
 946 | !$OMP DO SCHEDULE(STATIC)
 947 |     do k=1,threads
 948 |       !We start by doing the  forward step
 949 |       do i = Blocks(k,1),Blocks(k,2)
 950 |         ioX(i) = iRHS(i)
 951 |         do j = iBRowIdx(i,1) , iDiagIdx(i)-1
 952 |             ioX(i) = ioX(i) - iVals(j)*iInvDiag(iColIdxs(j))*ioX(iColIdxs(j))
 953 |         end do
 954 |       end do
 955 | 
 956 |       !Now comes the backward step    
 957 |       do i = Blocks(k,2),Blocks(k,1),-1
 958 |         do j = iBRowIdx(i,2) ,iDiagIdx(i)+1,-1
 959 |               ioX(i) = ioX(i) - iVals(j)*ioX(iColIdxs(j))
 960 |         end do
 961 |         ioX(i) = ioX(i)*iInvDiag(i)        
 962 |       end do
 963 |     end do  
 964 | !$OMP END DO
 965 | !c$OMP END CRITICAL
 966 | !$OMP END PARALLEL
 967 |    end subroutine Apply_BPrecondition_BSGS     
 968 |    
 969 |   
 970 |   subroutine    SetSolverSettings(SolverSettings,A,iUseRCM,iUseNormalization,iSolverType,iIter,iBlockSize,iFillingFactor,iStepLimit)
 971 |     !Tue Dec, 2014  
 972 |     !This routine Sets the solversettings prior to a linear solve.
 973 |     !NOTE that this also calculates the precondition matrix, which the caller takes responsibility to free. The freeing should be done by calling the FreeSolverSettings subroutine.
 974 |     !any optional value you wish chosen by the subroutine should be set to -1
 975 |     !IO
 976 |     !SolverSettings          -> This is where we save all the relevant settings about the solve we want to do - note if we have preconditioner this is also saved in here.
 977 |     !A                       -> input matrix, in CSR format, and of the type TSparseMat
 978 |     !(Optional)   The rest are optional parameters which can be chosen manually, any parameter set to -1 will be chosen automatically.  
 979 |     !iUseNormalization       -> (optional) Use Normalization 
 980 |     !iSolverType             -> (optional) manually chooses the solvertype, if none is selected or set to -1, a fitting type will be chosen based on the problem.
 981 |     !                           1  SGS
 982 |     !                           2  BLUT
 983 |     !                           3  Pardiso (not implemented here)
 984 |     !iIter                   -> (optional) manually set the maximum iterations. (default is DEFAULT_MAXITERATIONS)
 985 |     !iBaseBlockSize          -> (optional) The blocksize will be a multiplum of this number for all blocks if specified.
 986 |     !                            this is used for blocksplitting to make sure we don't split in the middle of a block.
 987 |     !iFillingFactor          -> (optional) Determines the number of elements allowed in the IC factorization. Factor is based on the average number of elements in iA; default value 1.5
 988 |     !iStepLimit              -> (optional) sets the relative minimum difference an iterative step should make in the solution 
 989 |     !                           This could be important to set when using block solvers where convergence to the solution is not guaranteed
 990 |     use omp_lib
 991 |     use msMat
 992 |     use mRCM
 993 |     implicit none  
 994 |     type (TSolverSettings),intent(inout)           :: SolverSettings
 995 |     type (TSparseMat),     intent(inout)           :: A
 996 |     integer, intent(in),optional                   :: iUseRCM
 997 |     integer, intent(in),optional                   :: iUseNormalization
 998 |     integer, intent(in),optional                   :: iSolverType
 999 |     integer, intent(in),optional                   :: iIter 
1000 |     integer, intent(in),optional                   :: iBlockSize      
1001 |     real*8, intent(in),optional                   :: iFillingFactor
1002 |     integer, intent(in),optional                   :: iStepLimit
1003 |     character*256                     :: S
1004 |     real*8                            :: NParFactor,DampCutOff,StartTolerance,EndTolerance,DampVal
1005 |     real*8,allocatable                :: Diag(:)
1006 |     logical                           :: UseSlowButProvenSettings=.false.
1007 |     logical                           :: IsLayeredInv
1008 |     integer                           :: ErrorCode,Param,NModPar,NParSec,i,j,MaxWidth
1009 |     integer                           :: OldThreads
1010 |     
1011 |     real*8                            :: Elements_pr_row,DiagDom,RunTimeBegin,RunTimeEnd
1012 |     !Tweakable factors
1013 |     integer,parameter                 :: DEFAULT_USERCM = 0
1014 |     integer,parameter                 :: DEFAULT_USENORMALIZATION = 1
1015 |     integer,parameter                 :: DEFAULT_INFORMATIONLEVEL = 1
1016 |     integer,parameter                 :: DEFAULT_NORHS = 1
1017 |     integer,parameter                 :: DEFAULT_MAXITERATIONS = 500
1018 |     
1019 |     real*8,parameter                  :: DEFAULT_STEPLIMIT = 0 
1020 |     real*8,parameter                  :: DEFAULT_FILLINGFACTOR=2.5
1021 |     real*8,parameter                  :: MAXDIAGDOM=1        !Not made to be adjusted just yet    
1022 |     real*8,parameter                  :: SGS_THRESHOLD=0.1  
1023 |     
1024 |     logical,parameter                 :: DEBUG_MODE = .FALSE.
1025 |     !Test
1026 |     real*8                            :: DiagDomTest,test
1027 |     integer                           :: NumberOfBlocks
1028 |     integer                           :: Error
1029 |     real*8,dimension(:),allocatable   :: DummyVal
1030 |     logical :: Dummy
1031 |     
1032 |     
1033 |     solverSettings%UseRCM=LoadOptionalParam(-1,iUseRCM)
1034 |     if(solverSettings%UseRCM.eq.-1) solverSettings%UseRCM=DEFAULT_USERCM
1035 |     if (solverSettings%UseRCM.eq.1) then
1036 |       if(allocated(solverSettings%RCMIndices)) deallocate(solverSettings%RCMIndices)   
1037 |       allocate(solverSettings%RCMIndices(A%NoRows))  
1038 |       call RCMSort(A,solverSettings%RCMIndices)
1039 |     end if
1040 |     
1041 |     solverSettings%UseNormalization=LoadOptionalParam(-1,iUseNormalization)
1042 |     if(solverSettings%UseNormalization.eq.-1) solverSettings%UseNormalization=DEFAULT_USENORMALIZATION
1043 |     if(present(iSolverType)) then  ! remove normalization for direct solver
1044 |         if(iSolverType.eq.3) solverSettings%UseNormalization = 0
1045 |     endif
1046 |     
1047 |     SolverSettings%FillingFactor=LoadOptionalParam(-1,iFillingFactor)
1048 |     if(SolverSettings%FillingFactor.eq.-1) SolverSettings%FillingFactor=DEFAULT_FILLINGFACTOR
1049 |     
1050 |     if (solverSettings%UseNormalization) then
1051 |       if(allocated(SolverSettings%ReNormalization)) deallocate(SolverSettings%ReNormalization)   
1052 |       if(allocated(SolverSettings%Normalization)) deallocate(SolverSettings%Normalization)   
1053 |       allocate(SolverSettings%ReNormalization(A%NoRows))  
1054 |       allocate(SolverSettings%Normalization(A%NoRows))
1055 |       call SMatGetDiag(A,SolverSettings%ReNormalization,.TRUE.)
1056 |       SolverSettings%ReNormalization = sqrt(SolverSettings%ReNormalization)
1057 |       SolverSettings%Normalization = 1d0/SolverSettings%ReNormalization      
1058 |       call SMatDAD(A,SolverSettings%Normalization)
1059 |     end if
1060 |     
1061 |     SolverSettings%IsPrepared = .False.
1062 |     SolverSettings%output%Success = .False.
1063 |     
1064 |     SolverSettings%IsComplex = A%IsComplex
1065 |     SolverSettings%BlockSize=LoadOptionalParam(-1,iBlockSize)
1066 |     SolverSettings%RelTol=1e-12
1067 |     SolverSettings%AbsTol=1e-9
1068 |       
1069 |     SolverSettings%PrecondTolerance=1e-6
1070 | 
1071 |     SolverSettings%Steplimit=LoadOptionalParam(-1,iSteplimit)
1072 |     if(SolverSettings%Steplimit.eq.-1) SolverSettings%Steplimit=DEFAULT_STEPLIMIT
1073 |       
1074 |             
1075 |     SolverSettings%MaxIter=LoadOptionalParam(-1,iIter)
1076 |     if(SolverSettings%MaxIter.eq.-1) SolverSettings%MaxIter=DEFAULT_MAXITERATIONS     
1077 |       
1078 |     if(.not.(iSolverType.eq.3)) call SMatGetDiagonalDominant(A,SolverSettings%DiagDom)
1079 |       
1080 |     !SolverType is determined
1081 |     SolverSettings%SolverType=LoadOptionalParam(-1,iSolverType)
1082 |     if (SolverSettings%SolverType.le.0)  call SelectSolver(A,SolverSettings,SolverSettings%DiagDom,SGS_threshold)
1083 |     
1084 | 	if(.not.(isolvertype.eq.3)) then
1085 |       Elements_pr_row = (1d0*A%NoElements)/(1d0*A%NoRows)
1086 |       !Maximum number of precondition elements is chosen
1087 |       MaxWidth=SMatFindMaxColDif(A)
1088 |       SolverSettings%PrecondMaxFill=NINT(sqrt(SolverSettings%DiagDom)*SolverSettings%FillingFactor*Elements_pr_row)
1089 |       SolverSettings%PrecondMaxFill=min(A%NoRows,min(SolverSettings%PrecondMaxFill,2*MaxWidth))
1090 |     endif 
1091 | 	
1092 |     !DEBUG_MODE Only relevant in debug mode
1093 |     if (DEBUG_MODE) then
1094 |         print*,'IsComplex',SolverSettings%IsComplex
1095 |         print*,'Blocksize',SolverSettings%BlockSize
1096 |         print*,'Diagdom',SolverSettings%DiagDom
1097 |         print*,'SolverType',SolverSettings%SolverType
1098 |         print*,'SolverSettings%PrecondMaxFill',SolverSettings%PrecondMaxFill
1099 |         print*,'Elements_pr_row',Elements_pr_row     
1100 |     end if
1101 |       
1102 |     !reset pointers
1103 |     if (allocated(SolverSettings%PrecondVals))  deallocate(SolverSettings%PrecondVals,Stat=ErrorCode) 
1104 |     if (allocated(SolverSettings%PrecondCols))  deallocate(SolverSettings%PrecondCols,Stat=ErrorCode) 
1105 |     if (allocated(SolverSettings%PrecondRows))  deallocate(SolverSettings%PrecondRows,Stat=ErrorCode) 
1106 |     if (allocated(SolverSettings%BPrecondVals)) deallocate(SolverSettings%BPrecondVals,Stat=ErrorCode) 
1107 |     if (allocated(SolverSettings%BPrecondIdxs)) deallocate(SolverSettings%BPrecondIdxs,Stat=ErrorCode) 
1108 |     if (allocated(SolverSettings%BPrecondDiag)) deallocate(SolverSettings%BPrecondDiag,Stat=ErrorCode) 
1109 |     if (allocated(SolverSettings%BlockS))       deallocate(SolverSettings%BlockS,Stat=ErrorCode) 
1110 |     if (allocated(SolverSettings%PrecondcVals)) deallocate(SolverSettings%PrecondcVals,Stat=ErrorCode) 
1111 |    
1112 |     Dummy=StartOpenMP(2)
1113 |     NumberOfBlocks=omp_get_max_threads()
1114 |     if(.not.(iSolvertype.eq.3)) call BlockSplitter(A%NoRows,NumberOfBlocks,SolverSettings%BlockSize,SolverSettings%BlockS)
1115 |     SolverSettings%NBlocks=size(SolverSettings%BlockS,1)
1116 |     !Factorization of the preconditioner is done
1117 |     RuntimeBegin =  omp_get_wtime()    
1118 |     SELECT CASE(SolverSettings%SolverType)    
1119 |     CASE(2)
1120 |         !BLUT
1121 |         call Compute_Precondition_BLUT_real(A,SolverSettings)
1122 |     CASE(3)
1123 |         !Pardiso (not implemented here)        
1124 |     END SELECT
1125 |     RuntimeEnd =  omp_get_wtime()    
1126 |     SolverSettings%output%FactorizationTime=RuntimeEnd-RunTimeBegin   
1127 |     print*,'Factorization Time:',SolverSettings%output%FactorizationTime
1128 |     SolverSettings%IsPrepared=.true. 
1129 |     return 
1130 |   end subroutine SetSolverSettings
1131 |  
1132 |      subroutine SolverNormalizeMatrix(SolverSettings,ioA,Norm)
1133 | ! Kristoffer Andersen, Dec 2016
1134 | ! Normalizes the input matrix ioA using the normalization vectors in 
1135 | ! solversettings. There is no check on the size of the vectors compared to the matrix
1136 | ! Norm controls whether normalization or the renormalization vector is used. 
1137 | ! If SolverSettings%UseNormalization = .false. nothing is performed
1138 | ! IO
1139 | ! I - SolverSettings - settings with the normalization vectors.
1140 | !   If normalization is not used nothing happens to ioA
1141 | ! IO - ioA - the matrix which is transformed to D*A*D
1142 | ! I,optional - Norm - controls whether normalization or renormalization is used. 
1143 | !   .true. (default) for normalization
1144 | !   .false. for renormalization
1145 | !
1146 |     implicit none
1147 |     type(TSolverSettings),intent(in)     :: SolverSettings 
1148 |     type(Tsparsemat),intent(inout)       :: ioA
1149 |     logical,optional,intent(in)          :: Norm
1150 |     !var
1151 |     logical  :: normalize
1152 |     if(SolverSettings%USENORMALIZATION) then ! only if normalization is used
1153 |         normalize = .true.
1154 |         if(present(Norm)) normalize = norm
1155 |         
1156 |         if(normalize) then
1157 |             call SMatDAD(ioA,SolverSettings%Normalization)
1158 |         else
1159 |             call SMatDAD(ioA,SolverSettings%ReNormalization)
1160 |         endif
1161 |     endif
1162 |     
1163 |     RETURN
1164 |     END subroutine SolverNormalizeMatrix
1165 |   
1166 |   
1167 |   subroutine SelectSolver(iA,SolverSettings,iDiagDom,iSGS_threshold)
1168 |    !Tue Dec, 2014  
1169 |    !This routine selects the most appropiete solvertype based on the diagonal dominance and various thresholds.
1170 |    !IO
1171 |    !iA                      -> input matrix, in CSR format, and of the type TSparseMat
1172 |    !SolverSettings          -> This is where we save all the relevant settings about the solve we want to do.
1173 |    !iDiagDom                -> Maximum diagonal dominance factor for the matrix.
1174 |    !iSGS_threshold          -> threshold between BLUT and SGS preconditioner
1175 |     implicit none  
1176 |     type (TSparseMat),intent(in)           :: iA
1177 |     type(TSolverSettings),intent(inout)    :: SolverSettings
1178 |     real*8,intent(in)                      :: iDiagDom
1179 |     real*8,intent(in)                      :: iSGS_threshold    
1180 |     
1181 |         if (SolverSettings%IsComplex) then
1182 |               SolverSettings%SolverType = 12 !LU0
1183 |         else
1184 |           if (iDiagDom.gt.iSGS_threshold) then
1185 |             !BLUT
1186 |             SolverSettings%SolverType=2
1187 |           else
1188 |             !SGS
1189 |             SolverSettings%SolverType=1            
1190 |           end if    
1191 |         end if
1192 |   end subroutine SelectSolver
1193 |   
1194 |   
1195 |   
1196 |     subroutine CheckSolverSettings(iSolverSettings)
1197 |     !Tue, dec 2014
1198 |     !Checks whether the solversettings have been set and if informationlevel is sufficiently high it also reports the solver settings
1199 |     !
1200 |     !use mError
1201 |     implicit none
1202 |     type(TSolverSettings),intent(in)    :: iSolverSettings
1203 |     character*200                       :: S
1204 |     integer         :: Output
1205 |     logical         :: PrintToLog
1206 |     
1207 |     if(.NOT.iSolverSettings%IsPrepared) goto 901
1208 |         
1209 |     return
1210 |     901 continue
1211 |     print*, 'Critical error! Solversettings not set, make sure SetSolverSettings is called before running the solver'
1212 |     stop
1213 |     end subroutine CheckSolverSettings
1214 | 
1215 |  
1216 |     logical function isSolverNormalized(SolverSettings)
1217 |     ! Kristoffer Andersen, Dec 2016
1218 |     ! return the normalization state of the solver
1219 |     implicit none
1220 |     type(TSolverSettings),intent(in)     :: SolverSettings 
1221 |     isSolverNormalized = SolverSettings%USENORMALIZATION
1222 |     RETURN
1223 |     END FUNCTION isSolverNormalized
1224 |     
1225 |    
1226 |      subroutine FreeSolverSettings(SolverSettings)
1227 |     !Tue June 2015
1228 |     !This routine frees the memory allocated by SolverSettings.
1229 |     !This routine should always be called whenever Solversettings is no longer needed, since caller takes resposibility to free TSolverSettings.    
1230 |     !
1231 |     implicit none
1232 |     type(TSolverSettings),intent(inout)     :: SolverSettings 
1233 |     integer                                 :: ErrorCode
1234 | 	real*8                                  :: dummyVal(1)
1235 |     integer                                 :: dummyInt(1)
1236 |     integer                                 :: dummy
1237 | 
1238 |     !reset pointers
1239 |     if (allocated(SolverSettings%PrecondVals))  deallocate(SolverSettings%PrecondVals,Stat=ErrorCode) 
1240 |     if (allocated(SolverSettings%PrecondCols))  deallocate(SolverSettings%PrecondCols,Stat=ErrorCode) 
1241 |     if (allocated(SolverSettings%PrecondRows))  deallocate(SolverSettings%PrecondRows,Stat=ErrorCode) 
1242 |     if (allocated(SolverSettings%BPrecondVals)) deallocate(SolverSettings%BPrecondVals,Stat=ErrorCode) 
1243 |     if (allocated(SolverSettings%BPrecondIdxs)) deallocate(SolverSettings%BPrecondIdxs,Stat=ErrorCode) 
1244 |     if (allocated(SolverSettings%BPrecondDiag)) deallocate(SolverSettings%BPrecondDiag,Stat=ErrorCode) 
1245 |     if (allocated(SolverSettings%BlockS))       deallocate(SolverSettings%BlockS,Stat=ErrorCode) 
1246 |     if (allocated(SolverSettings%PrecondcVals)) deallocate(SolverSettings%PrecondcVals,Stat=ErrorCode) 
1247 |     if (allocated(SolverSettings%Renormalization)) deallocate(SolverSettings%Renormalization,Stat=ErrorCode) 
1248 |     if (allocated(SolverSettings%normalization)) deallocate(SolverSettings%normalization,Stat=ErrorCode) 
1249 |         
1250 |      end subroutine FreeSolverSettings
1251 |  
1252 |   
1253 |         subroutine qsplit(a,ind,n,ncut)
1254 |         implicit none
1255 |         real*8 a(n)
1256 |         integer ind(n), n, ncut,j,mid
1257 | !c-----------------------------------------------------------------------
1258 | !c     does a quick-sort split of a real array.
1259 | !c     on input a(1:n). is a real array
1260 | !c     on output a(1:n) is permuted such that its elements satisfy:
1261 | !c
1262 | !c     abs(a(i)) .ge. abs(a(ncut)) for i .lt. ncut and
1263 | !c     abs(a(i)) .le. abs(a(ncut)) for i .gt. ncut
1264 | !c
1265 | !c     ind(1:n) is an integer array which permuted in the same way as a(*).
1266 | !c-----------------------------------------------------------------------
1267 |         real*8 tmp, abskey
1268 |         integer itmp, first, last
1269 | !c-----
1270 |         first = 1
1271 |         last = n
1272 |         if (ncut .lt. first .or. ncut .gt. last) return
1273 | !c
1274 | !c     outer loop -- while mid .ne. ncut do
1275 | !c
1276 |  1      mid = first
1277 |         abskey = abs(a(mid))
1278 |         do 2 j=first+1, last
1279 |            if (abs(a(j)) .gt. abskey) then
1280 |               mid = mid+1
1281 | !c     interchange
1282 |               tmp = a(mid)
1283 |               itmp = ind(mid)
1284 |               a(mid) = a(j)
1285 |               ind(mid) = ind(j)
1286 |               a(j)  = tmp
1287 |               ind(j) = itmp
1288 |            endif
1289 |  2      continue
1290 | !c
1291 | !c     interchange
1292 | !c
1293 |         tmp = a(mid)
1294 |         a(mid) = a(first)
1295 |         a(first)  = tmp
1296 | !c
1297 |         itmp = ind(mid)
1298 |         ind(mid) = ind(first)
1299 |         ind(first) = itmp
1300 | !c
1301 | !c     test for while loop
1302 | !c
1303 |         if (mid .eq. ncut) return
1304 |         if (mid .gt. ncut) then
1305 |            last = mid-1
1306 |         else
1307 |            first = mid+1
1308 |         endif
1309 |         goto 1
1310 | !c----------------end-of-qsplit------------------------------------------
1311 | !c-----------------------------------------------------------------------
1312 |         end
1313 | 
1314 |      
1315 |     end module mSolver      
1316 | 


--------------------------------------------------------------------------------
/SparseMatrix.f90:
--------------------------------------------------------------------------------
  1 | module mSMat
  2 |   use mParallel, only: StartOpenMP, ReturnOldThreadNum
  3 |   implicit none 
  4 | !  logical   :: UseDenseAlgebra,VoxelInversion
  5 |   integer,private                :: FInitNoElements=1000000     !Initial allocation size of the matrices
  6 |   logical                        :: FCheckDoubleEntries=.false. !Debug var. When .true. the code checks for double entries 
  7 |   logical                        :: FShowMessages=.false.       !when adding new values to the matrix and checks that the column indices of the 
  8 |   logical,private                :: ColIdxCorrected=.false.     !Internal variable ensuring that column index correction is only applied once. VERY IMPORTANT
  9 |                                                                 !TO TOGGLE THIS VALUE TRUE IN THE BEGINNING OF ROUTINES THAT ALTER THE INDEX AND FALSE IN THE END!!!!!  
 10 |   logical,private                :: UseMKLSparseBLAS=.true.     !If true the intel math kernel library is used for high performance sparse BLAS. If false, full source routines from SparseKit are used.                                                                 
 11 |   logical                        :: SMatLogParameters=.true. 
 12 | !$OMP THREADPRIVATE(ColIdxCorrected)
 13 |                                                                 
 14 |   type TSparseMat
 15 |   !Public variables(General matrix information)                                                               
 16 |     character*(256)              :: Label
 17 |     integer                      :: NoElements               !Number of elements in the matrix 
 18 |     integer                      :: NoRows                   !Number of rows of the matrix
 19 |     integer                      :: NoCols                   !Number of columns of the matrix 
 20 |     
 21 |     logical                      :: IsComplex=.false.        !True if the matrix is complex                                                            
 22 | 
 23 |     real*8 , dimension(:),pointer    :: Vals=>null()             !Values of the matrix elements if Real
 24 |     Complex*16, dimension(:),pointer :: cVals=>null()            !Values of the matrix elements if Complex 
 25 |     integer,  dimension(:),pointer   :: RowIdxs =>null()         !Row indices of the elements
 26 |     integer,  dimension(:),pointer   :: ColIdxs=>null()          !Column indices of the elements    
 27 |   end type TSparseMat
 28 |   
 29 |   
 30 |     contains
 31 |     subroutine SMatCreate(ioSparseMat,iNoRows,iNoCols,iLabel,iIsComplex,iAllocSize)
 32 | !Casper Kirkegaard, November 2009.
 33 | !This routine creates a TSparseMat structure, ie. initializes
 34 | !parameters allocates memory etc.
 35 | !
 36 | !Updated 2014 Tue, Now it takes complex numbers as well
 37 | !
 38 | ! CK, october 2015.    
 39 | !  Removed initialization of value and column index vectors. It takes a long time when creating small matrices and is not necessary.
 40 | ! KRA, Dec 2016. Added optional allocation size. Has to be >= 1
 41 | !IO
 42 | !   oSparseMat  -> Sparse matrix to create
 43 | !   iNoRows     -> Number of rows in the matrix
 44 | !   iNoCols     -> Number of cols in the matrix 
 45 | !   iLabel      -> Label of the matrix. Setting this to eg. 'G' or 'B' is very useful
 46 | !                  when debugging or optimizing 
 47 | !   iIsComplex  -> Default=.false., if true it means that the matrix is complex
 48 | !   iAllocSize  -> Default=FInitNoElements, is the allocation size. The default value is a 
 49 | !                  model parameter. Has to >=1    
 50 |     implicit none
 51 |     type (TSparseMat),intent(inout) :: ioSparseMat
 52 |     integer,intent(in)              :: iNoRows
 53 |     integer,intent(in)              :: iNoCols
 54 |     character*(*),intent(in)        :: iLabel          
 55 |     logical,intent(in),optional     :: iIsComplex    
 56 |     integer,intent(in),optional     :: iAllocSize
 57 |     integer                         :: FInitNoElements=1000000     !Initial allocation size of the matrices
 58 | !   var
 59 |     integer :: NoAlloc
 60 | !Implementation
 61 |     !If the matrix already exists, destroy it and recreate.
 62 |     if (associated(ioSparseMat%Vals)) then
 63 |       call SMatDestroy(ioSparseMat)
 64 |     end if
 65 |     !First initialize the vars of the input matrix
 66 |     ioSparseMat%IsComplex=.false.
 67 |     NoAlloc = FInitNoElements
 68 | 
 69 |     if (present(iIsComplex)) ioSparseMat%IsComplex=iIsComplex
 70 |     if (present(iAllocSize)) then
 71 |         if(iAllocSize>0) NoAlloc = iAllocSize
 72 |     endif
 73 |     
 74 |     ioSparseMat%NoRows=iNoRows
 75 |     ioSparseMat%NoCols=iNoCols
 76 |     ioSparseMat%Label=iLabel
 77 |     ioSparseMat%NoElements=0
 78 |     !Now allocate an initial size for the arrays holding the matrix
 79 |     allocate(ioSparseMat%ColIdxs(NoAlloc))
 80 |     if (ioSparseMat%IsComplex) then
 81 |       allocate(ioSparseMat%cVals(NoAlloc))
 82 |     else
 83 |       allocate(ioSparseMat%Vals(NoAlloc))
 84 |     end if    
 85 |     allocate(ioSparseMat%RowIdxs(iNoRows+1)) 
 86 |     ioSparseMat%RowIdxs(:)=0
 87 |     end subroutine SMatCreate
 88 | 
 89 |     
 90 |     subroutine SMatWriteSparse(iA,iV,iNDigits,iName,iLabel)
 91 |     !Gianluca Fiandaca, April 2013, AU
 92 |     !This routine writes the sparse matrix iA in 3 different files, one containing
 93 |     !the matrix row indices, one containing the matrix column indices and one containing 
 94 |     !the matrix values. The names of the output files are built concatenating the file name 
 95 |     !iName and the words "RowIdxs.txt", "ColIdxs.txt" & "Values.txt" 
 96 |     !IO
 97 |     !  iA          -> Sparse matrix to be written
 98 |     !  iV          -> RHS vector
 99 |     !  iName       -> the name of the 3 output files 
100 |     !  iLabel      -> Label to be written before the matrix
101 |     !  iNDigits    -> Number of digits after comma in the output format (in scientific notation) 
102 |     implicit none
103 |     type (TSparseMat),intent(inout)        :: iA
104 |     real*8,dimension(:)                    :: iV  
105 |     integer          ,intent(in), optional :: iNDigits
106 |     character*(*)    ,intent(in), optional :: iName
107 |     character*(*)    ,intent(in), optional :: iLabel
108 |     
109 |     integer                         :: i,j,k,Error
110 |     integer                         :: NIntegerDigits,NScientificDigits
111 |     integer,parameter               :: F=99
112 |     real*8                          :: Val
113 |     character*256                   :: RowName,ColName,ValName,S,DefName,RHSName
114 |     integer(kind=1)                 :: CSR
115 |  
116 |     !digits to be used in the formats
117 |     if(present(iNDigits)) then
118 |       NScientificDigits=iNDigits  
119 |     else
120 |       NScientificDigits=4  
121 |     end if
122 |     NIntegerDigits=nint(log10(1.d0*iA%NoElements))+1
123 |     !names and legend
124 |     if(present(iName)) then
125 |       DefName=trim(iName)//'_Definition.txt'
126 |       RHSName=trim(iName)//'_RHS.txt'
127 |       RowName=trim(iName)//'_RowIdxs.txt'
128 |       ColName=trim(iName)//'_ColIdxs.txt'
129 |       ValName=trim(iName)//'_Values.txt'
130 |     else
131 |       DefName='Definition.txt'  
132 |       RHSName='RHS.txt'
133 |       RowName='RowIdxs.txt'
134 |       ColName='ColIdxs.txt'
135 |       ValName='Values.txt'
136 |     end if
137 |     
138 |     !Definition file
139 |     open(unit=F,file=DefName,status='unknown')
140 |     if(present(iLabel)) then
141 |         write(F,'(A)') trim(iLabel)
142 |     else 
143 |         write(F,'(A)')
144 |     end if
145 |     write(F,30) iA%NoRows,iA%NoCols,iA%NoElements,CSR
146 |     close(F)
147 |     
148 |     
149 |     !RHS
150 |     open(unit=F,file=RhsName,status='unknown')
151 |     if(present(iLabel)) then
152 |         write(F,'(A)') trim(iLabel)
153 |     else 
154 |         write(F,'(A)')
155 |     end if
156 |     do i=1,iA%NoCols
157 |       write(F,10) iV(i)
158 |     end do
159 |     close(F)
160 |     
161 |     
162 |     !Row indices
163 |     open(unit=F,file=RowName,status='unknown')
164 |         if(present(iLabel)) then
165 |             write(F,'(A)') trim(iLabel)
166 |         else 
167 |             write(F,'(A)')
168 |         end if
169 |         do i=1,iA%NoRows+1
170 |           write(F,20) iA%RowIdxs(i)
171 |         end do
172 |     close(F)
173 |         
174 |         
175 |     
176 |     !Col indices
177 |     open(unit=F,file=ColName,status='unknown')
178 |     if(present(iLabel)) then
179 |         write(F,'(A)') trim(iLabel)
180 |     else 
181 |         write(F,'(A)')
182 |     end if
183 |     do i=1,iA%NoElements
184 |       write(F,20) iA%ColIdxs(i)
185 |     end do
186 |     close(F)
187 |     
188 |     !Values
189 |     open(unit=F,file=ValName,status='unknown')
190 |     if(present(iLabel)) then
191 |         write(F,'(A)') trim(iLabel)
192 |     else 
193 |         write(F,'(A)')
194 |     end if
195 |     do i=1,iA%NoElements
196 |       write(F,10) iA%Vals(i)
197 |     end do
198 |     close(F)
199 | 
200 |     return
201 | 30  format(4(1xi<NIntegerDigits>))   
202 | 20  format(3(1xi<NIntegerDigits>))   
203 | 10  format(1x1pe<7+NScientificDigits>.<NScientificDigits>)    
204 |     end subroutine SMatWriteSparse   
205 |     
206 |     
207 |     subroutine SMatReadSparse(oA,oV, iFileName,BlockSize)
208 |     !Gianluca Fiandaca, April 2013, AU
209 |     !This routine reads a sparse matrix iA in 3 different files, one containing
210 |     !the matrix row indices, one containing the matrix column indices and one containing 
211 |     !the matrix values. The names of the output files are built concatenating the file name 
212 |     !iName and the words "RowIdxs.txt", "ColIdxs.txt" & "Values.txt" 
213 |     !IO
214 |     !  iA          -> Sparse matrix to be written
215 |     !  iName       -> the name of the 3 output files 
216 |     !  iLabel      -> Label to be written before the matrix
217 |     !  BlockSize   -> The size of each block in the matrix (Used for blocksplitting)    
218 |     !------------------------------------
219 |     !Updated Tue & Casper, September 2014.
220 |     !The reader can now read both CSR and dense format
221 |     !Furthermore the reader now assumes the file starts with a label line at the very top
222 |     !------------------------------------
223 |     use mArrays
224 |     implicit none
225 |     type (TSparseMat),intent(out)          :: oA
226 |     real*8,dimension(:),pointer            :: oV
227 |     character*(*)    ,intent(in), optional :: iFileName
228 |     integer, intent(out)                   :: Blocksize 
229 |     integer i,j,k,Error
230 |     integer,parameter               :: F=99
231 |     real*8                          :: Val
232 |     character*256                   :: RowFileName,ColFileName,ValFileName,S,DefFileName,RHSFileName
233 |     integer :: NoCols,NoRows,NoElements,IsCSR,Pos
234 |     logical :: ErrorCode
235 |     character*256 :: str
236 | 
237 |     !names and legend
238 |     if(present(iFileName)) then
239 |       DefFileName=trim(iFileName)//'_definition.txt'  
240 |       RHSFileName=trim(iFileName)//'_RHS.txt'
241 |       RowFileName=trim(iFileName)//'_RowIdxs.txt'
242 |       ColFileName=trim(iFileName)//'_ColIdxs.txt'
243 |       ValFileName=trim(iFileName)//'_Values.txt'
244 |     else
245 |       DefFileName='Definition.txt'
246 |       RHSFileName='RHS.txt'
247 |       RowFileName='RowIdxs.txt'
248 |       ColFileName='ColIdxs.txt'
249 |       ValFileName='Values.txt'
250 |     end if
251 |     
252 |     open(unit=F,file=DefFileName,status='old')
253 |     do i=1,4
254 |       read(F,'(A)') str ! read line
255 |       str = adjustl(str) ! remove left blanks
256 |       pos = scan(str,'!') 
257 |       if(pos.gt.0) then
258 |         str = str(1:pos-1)  ! extract the parameter value
259 |       end if
260 |       select case(i)    
261 |       case(1)  
262 |         read(str,*) NoRows
263 |       case(2)  
264 |         read(str,*) NoCols
265 |       case(3)  
266 |         read(str,*) NoElements
267 |       case(4)  
268 |         read(str,*) Blocksize
269 |       end select
270 |     end do
271 |     close(F)
272 |     
273 |     call SMatCreate(oA,NoRows,NoCols,'from file')
274 |     call mArraysReallocateReals(oA%Vals,NoElements)
275 |     call mArraysReallocateInts(oA%ColIdxs,NoElements)
276 |     oA%NoRows=NoRows
277 |     oA%NoCols=NoCols
278 |     oA%NoElements=NoElements
279 |        
280 |     !Row indices
281 |     open(unit=F,file=RowFileName,status='unknown')
282 |     read(F,'(A)') S
283 |     do i=1,NoRows+1
284 |       read(F,'(A)') S
285 |       read(S,*) oA%RowIdxs(i)
286 |     end do
287 |     close(F)
288 |     
289 |     !Col indices
290 |     open(unit=F,file=ColFileName,status='unknown')
291 |     read(F,'(A)') S
292 |     do i=1,NoElements
293 |       read(F,'(A)') S
294 |       read(S,*) oA%ColIdxs(i)
295 |     end do
296 |     close(F)
297 |     
298 |     !Vals indices
299 |     open(unit=F,file=ValFileName,status='unknown')
300 |     read(F,'(A)') S
301 |     do i=1,NoElements
302 |       read(F,'(A)') S
303 |       read(S,*) oA%Vals(i)
304 |     end do
305 |     close(F)
306 |     
307 |     !RHS
308 |     allocate(oV(NoCols))
309 |     open(unit=F,file=RHSFileName,status='unknown')
310 |     read(F,'(A)') S
311 |     do i=1,NoCols
312 |       read(F,'(A)') S
313 |       read(S,*) oV(i)
314 |     end do
315 |     close(F)
316 |     return
317 | 
318 |     end subroutine SMatReadSparse    
319 | 
320 |    
321 |        subroutine SMatDestroy(ioSparseMat)
322 | !Casper Kirkegaard, November 2009.
323 | !This routine destroys a TSparseMat structures, ie. 
324 | !deallocates memory etc.
325 | !
326 | !Updated 2014 Tue, Now it takes complex numbers as well
327 | !
328 | !Updated Dec 2016 Kristoffer Andersen. Now also resets IsSorted and IsNativeCSR
329 | !
330 | !IO
331 | !   ioSparseMat  -> Sparse matrix to destroy    
332 |     implicit none
333 |     type (TSparseMat),intent(inout) :: ioSparseMat
334 | !Implementation
335 |     !Deallocate the coordinate form arrays(Always allocated)
336 |     if (associated(ioSparseMat%Vals)) then
337 |       deallocate(ioSparseMat%Vals)
338 |       deallocate(ioSparseMat%ColIdxs)
339 |       deallocate(ioSparseMat%RowIdxs)
340 |     end if    
341 |     if (associated(ioSparseMat%cVals)) then
342 |       deallocate(ioSparseMat%cVals)
343 |       deallocate(ioSparseMat%ColIdxs)
344 |       deallocate(ioSparseMat%RowIdxs)
345 |     end if  
346 |     nullify(ioSparseMat%Vals)   
347 |     nullify(ioSparseMat%CVals)   
348 |     nullify(ioSparseMat%ColIdxs)
349 |     nullify(ioSparseMat%RowIdxs)
350 |     !Reset vars   
351 |     ioSparseMat%NoRows=0
352 |     ioSparseMat%NoCols=0
353 |     ioSparseMat%NoElements=0
354 |     ioSparseMat%IsComplex=.false.
355 |   end subroutine SMatDestroy
356 | 
357 |     subroutine SMatGetDiag(iA,oDiag,iEnsurePositiveDefinite)
358 |     !Tue June 2015
359 |     !This function should be called through its interface SMatGetDiag
360 |     !This function returns the value of the diagonal elements in a matrix saved in CSR format
361 |     !IO
362 |     !iA -> input sparse matrix in CSR format
363 |     !oDiag -> output oDiag(i) contains the inverse value of the i'th diagonal element in the CSR matrix given in iA
364 |     !iEnsurePositiveDefinite -> (default = false) can be used to check if the diagonal is positive - NOT FULL POSITIVE DEFINITENESS
365 |     !
366 |     !CK, october 2015
367 |     !Changed oDiag to no longer be allocatable - it not possible to have an output array alloctable. 
368 |     !
369 |     ! KRA, Dec 2016
370 |     ! Added Positive diagonal check and check of the size of oDiag compared to iA%NoRows
371 |     !
372 |     implicit none    
373 |     type (TSparseMat),intent(inout)  :: iA
374 |     real*8,intent(out)   :: oDiag(:)
375 |     logical,intent(in),optional      :: iEnsurePositiveDefinite
376 |     integer                          :: i,k
377 |     integer                          :: oError
378 |     real*8                           :: tmp
379 |     character*256                    :: S
380 |     logical                          :: Error = .False.
381 |     integer                          :: OldThreads
382 |     logical                          :: CheckPositiveDefinite
383 |     !allocate(oDiag(iA%NoRows))
384 |     if(size(oDiag,1).lt.iA%NoRows) goto 903
385 |     if(present(iEnsurePositiveDefinite)) then
386 |       CheckPositiveDefinite=iEnsurePositiveDefinite
387 |     else 
388 |       CheckPositiveDefinite=.False.
389 |     end if
390 |     
391 |     
392 |     if(CheckPositiveDefinite) then
393 |       !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i,tmp,oError)
394 |       !$OMP DO SCHEDULE(STATIC)
395 |       do i=1,iA%NoRows
396 |         tmp=SMatGetValue(iA,i,i,oError)
397 |         if (tmp.gt.0d0) then
398 |           oDiag(i)=tmp
399 |         else 
400 |           Error=.true.
401 |           k=i
402 |         end if
403 |       end do
404 |       !$OMP END DO
405 |       !$OMP END PARALLEL  
406 |     else
407 |       !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i,tmp,oError)
408 |       !$OMP DO SCHEDULE(STATIC)
409 |       do i=1,iA%NoRows
410 |         oDiag(i)=SMatGetValue(iA,i,i,oError)
411 |       end do
412 |       !$OMP END DO
413 |       !$OMP END PARALLEL  
414 |     end if        
415 |     
416 |     call ReturnOldThreadNum()
417 |     if (Error) goto 901
418 |     return
419 |     901 continue
420 |     print*, 'Critical failure in SMatGetDiag_real, matrix-diagonal is ',SMatGetValue(iA,k,k,oError),' at ',k
421 | 903 continue
422 |     print*, 'Output array allocation too small in SMatGetDiag_real'
423 |     End subroutine SMatGetDiag
424 | 
425 |      
426 |     real*8 function SMatGetValue(iA,iRowNo,iColNo,oError)
427 | !Casper Kirkegaard, November 2009.
428 | !This routine locates and returns the value of a given row,col entry. 
429 | !NOTE THAT THE MATRIX IS FINALIZED WHEN CALLING THIS PROCEDURE!!!  
430 | !IO
431 | !  iA     -> Sparse matrix to search for value
432 | !  iRowNo -> Index of the row to search for
433 | !  iColNo -> Index of the column to search for
434 | !  oError -> -1 if value does not exist, ie. entry equal to zero
435 | !            Positive value if the value entry exists.   
436 | !IO parameters
437 |     implicit none    
438 |     type (TSparseMat),intent(inout)  :: iA
439 |     integer,intent(in)               :: iColNo
440 |     integer,intent(in)               :: iRowNo
441 |     integer,intent(out)              :: oError
442 | !Variable declarations  
443 |     integer                          :: CorColNo     
444 | !Implementation  
445 |     !Apply column index correction if necessary. Also make sure to switch back the ColIdxCorrected status flag
446 |     !at the end of this routine if this is the routine applying the correction.
447 |     CorColNo=iColNo    
448 |     !Return the value
449 |     call SMatLocateCSRIdx(iA,iRowNo,CorColNo,oError)
450 |     if (oError.eq.-1) then
451 |       SMatGetValue=0d0
452 |     else
453 |       SMatGetValue=iA%Vals(oError)  
454 |     end if  
455 | 
456 |     !Switch back the col index correction lock if necessary.
457 | end function SMatGetValue
458 | 
459 |     subroutine SMatDAD(A,iD)
460 |       type (TSparseMat),intent(inout)  :: A
461 |       real*8,dimension(:),intent(in)   :: iD 
462 |       integer :: i,j
463 |       do i=1,A%NoRows
464 |         do j=A%RowIdxs(i),A%RowIdxs(i+1)-1
465 |           A%Vals(j)=A%Vals(j)*iD(i)*iD(A%colIdxs(j))
466 |         end do
467 |       end do
468 |       
469 |     end subroutine SMatDAD
470 | 
471 |        subroutine SMatLocateCSRIdx(iA,iRowNo,iColNo,oIndex,iFindNearest)
472 | !Casper Kirkegaard, November 2009.
473 | !This routine locates and returns the CSR index of a given row,col entry.  
474 | !-------------------------------------
475 | !Updated September 2014, by Tue
476 | !Added the optional parameter iFindNearest.
477 | !------------------------------------
478 | !IO
479 | !  iA     -> Sparse matrix to search
480 | !  iRowNo -> Index of the row to search for
481 | !  iColNo -> Index of the column to search for
482 | !  oIndex -> Index in iA%Vals of the entry iRow,iColNo
483 | !            Set to -1 if the entry does not exits in the matrix 
484 | !  iFindNearest -> optional parameter, which tells whether to return the nearest match if iIndex does not exist.
485 | !                  default is 0, which returns -1 if index is not found
486 | !                  if set to 1, it returns the nearest index searching forward
487 | !                  if set to -1, it returns the nearest index searching backwards
488 | !IO parameters
489 |     implicit none    
490 |     type (TSparseMat),intent(inout)  :: iA
491 |     integer,intent(in)               :: iColNo
492 |     integer,intent(in)               :: iRowNo
493 |     integer,intent(out)              :: oIndex
494 |     integer,intent(in),optional      :: iFindNearest
495 | !Variable declarations    
496 |     integer                          :: CorColNo
497 |     integer                          :: StartIndex,EndIndex 
498 |     integer                          :: FindNearest
499 | !Implementation  
500 |     if (present(iFindNearest)) then
501 |       FindNearest = iFindNearest
502 |     else
503 |       FindNearest = 0
504 |     end if
505 |     !Apply column index correction if necessary. Also make to switch back the ColIdxCorrected status flag
506 |     !at the end of this routine if this is the routine applying the correction.
507 |     CorColNo=iColNo
508 | 
509 |     !Do an n*log(n) search by exploiting that the column indices of each
510 |     !row are sorted by increasing value. Chop up the entire interval in halfs
511 |     !and determine in which of the intervals the index lies. Continue until the value
512 |     !is found or the width of the search interval is 1 and the values has not been found.
513 |     StartIndex=iA%RowIdxs(iRowNo)
514 |     EndIndex=iA%RowIdxs(iRowNo+1)-1
515 |     call SMatLocateIdx(iA%ColIdxs,StartIndex,EndIndex,CorColNo,oIndex,FindNearest)    
516 |     !Switch back the col index correction lock if necessary.
517 |    end subroutine SMatLocateCSRIdx
518 | 
519 |         subroutine SMatLocateIdx(iArray,iBegin,iEnd,iIndex,oIndex,iFindNearest)
520 | !Casper Kirkegaard, November 2009.
521 | !This routine locates the position of a given value in a sorted integer array.  
522 | !-------------------------------------
523 | !Updated September 2014, by Tue
524 | !Added the optional parameter iFindNearest.
525 | !------------------------------------
526 | !IO
527 | !  iArray -> Array to search through
528 | !  iBegin -> Index in iArray to start from
529 | !  iEnd   -> Index in iArray to end at
530 | !  iIndex -> Integer to locate index in iArray for
531 | !  oIndex -> index in iarray of the value iIndex
532 | !            Set to -1 if the entry does not exits in the  
533 | !  iFindNearest -> optional parameter, which tells whether to return the nearest match if iIndex does not exist.
534 | !                  default is 0, which returns -1 if index is not found
535 | !                  if set to 1, it returns the nearest index searching forward
536 | !                  if set to -1, it returns the nearest index searching backwards
537 | !IO parameters
538 |     implicit none
539 |     integer, dimension(:),intent(in) :: iArray
540 |     integer,intent(in)               :: iBegin
541 |     integer,intent(in)               :: iEnd
542 |     integer,intent(in)               :: iIndex
543 |     integer,intent(out)              :: oIndex
544 |     integer,intent(in),optional      :: iFindNearest
545 | !Variable declarations    
546 |     integer                          :: i,j   
547 |     integer                          :: StartIndex
548 |     integer                          :: MidIndex
549 |     integer                          :: EndIndex   
550 |     real*8                           :: Dummy 
551 |     integer                          :: BruteIndex
552 |     logical                          :: Searching
553 |     integer                          :: Count
554 |     integer                          :: SearchForNearest
555 | !Implementation  
556 |     !BruteIndex=-1
557 |     !do i=iA%RowIdxs(iRowNo),iA%RowIdxs(iRowNo+1)-1
558 |     !  if (iA%ColIdxs(i).eq.iColNo) then
559 |     !    BruteIndex=i
560 |     !  end if 
561 |     !end do
562 |     !Do a log(n) search by exploiting that the column indices of each
563 |     !row are sorted by increasing value. Chop up the entire interval in halfs
564 |     !and determine in which of the intervals the index lies. Continue until the value
565 |     !is found or the width of the search interval is 1 and the values has not been found.
566 |     if (present(iFindNearest)) then 
567 |         SearchForNearest = iFindNearest
568 |     else
569 |         SearchForNearest = 0
570 |     end if
571 |     Searching=.true.
572 |     oIndex=-1
573 |     StartIndex=iBegin
574 |     EndIndex=iEnd
575 |     !Don't search if the row is empty ...
576 |     if (EndIndex.lt.StartIndex) Searching=.False.
577 |     Count=1
578 |     do while(Searching)
579 |       Count=Count+1
580 |       if (EndIndex-StartIndex.lt.2) then
581 |         Searching=.false.
582 |         do i=StartIndex,EndIndex
583 |           if (iArray(i).eq.iIndex) then
584 |             oIndex=i
585 |             exit
586 |           end if   
587 |         end do
588 |       end if
589 |       Dummy=(StartIndex+EndIndex)/2
590 |       MidIndex=AnInt(Dummy)  
591 |   
592 |       if (iArray(MidIndex).eq.iIndex) then
593 |         Searching=.false.
594 |         oIndex=MidIndex 
595 |       else
596 |         if (iArray(MidIndex).gt.iIndex) then
597 |           EndIndex=MidIndex
598 |         else
599 |           StartIndex=MidIndex
600 |         end if  
601 |       end if      
602 |     end do
603 |     !If the exact index was not found, do we look for nearest index?
604 |     if ((oIndex.eq.-1).and.(SearchForNearest.ne.0)) then
605 |       if (SearchForNearest.eq.1) then
606 |         if (iArray(StartIndex).gt.iIndex) then
607 |           oIndex=StartIndex
608 |         else
609 |           oIndex=EndIndex
610 |         end if
611 |       else if(SearchForNearest.eq.-1) then
612 |         if (iArray(EndIndex).lt.iIndex) then
613 |           oIndex=EndIndex
614 |         else
615 |           oIndex=StartIndex
616 |         end if
617 |       end if
618 |     end if
619 |     end subroutine SMatLocateIdx
620 |    
621 |    subroutine SMatGetDiagonalDominant(iA,oDiagDom)
622 |    !Tue Dec, 2014
623 |    !Determines the 
624 |    !
625 |    !IO
626 |    !iA -> matrix in CSR format
627 |    !oDiagDom -> on output the diagonal dominance of the matrix, if DiagDom<1 it is strictly diagonal dominant   
628 |     implicit none    
629 |     type (TSparseMat),intent(inout)     :: iA   
630 |     real*8,intent(out)                  :: oDiagDom
631 |     !Local vars
632 |     integer                           :: i, j,k
633 |     integer,allocatable,dimension(:)  :: DiagIdxs
634 |     real*8                            :: Diag,RowVal,MinVal,Maxval    
635 |     character*256                     :: S
636 |     logical                           :: Error =.False.
637 |     integer                            :: OldThreads
638 |     allocate(DiagIdxs(iA%NoRows))
639 |     Call SMatGetDiagIdx(iA,DiagIdxs)
640 |     oDiagDom=0
641 | !$OMP PARALLEL IF(StartOpenMP(2,1)) PRIVATE(I,J,Diag,RowVal) SHARED(DiagIdxs,iA,k,error) DEFAULT(NONE)  REDUCTION(+:oDiagDom)    
642 | !$OMP DO SCHEDULE(STATIC)
643 |       do i=1,iA%NoRows
644 |         Diag=(abs(iA%Vals(DiagIdxs(i))))
645 |         if(Diag.eq.0) then 
646 |             Error=.true.
647 |             k=i
648 |         end if        
649 |         RowVal=0d0
650 |         do j=iA%RowIdxs(i),iA%RowIdxs(i+1)-1
651 |             RowVal=RowVal+abs(iA%Vals(j))
652 |         end do
653 |         if (Error) then
654 |           !Set oDiagDom to something very large if the diagelement is 0.
655 |           oDiagDom=50d0
656 |         else
657 |           oDiagDom=oDiagDom+(RowVal-Diag)/Diag
658 |         end if  
659 |     end do
660 | !$OMP END DO
661 | !$OMP END PARALLEL
662 |    oDiagDom=oDiagDom/((iA%NoRows-2)*0.5d0)
663 |     call ReturnOldThreadNum()
664 |     deallocate(DiagIdxs)
665 |     return
666 |    end subroutine SMatGetDiagonalDominant
667 | 
668 |    
669 |      subroutine SMatAmultV(A,iV,oV)
670 | !Casper Kirkegaard, November 2009.
671 | !This routine calculates the product of the input matrix A with a vector iV
672 | !IO
673 | ! A  -> Sparse A matrix
674 | ! iV -> Dense vector to multiply with
675 | ! oV -> Product A*iV 
676 | !IO parameters
677 |     implicit none
678 |     type (TSparseMat),intent(inout)  :: A
679 |     real*8,dimension(:),intent(in)   :: iV           
680 |     real*8,dimension(:),intent(out)  :: oV
681 | !Variable declarations    
682 |     integer,Dimension(:),allocatable :: TempVals 
683 |     integer                          :: i,j
684 |     integer                          :: AFirst,ALast
685 |     integer                          :: RowStart
686 |     integer                          :: RowEnd    
687 |     real*8                           :: Sum   
688 |     integer                          :: OldThreads
689 |     logical                          :: Dummy
690 | !Implementation  
691 |     dummy = StartOpenMP(2,1)
692 |     call amux (A%NoRows, iV, oV, A%Vals,A%ColIdxs,A%RowIdxs) 
693 |     call ReturnOldThreadNum()
694 |      end subroutine SMatAmultV
695 |      
696 |     subroutine amux (n, x, y, a,ja,ia) 
697 |       real*8  x(*), y(*), a(*) 
698 |       integer n, ja(*), ia(*)
699 | !c-----------------------------------------------------------------------
700 | !c         A times a vector
701 | !c----------------------------------------------------------------------- 
702 | !c multiplies a matrix by a vector using the dot product form
703 | !c Matrix A is stored in compressed sparse row storage.
704 | !c
705 | !c on entry:
706 | !c----------
707 | !c n     = row dimension of A
708 | !c x     = real array of length equal to the column dimension of
709 | !c         the A matrix.
710 | !c a, ja,
711 | !c    ia = input matrix in compressed sparse row format.
712 | !c
713 | !c on return:
714 | !c-----------
715 | !c y     = real array of length n, containing the product y=Ax
716 | !c
717 | !c-----------------------------------------------------------------------
718 | !c local variables
719 | !c
720 |       real*8 t
721 |       integer i, k
722 | !c-----------------------------------------------------------------------
723 |       !call mkl_dcsrgemv('n', n, a, ia, ja, x, y)
724 |       !return
725 | !$OMP PARALLEL SHARED(n, x, y, a,ja,ia) PRIVATE(t,i,k)
726 | !$OMP DO
727 |       do i = 1,n
728 |         t = 0
729 |         do k=ia(i), ia(i+1)-1 
730 |           t = t + a(k)*x(ja(k))
731 |         end do
732 |         y(i) = t
733 |       end do  
734 | !$OMP END DO
735 | !$OMP END PARALLEL      
736 | !c---------end-of-amux---------------------------------------------------
737 | !c-----------------------------------------------------------------------
738 |     end
739 |     
740 |           subroutine smatgetUpperTriangular (ioA)
741 | !   Kristoffer Andersen, Dec 2016
742 | !   extracts the upper trangular part of ioA. 
743 |       type(tsparsemat),intent(inout) :: ioA
744 |       !real*8 a(*), ao(*), t
745 |       !integer ia(*), ja(*), iao(*), jao(*)
746 | !----------------------------------------------------------------------- 
747 | ! On entry
748 | !-----------
749 | ! nrow  = dimension of the matrix a.
750 | ! a, ja, 
751 | !    ia = matrix stored in compressed row sparse format
752 | !
753 | ! nzmax = length of arrays ao,  and jao. 
754 | !
755 | ! On return:
756 | !----------- 
757 | ! ao, jao, 
758 | !     iao = lower part of input matrix (a,ja,ia) stored in compressed sparse 
759 | !          row format format.
760 | !  
761 | ! ierr   = integer error indicator. 
762 | !          ierr .eq. 0  means normal return
763 | !          ierr .eq. i  means that the code has stopped when processing
764 | !          row number i, because there is not enough space in ao, jao
765 | !          (according to the value of nzmax) 
766 | !
767 | !----------------------------------------------------------------------- 
768 |       !var
769 |       integer :: ierr,ko,i,kold,kdiag,k,RowIDold
770 |       !prog
771 |       ko = 0
772 |       do  i=1, ioA%NoRows
773 |          RowIdOld = ioA%RowIdxs(i)
774 |          ioA%RowIdxs(i) = ko+1
775 |          do k = RowIdOld, ioA%RowIdxs(i+1) -1
776 |             if (ioA%ColIdxs(k)  .ge. i) then ! upper trangle
777 |                 ko = ko+1 ! next new index
778 |                 ioA%Vals(ko) = ioA%Vals(k) ! store value
779 |                 ioA%ColIdxs(ko) = ioA%ColIdxs(k)
780 |             endif
781 |          enddo
782 |      
783 |       enddo
784 |       
785 |       ioA%RowIDxs(ioA%NoRows+1) = ko+1
786 |       ioA%NoElements = ko
787 |       return
788 |       end subroutine smatgetUpperTriangular 
789 |       
790 | 
791 |     
792 |      subroutine SMatGetInvDiag(iA,oInvDiag)
793 |     !Tue September 2014
794 |     !This function should be called through its interface SMatGetInvDiag
795 |     !This function returns the inverse value of the diagonal elements in a matrix saved in CSR format
796 |     !IO
797 |     !iA -> input sparse matrix in CSR format
798 |     !oInvDiag -> output oInvDiag(i) contains the inverse value of the i'th diagonal element in the CSR matrix given in iA
799 |     !
800 |     !CK, october 2015
801 |     !Removed allocatable property on oInvDiag. Cannot allocate and pass an allocatable outside a subroutine.
802 |     implicit none    
803 |     type (TSparseMat),intent(inout)  :: iA
804 |     real*8,intent(out)               :: oInvDiag(:)
805 |     !var
806 |     integer                          :: i,k
807 |     integer                          :: oError
808 |     real*8                           :: tmp
809 |     character*256                    :: S
810 |     logical                          :: Error
811 |     integer                          :: OldThreads
812 |     Error = .False.
813 |     !allocate(oInvDiag(iA%NoRows))
814 |     !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i,tmp,oError)
815 |     !$OMP DO SCHEDULE(STATIC)
816 |     do i=1,iA%NoRows
817 |         tmp=SMatGetValue(iA,i,i,oError)
818 |         if (abs(tmp).gt.0d0) then
819 |             oInvDiag(i)=1/tmp
820 |         else 
821 | !!$OMP CRITICAL (Errorfound)  ! might be neccesary
822 |             Error=.true.
823 |             k=i
824 | !!$OMP END CRITICAL (Errorfound)
825 |         end if
826 |     end do
827 |     !$OMP END DO
828 |     !$OMP END PARALLEL  
829 |     call ReturnOldThreadNum()
830 |     return
831 |     End subroutine SMatGetInvDiag
832 |     
833 |     double precision function ddot(n,dx,incx,dy,incy)
834 |     double precision dx(*),dy(*),dtemp
835 |     integer i,incx,incy,ix,iy,m,mp1,n
836 |     ddot = 0.0d0
837 |     dtemp = 0.0d0
838 |     if(n<=0)return
839 |     if(incx==1.AND.incy==1)go to 20
840 |     ix = 1
841 |     iy = 1
842 |     if(incx<0)ix = (-n+1)*incx + 1
843 |     if(incy<0)iy = (-n+1)*incy + 1
844 |     do 10 i = 1,n
845 |       dtemp = dtemp + dx(ix)*dy(iy)
846 |       ix = ix + incx
847 |       iy = iy + incy
848 |  10 continue
849 |     ddot = dtemp
850 |     return
851 | 
852 |  20 m = mod(n,5)
853 |     if( m == 0 ) go to 40
854 |     do 30 i = 1,m
855 |       dtemp = dtemp + dx(i)*dy(i)
856 |  30 continue
857 |     if( n < 5 ) go to 60
858 |  40 mp1 = m + 1
859 |     do 50 i = mp1,n,5
860 |       dtemp = dtemp + dx(i)*dy(i) + dx(i + 1)*dy(i + 1) + dx(i + 2)*dy(i + 2) + dx(i + 3)*dy(i + 3) + dx(i + 4)*dy(i + 4)
861 |  50 continue
862 |  60 ddot = dtemp
863 |     return
864 |     end
865 |     
866 |      subroutine SMatGetDiagIdx(iA,oDiagIdxs)
867 |     !Tue September 2014
868 |     !This function should be called through its interface SMatGetDiagIdx
869 |     !This function returns the indices for the diagonal elements in a matrix saved in CSR format
870 |     !CK, october 2015
871 |     !Changed oDiagIdxs to no longer be allocatable - it not possible to have an output array alloctable. 
872 |     !IO
873 |     !iA -> input sparse matrix in CSR format
874 |     !SMatGetDiagIdx_type -> output SMatGetDiagIdx_type(i) contains the indices for the i'th diagonal element in the CSR matrix given in iA
875 |     implicit none    
876 |     type (TSparseMat),intent(inout)  :: iA
877 |     integer,intent(out)              :: oDiagIdxs(:)
878 |     integer                          :: i
879 |     integer                            :: OldThreads
880 |     
881 |     !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i)
882 |     !$OMP DO SCHEDULE(STATIC)
883 |     do i=1,iA%NoRows
884 |       call SMatLocateCSRIdx(iA,i,i,oDiagIdxs(i))
885 |     end do
886 |     !$OMP END DO
887 |     !$OMP END PARALLEL  
888 |     call ReturnOldThreadNum()
889 |     End subroutine SMatGetDiagIdx
890 |     
891 |       subroutine SMatGetBlockRowIdx(iA,iBlocks,oBRowIdxs)
892 |     !Tue September 2014
893 |     !This routine finds the first and last element in each row which are within the blocks given in iBlocks.
894 |     !In other words, this routine is used to sort out the elements which are not used in a blocksplitting of the matrix.
895 |     !CK, October 2015
896 |     !Removed allocation of browidxs - the allocation hasto be performed from outside not to cause data corruption.
897 |     !IO
898 |     !iA -> input sparse matrix in CSR format
899 |     !iBlocks -> The blocksplitting we wish to use on iA.
900 |     !oBRowIdxs -> oBRowIdxs(i,1) is the first indices to first element in row i within the block, oBRowIdxs(i,2) is the last element.
901 |     implicit none    
902 |     type (TSparseMat),intent(inout)  :: iA
903 |     integer,dimension(:,:),intent(in):: iBlocks
904 |     integer,            intent(out)  :: oBRowIdxs(:,:)
905 |     integer                          :: i,k
906 |     integer                          :: threads
907 |     integer                            :: OldThreads
908 |     !allocate(oBRowIdxs(iA%NoRows,2))
909 |     threads = size(iBlocks,1)    
910 |     do k=1,threads
911 |     !$OMP PARALLEL IF(StartOpenMP(2,1)) DEFAULT(SHARED) PRIVATE(i)
912 |     !$OMP DO SCHEDULE(STATIC)
913 |       do i = iBlocks(k,1),iBlocks(k,2)
914 |         call SMatLocateCSRIdx(iA,i,iBlocks(k,1),oBRowIdxs(i,1),1)
915 |         call SMatLocateCSRIdx(iA,i,iBlocks(k,2),oBRowIdxs(i,2),-1)
916 |       end do
917 |     !$OMP END DO
918 |     !$OMP END PARALLEL  
919 |     end do
920 |     call ReturnOldThreadNum()
921 |       End subroutine SMatGetBlockRowIdx
922 |     
923 |       
924 |     function SMatFindMaxColDif(iA)
925 |     !Tue September 2014
926 |     !This function finds the maximum distance two elements can be apart in the same row in the matrix
927 |     !IO
928 |     !iA -> input sparse matrix in CSR format
929 |     !FindMaxColDif -> output FindMaxColDif contains the maximum difference between the first and last element in each row across all rows.
930 |     implicit none    
931 |     type (TSparseMat),intent(in)     :: iA
932 |     integer                          :: SMatFindMaxColDif
933 |     integer                          :: i,j
934 |     SMatFindMaxColDif=0
935 |     do i=1,iA%NoRows
936 |         SMatFindMaxColDif=max(SMatFindMaxColDif,(iA%ColIdxs(iA%RowIdxs(i+1)-1)-iA%ColIdxs(iA%RowIdxs(i))+1))
937 |     end do
938 |     end function SMatFindMaxColDif
939 |    
940 | 	
941 |    
942 | end module mSMat      
943 | 
944 | 


--------------------------------------------------------------------------------
/Technical_report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tueboesen/Sparse-iterative-parallel-linear-solver/fe82bd08e2050716c2ed43996337d5e8f4723c44/Technical_report.pdf


--------------------------------------------------------------------------------