├── LICENSE
├── Makefile
├── PanguLU_Users_Guide.pdf
├── README.md
├── build_helper.py
├── build_list.csv
├── examples
    ├── Makefile
    ├── Trefethen_20b.mtx
    ├── example.c
    ├── mmio.h
    ├── mmio_highlevel.h
    └── run.sh
├── include
    ├── pangulu.h
    └── pangulu_interface_common.h
├── lib
    └── Makefile
├── make.inc
├── reordering_omp
    ├── Makefile
    ├── include
    │   └── mynd_omp.h
    ├── lib
    │   └── makefile
    └── src
    │   ├── control.h
    │   ├── define.h
    │   ├── makefile
    │   ├── mkqsort.h
    │   ├── mynd_balance.c
    │   ├── mynd_coarsen.c
    │   ├── mynd_common.c
    │   ├── mynd_compressgraph.c
    │   ├── mynd_createcoarsegraph.c
    │   ├── mynd_functionset.h
    │   ├── mynd_graph.c
    │   ├── mynd_hashtable.c
    │   ├── mynd_ikvsorti.c
    │   ├── mynd_initialpartition.c
    │   ├── mynd_match.c
    │   ├── mynd_memory.c
    │   ├── mynd_mmdorder.c
    │   ├── mynd_priorityqueue.c
    │   ├── mynd_queue.c
    │   ├── mynd_read.c
    │   ├── mynd_refine.c
    │   ├── mynd_reordergraph.c
    │   ├── mynd_searchtree.c
    │   ├── mynd_splitgraph.c
    │   ├── mynd_timer.c
    │   ├── mynd_variables.c
    │   ├── struct.h
    │   └── typedef.h
└── src
    ├── Makefile
    ├── pangulu.c
    ├── pangulu_common.h
    ├── pangulu_communication.c
    ├── pangulu_conversion.c
    ├── pangulu_kernel_interface.c
    ├── pangulu_memory.c
    ├── pangulu_numeric.c
    ├── pangulu_platform_helper.c
    ├── pangulu_preprocessing.c
    ├── pangulu_reordering.c
    ├── pangulu_sptrsv.c
    ├── pangulu_storage.c
    ├── pangulu_strings.h
    ├── pangulu_symbolic.c
    ├── pangulu_task.c
    ├── pangulu_thread.c
    ├── pangulu_utils.c
    └── platforms
        ├── 01_SHAREDMEM
            └── 00_CPU
            │   └── 000_CPU
            │       ├── Makefile
            │       ├── pangulu_platform_0100000.c
            │       └── pangulu_platform_0100000.h
        ├── 02_NONSHAREDMEM
            └── 01_GPU
            │   └── 000_CUDA
            │       ├── Makefile
            │       ├── pangulu_platform_0201000.cu
            │       └── pangulu_platform_0201000.h
        ├── pangulu_platform_common.h
        └── platform_list.csv


/Makefile:
--------------------------------------------------------------------------------
 1 | all : examples
 2 | 
 3 | .PHONY : examples lib src clean update
 4 | 
 5 | examples : lib
 6 | 	$(MAKE) -C $@
 7 | 
 8 | lib : src
 9 | 	$(MAKE) -C $@
10 | 
11 | src: reordering
12 | 	$(MAKE) -C $@
13 | 
14 | reordering:
15 | 	$(MAKE) -C reordering_omp
16 | 
17 | clean:
18 | 	$(MAKE) -C src clean
19 | 	$(MAKE) -C lib clean
20 | 	$(MAKE) -C examples clean
21 | 	$(MAKE) -C reordering_omp clean
22 | 
23 | update : clean all


--------------------------------------------------------------------------------
/PanguLU_Users_Guide.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SuperScientificSoftwareLaboratory/PanguLU/d2d84052e6e982fc0b6c05e1294be8892186b495/PanguLU_Users_Guide.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PanguLU
  2 | 
  3 | -------------------
  4 | 
  5 | ## Introduction
  6 | 
  7 | PanguLU is an open source software package for solving a linear system *Ax = b* on heterogeneous distributed platforms. The library is written in C, and exploits parallelism from MPI, OpenMP and CUDA. The sparse LU factorisation algorithm used in PanguLU splits the sparse matrix into multiple equally-sized sparse matrix blocks and computes them by using sparse BLAS. The latest version of PanguLU uses a synchronisation-free communication strategy to reduce the overall latency overhead, and a variety of block-wise sparse BLAS methods have been adaptively called to improve efficiency on CPUs and GPUs. Currently, PanguLU supports both single and double precision, both real and complex values. In addition, our team at the SSSLab is constantly optimising and updating PanguLU.
  8 | 
  9 | ## Structure of code
 10 | 
 11 | ```
 12 | PanguLU/README         instructions on installation
 13 | PanguLU/src            C and CUDA source code, to be compiled into libpangulu.a and libpangulu.so
 14 | PanguLU/examples       example code
 15 | PanguLU/include        contains headers archieve pangulu.h
 16 | PanguLU/lib            contains library archieve libpangulu.a and libpangulu.so
 17 | PanguLU/reordering_omp Parallel graph partitioning and fill-reducing matrix ordering
 18 | PanguLU/Makefile       top-level Makefile that does installation and testing
 19 | PanguLU/make.inc       compiler, compiler flags included in all Makefiles (excepts examples/Makefile)
 20 | ```
 21 | 
 22 | ## Installation
 23 | #### Step 1 : Assert "make" is available.
 24 | "make" is an automatic build tool, it is required to build PanguLU. "make" is available in most GNU/Linux. You can install it using package managers like `apt` or `yum`.
 25 | 
 26 | #### Step 2 : Assert MPI library is available.
 27 | PanguLU requires MPI library. you need to install MPI library with header files. Tested MPI libraries : MPICH 4.3.0.
 28 | 
 29 | #### Step 3 : Assert CUDA is available. (optimal, required if GPU is used)
 30 | If GPUs are used, CUDA is required. Tested version : CUDA 12.9.
 31 | 
 32 | #### Step 4 : Assert BLAS library is available. (optimal, required if GPU is not used)
 33 | A BLAS library is required if CPU takes part in algebra computing of numeric factorisation. Tested version : OpenBLAS 0.3.26.
 34 | 
 35 | #### Step 5 : Edit `make.inc`.
 36 | Search `/path/to` in `make.inc`. Replace them to the path actually on your computer.
 37 | 
 38 | #### Step 6 : Edit `examples/Makefile`
 39 | The Makefile of example code doesn't include `make.inc`. Search `/path/to` in `examples/Makefile`. Replace them to the path actually on your computer.
 40 | 
 41 | #### Step 7 : Decide if you want to use GPU.
 42 | GPU is enabled by default. If you want to disable GPU, you should : 
 43 |  - Remove `GPU_CUDA` in `build_list.csv`;
 44 |  - Remove `-DGPU_OPEN` in `PANGULU_FLAGS`. You can find `PANGULU_FLAGS` in `make.inc`;
 45 |  - Comment `LINK_CUDA` in `examples/Makefile`.
 46 |  
 47 | Vise versa.
 48 | 
 49 | #### Step 8 : Run `make -j` in your terminal.
 50 | Make sure the working directory of your terminal is the root directory of PanguLU. If PanguLU was successfully built, you will find `libpangulu.a` and `libpangulu.so` in `lib` directory, and `pangulu_example.elf` in `exampls` directory.
 51 | 
 52 | ## Build flags
 53 | `PANGULU_FLAGS` influences build behaviors. You can edit `PANGULU_FLAGS` in `make.inc` to implement different features of PanguLU. Here are available flags :
 54 | 
 55 | #### Decide if or not using GPU.
 56 | Use `-DGPU_OPEN` to use GPU, vice versa. Please notice that using this flag is not the only thing to do if you want to use GPU. Please check Step 7 in the Installation part.
 57 | 
 58 | #### Decide the value type of matrix and vector entries.
 59 | Use `-DCALCULATE_TYPE_R64` (double real) or `-DCALCULATE_TYPE_CR64` (double complex) or `-DCALCULATE_TYPE_R32` (float real) or `-DCALCULATE_TYPE_CR32` (float complex). Note to also add this option to the compilation command in `example/Makefile`.
 60 | 
 61 | #### Decide if or not using MC64 reordering algorithm.
 62 | Use `-DPANGULU_MC64` to enable MC64 algorithm. Please notice that MC64 is not supported when matrix entries are complex numbers. If complex values are selected and `-DPANGULU_MC64` flag is used, MC64 would not enable.
 63 | 
 64 | #### Decide using our parallel reordering algorithm or METIS reordering tool.
 65 | Use `-DMETIS` if you want to use METIS; otherwise, our parallel reordering algorithm will be used by default.
 66 | 
 67 | #### Decide log level.
 68 | Please select zero or one of these flags : `-DPANGULU_LOG_INFO`, `-DPANGULU_LOG_WARNING` or `-DPANGULU_LOG_ERROR`. Log level "INFO" prints all messages to standard output (including warnings and errors). Log level "WANRING" only prints warnings and errors. Log level "ERROR" only prints fatal errors causing PanguLU to terminate abnormally.
 69 | 
 70 | #### Decide whether additional performance information is needed:
 71 | Use `-DPANGULU_PERF` to output additional performance information, such as kernel time per gpu and gflops of numeric factorisation. Note that this will slow down the speed of numeric factorisation.
 72 | 
 73 | #### Decide core binding strategy.
 74 | Hyper-threading is not recommended. If you can't turn off the hyper-threading and each core of your CPU has 2 threads, using `-DHT_IS_OPEN` may reaps performance gain.
 75 | 
 76 | ## Function interfaces
 77 | To make it easier to call PanguLU in your software, PanguLU provides the following function interfaces:
 78 | 
 79 | #### 1. pangulu_init()
 80 | ```
 81 | void pangulu_init(
 82 |   sparse_index_t pangulu_n, // Specifies the number of rows in the CSC matrix.
 83 |   sparse_pointer_t pangulu_nnz, // Specifies the total number of non-zero elements in the CSC matrix.
 84 |   sparse_pointer_t *csc_colptr, // Points to an array that stores pointers to columns of the CSC matrix.
 85 |   sparse_index_t *csc_rowidx, // Points to an array that stores indices to rows of the CSC matrix.
 86 |   sparse_value_t *csc_value, // Points to an array that stores the values of non-zero elements of the CSC matrix.
 87 |   pangulu_init_options *init_options, // Pointer to a pangulu_init_options structure containing initialisation parameters for the solver.
 88 |   void **pangulu_handle // On return, contains a handle pointer to the library's internal state.
 89 | );
 90 | ```
 91 | 
 92 | #### 2. pangulu_gstrf()
 93 | ```
 94 | void pangulu_gstrf(
 95 |   pangulu_gstrf_options *gstrf_options, // Pointer to pangulu_gstrf_options structure.
 96 |   void **pangulu_handle // Pointer to the solver handle returned on initialisation.
 97 | );
 98 | ```
 99 | 
100 | #### 3. pangulu_gstrs()
101 | ```
102 | void pangulu_gstrs(
103 |   sparse_value_t *rhs, // Pointer to the right-hand side vector.
104 |   pangulu_gstrs_options *gstrs_options, // Pointer to the pangulu_gstrs_options structure.
105 |   void** pangulu_handle // Pointer to the library internal state handle returned on initialisation.
106 | );
107 | ```
108 | 
109 | #### 4. pangulu_gssv()
110 | ```
111 | void pangulu_gssv(
112 |   sparse_value_t *rhs, // Pointer to the right-hand side vector.
113 |   pangulu_gstrf_options *gstrf_options, // Pointer to a pangulu_gstrf_options structure.
114 |   pangulu_gstrs_options *gstrs_options, // Pointer to a pangulu_gstrs_options structure.
115 |   void **pangulu_handle // Pointer to the library internal status handle returned on initialisation.
116 | );
117 | ```
118 | 
119 | #### 5. pangulu_finalize()
120 | ```
121 | void pangulu_finalize(
122 |   void **pangulu_handle // Pointer to the library internal state handle returned on initialisation.
123 | );
124 | ```
125 | 
126 | `example.c` is a sample program to call PanguLU. You can refer to this file to complete the call to PanguLU. You should first create the distributed matrix using `pangulu_init()`. If you need to solve multiple right-hand side vectors while the matrix is unchanged, you can call `pangulu_gstrs()` multiple times after calling `pangulu_gstrf()`. If you need to factorise a number of different matrices, call `pangulu_finalize()` after completing the solution of one matrix, and then use `pangulu_init()` to to initialise the next matrix.
127 | 
128 | ## Executing the example code of PanguLU
129 | The test routines are placed in the `examples` directory. The routine in `examples/example.c` firstly call `pangulu_gstrf()` to perform LU factorisation, and then call `pangulu_gstrs()` to solve linear equation.
130 | #### run command
131 | 
132 | > **mpirun -np process_count ./pangulu_example.elf -nb block_size -f path_to_mtx -r path_to_rhs**
133 |  
134 | process_count : MPI process number to launch PanguLU;
135 | 
136 | block_size : Rank of each non-zero block;
137 | 
138 | path_to_mtx : The matrix name in mtx format;
139 | 
140 | path_to_rhs : The path of the right-hand side vector. (optional)
141 | 
142 | You can also use the run.sh, for example:
143 | 
144 | > **bash run path_to_mtx block_size process_count**
145 | 
146 | #### test sample
147 | 
148 | > **mpirun -np 4 ./pangulu_example.elf -nb 10 -f Trefethen_20b.mtx**
149 | 
150 | or use the run.sh:
151 | > **bash run.sh Trefethen_20b.mtx 10 4**
152 | 
153 | 
154 | In this example, 4 processes are used to test, the block_size is 10, matrix name is Trefethen_20b.mtx.
155 | 
156 | 
157 | ## Release versions
158 | 
159 | #### <p align='left'>Version 5.0.0 (Jul. 31, 2025) </p>
160 | 
161 | * Added a task aggregator to increase numeric factorisation performance;
162 | * Optimised performance of preprocessing phase;
163 | * Added parallel reordering algorithm on CPU as the default reordering algorithm;
164 | * Optimised GPU memory layout to reduce GPU memory usage.
165 | 
166 | #### <p align='left'>Version 4.1.0 (Sep. 1, 2024) </p>
167 | 
168 | * Optimised memory usage of numeric factorisation and solving;
169 | * Added parallel building support.
170 | 
171 | #### <p align='left'>Version 4.0.0 (Jul. 24, 2024) </p>
172 | 
173 | * Optimised user interfaces of solver routines;
174 | * Optimised performamce of numeric factorisation phase on CPU platform;
175 | * Added support on complex matrix solving;
176 | * Optimised preprocessing performance;
177 | 
178 | #### <p align='left'>Version 3.5.0 (Aug. 06, 2023) </p>
179 | 
180 | * Updated the pre-processing phase with OpenMP.
181 | * Updated the compilation method of PanguLU, compile libpangulu.so and libpangulu.a at the same time.
182 | * Updated timing for the reorder phase, the symbolic factorisation phase, the pre-processing phase.
183 | * Added GFLOPS for the numeric factorisation phase.
184 |  
185 | #### <p align='left'>Version 3.0.0 (Apr. 02, 2023) </p>
186 | 
187 | * Used adaptive selection sparse BLAS in the numeric factorisation phase.
188 | * Added the reorder phase.
189 | * Added the symbolic factorisation phase. 
190 | * Added mc64 sorting algorithm in the reorder phase.
191 | * Added interface for 64-bit metis package in the reorder phase.
192 | 
193 | #### <p align='left'> Version 2.0.0 (Jul. &thinsp;22, 2022) </p>
194 | 
195 | * Used a synchronisation-free scheduling strategy in the numeric factorisation phase.
196 | * Updated the MPI communication method in the numeric factorisation phase.
197 | * Added single precision in the numeric factorisation phase.
198 | 
199 | #### <p align='left'>Version 1.0.0 (Oct. 19, 2021) </p>
200 | 
201 | * Used a rule-based 2D LU factorisation scheduling strategy.
202 | * Used Sparse BLAS for floating point calculations on GPUs.
203 | * Added the pre-processing phase.
204 | * Added the numeric factorisation phase.
205 | * Added the triangular solve phase.
206 | 
207 | ## Reference
208 | 
209 | * [1] Xu Fu, Bingbin Zhang, Tengcheng Wang, Wenhao Li, Yuechen Lu, Enxin Yi, Jianqi Zhao, Xiaohan Geng, Fangying Li, Jingwen Zhang, Zhou Jin, Weifeng Liu. PanguLU: A Scalable Regular Two-Dimensional Block-Cyclic Sparse Direct Solver on Distributed Heterogeneous Systems. 36th ACM/IEEE International Conference for High Performance Computing, Networking, Storage, and Analysis (SC '23). 2023.
210 | 
211 | 
212 | 


--------------------------------------------------------------------------------
/build_helper.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | import csv
  3 | import os
  4 | import sys
  5 | import subprocess
  6 | import time
  7 | 
  8 | platform_functions_info = [
  9 |     ["malloc", "void** platform_address, size_t size", "platform_address, size"],
 10 |     ["malloc_pinned", "void** platform_address, size_t size", "platform_address, size"],
 11 |     ["synchronize", "", ""],
 12 |     ["memset", "void* s, int c, size_t n", "s, c, n"],
 13 |     ["create_stream", "void** stream", "stream"],
 14 |     ["memcpy", "void *dst, const void *src, size_t count, unsigned int kind", "dst, src, count, kind"],
 15 |     ["memcpy_async", "void *dst, const void *src, size_t count, unsigned int kind, void* stream", "dst, src, count, kind, stream"],
 16 |     ["free", "void* devptr", "devptr"],
 17 |     ["get_device_num", "int* device_num", "device_num"],
 18 |     ["set_default_device", "int device_num", "device_num"],
 19 |     ["get_device_name", "char* name, int device_num", "name, device_num"],
 20 |     ["get_device_memory_usage", "size_t* used_byte", "used_byte"],
 21 |     
 22 |     ["getrf", "pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, int tid", "nb, opdst, tid"],
 23 |     ["tstrf", "pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* opdiag, int tid", "nb, opdst, opdiag, tid"],
 24 |     ["gessm", "pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* opdiag, int tid", "nb, opdst, opdiag, tid"],
 25 |     ["ssssm", "pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* op1, pangulu_storage_slot_t* op2, int tid", "nb, opdst, op1, op2, tid"],
 26 |     ["ssssm_batched", "pangulu_inblock_idx nb, pangulu_uint64_t ntask, pangulu_task_t* tasks", "nb, ntask, tasks"],
 27 |     ["hybrid_batched", "pangulu_inblock_idx nb, pangulu_uint64_t ntask, pangulu_task_t* tasks", "nb, ntask, tasks"],
 28 | 
 29 |     ["spmv",  "pangulu_inblock_idx nb, pangulu_storage_slot_t* a, calculate_type* x, calculate_type* y", "nb, a, x, y"],
 30 |     ["vecadd","pangulu_int64_t length, calculate_type *bval, calculate_type *xval", "length, bval, xval"],
 31 |     ["sptrsv","pangulu_inblock_idx nb, pangulu_storage_slot_t *s, calculate_type* xval, pangulu_int64_t uplo", "nb, s, xval, uplo"],
 32 | ]
 33 | 
 34 | def generate_platform_names(build_list_path, platform_list_path):
 35 |     build_name_list = []
 36 |     with open(build_list_path, "r") as f:
 37 |         build_reader = csv.reader(f)
 38 |         for build_item in build_reader:
 39 |             if len(build_item) < 1:
 40 |                 continue
 41 |             build_name_list.append(build_item[0])
 42 |             
 43 |     platform_list = []
 44 |     with open(platform_list_path, "r") as f:
 45 |         platform_reader = csv.reader(f)
 46 |         for platform_item in platform_reader:
 47 |             platform_list.append(platform_item)
 48 |     
 49 |     build_name_list_ret = []
 50 |     for name in build_name_list:
 51 |         for platform in platform_list:
 52 |             if len(platform) < 2:
 53 |                 continue
 54 |             if platform[1] == name:
 55 |                 build_name_list_ret.append(platform)
 56 |                 break
 57 |     return build_name_list_ret
 58 | 
 59 | 
 60 | def generate_platform_paths(build_platform_names, platform_list_path):
 61 |     platform_paths = []
 62 |     for platform in build_platform_names:
 63 |         platform_id = platform[0]
 64 |         assert(len(platform_id) == 7)
 65 |         platform_id_l1 = platform_id[0:2]
 66 |         platform_id_l2 = platform_id[2:4]
 67 |         platform_id_l3 = platform_id[4:7]
 68 |         dir_l1 = None
 69 |         dir_l2 = None
 70 |         dir_l3 = None
 71 |         dirs_l1 = [file for file in os.listdir(os.path.dirname(platform_list_path))]
 72 |         for current_dir_l1 in dirs_l1:
 73 |             if current_dir_l1[:2] == platform_id_l1:
 74 |                 dir_l1 = current_dir_l1
 75 |                 break
 76 |         dirs_l2 = [file for file in os.listdir(os.path.join(os.path.dirname(platform_list_path), dir_l1))]
 77 |         for current_dir_l2 in dirs_l2:
 78 |             if current_dir_l2[:2] == platform_id_l2:
 79 |                 dir_l2 = current_dir_l2
 80 |                 break
 81 |         dirs_l3 = [file for file in os.listdir(os.path.join(os.path.dirname(platform_list_path), dir_l1, dir_l2))]
 82 |         for current_dir_l3 in dirs_l3:
 83 |             if current_dir_l3[:3] == platform_id_l3:
 84 |                 dir_l3 = current_dir_l3
 85 |                 break
 86 |         platform_paths.append([platform_id, f"platforms/{dir_l1}/{dir_l2}/{dir_l3}"])
 87 |     return platform_paths
 88 | 
 89 | 
 90 | def generate_platform_function(platforms, function_info):
 91 |     function_lastname, formal_params, real_params = function_info
 92 |     ret = ""
 93 |     if len(formal_params) == 0:
 94 |         ret += f"void pangulu_platform_{function_lastname}(pangulu_platform_t platform)" + "{"
 95 |     else:
 96 |         ret += f"void pangulu_platform_{function_lastname}({formal_params}, pangulu_platform_t platform)" + "{"
 97 |     ret += "switch(platform){"
 98 |     for platform in platforms:
 99 |         ret += f'case PANGULU_PLATFORM_{platform[1]} : pangulu_platform_{platform[0]}_{function_lastname}({real_params}); break;'
100 |     ret += f'default: printf("No platform implementation for function pangulu_platform_{platform[0]}_{function_lastname}.\\n"); break;'
101 |     ret += "}}"
102 |     return ret
103 | 
104 | 
105 | def generate_platform_helper_c(dst_dir, build_list_path, platform_list_path):
106 |     dont_modify_warning = '''// Warning : Don't modify this file directly.
107 | // This file is automatically generated by build_helper.py.
108 | // This file will be regenerated after the next compilation.
109 | // All changes will be lost.
110 | '''
111 |     h_file_str = ""
112 |     h_file_str += dont_modify_warning
113 |     h_file_str += """#include "pangulu_common.h"
114 | #include "./platforms/pangulu_platform_common.h"
115 | """
116 |     build_platform_names = generate_platform_names(build_list_path, platform_list_path)
117 |         
118 |     for function_info in platform_functions_info:
119 |         h_file_str += generate_platform_function(build_platform_names, function_info)
120 |         h_file_str += "\n"
121 |     with open(os.path.join(dst_dir, "pangulu_platform_helper.c"), "w") as f:
122 |         f.write(h_file_str)
123 | 
124 | 
125 | def generate_platform_helper_h(dst_dir, build_list_path, platform_list_path):
126 |     dont_modify_warning = '''// Warning : Don't modify this file directly.
127 | // This file is automatically generated by build_helper.py.
128 | // This file will be regenerated after the next compilation.
129 | // All changes will be lost.
130 | '''
131 |     h_file_str = ""
132 |     h_file_str += dont_modify_warning
133 |     h_file_str += """#ifndef PANGULU_PLATFORM_HELPER
134 | #define PANGULU_PLATFORM_HELPER
135 | """
136 |     build_platform_names = generate_platform_names(build_list_path, platform_list_path)
137 |     
138 |     build_platform_paths = generate_platform_paths(build_platform_names, platform_list_path)
139 |     for platform_path in build_platform_paths:
140 |         h_file_str += f'#include "{os.path.join(platform_path[1].split("platforms/")[-1], f"pangulu_platform_{platform_path[0]}.h")}"\n'
141 | 
142 |     h_file_str += "typedef unsigned long long pangulu_platform_t;\n"
143 |     
144 |     for platform in build_platform_names:
145 |         h_file_str += f"#define PANGULU_PLATFORM_{platform[1]} 0x{platform[0]}\n"
146 | 
147 |     for function_info in platform_functions_info:
148 |         h_file_str += generate_platform_function(build_platform_names, function_info).split("{")[0]+";"
149 |         h_file_str += "\n"
150 | 
151 |     h_file_str += "#endif\n"
152 |     with open(os.path.join(dst_dir, "pangulu_platform_common.h"), "w") as f:
153 |         f.write(h_file_str)
154 | 
155 | 
156 | def compile_platform_code(build_list_path, platform_list_path):
157 |     build_platform_names = generate_platform_names(build_list_path, platform_list_path)
158 |     build_platform_paths = generate_platform_paths(build_platform_names, platform_list_path)
159 |     for build_platform_path in build_platform_paths:
160 |         command = f"make -C src/{build_platform_path[1]}"
161 |         print(command)
162 |         return_code = subprocess.call(command.split())
163 |         if return_code != 0:
164 |             exit(return_code)
165 | 
166 | def generate_build_info_h(dst_dir):
167 |     dont_modify_warning = '''// Warning : Don't modify this file directly.
168 | // This file is automatically generated by build_helper.py.
169 | // This file will be regenerated after the next compilation.
170 | // All changes will be lost.
171 | '''
172 |     h_file_str = ""
173 |     h_file_str += dont_modify_warning
174 |     h_file_str += """#ifndef PANGULU_BUILD_INFO
175 | #define PANGULU_BUILD_INFO
176 | """
177 | 
178 |     h_file_str += F'''const char* pangulu_build_time = "{time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}";\n'''
179 |     
180 |     h_file_str += "#endif"
181 |     with open(os.path.join(dst_dir, "pangulu_build_info.h"), "w") as f:
182 |         f.write(h_file_str)
183 | 
184 |     
185 | if __name__ == "__main__":
186 |     if sys.argv[1] == "generate_platform_helper":
187 |         # generate_build_info_h("src/")
188 |         generate_platform_helper_h("src/platforms/", "build_list.csv", "src/platforms/platform_list.csv")
189 |         generate_platform_helper_c("src/", "build_list.csv", "src/platforms/platform_list.csv")
190 |     elif sys.argv[1] == "compile_platform_code":
191 |         compile_platform_code("build_list.csv", "src/platforms/platform_list.csv")
192 |     else:
193 |         print("[BUILD_HELPER_ERROR] Unknown command.")
194 |         exit(1)


--------------------------------------------------------------------------------
/build_list.csv:
--------------------------------------------------------------------------------
1 | CPU_NAIVE
2 | GPU_CUDA


--------------------------------------------------------------------------------
/examples/Makefile:
--------------------------------------------------------------------------------
 1 | LINK_REORDERING = ../reordering_omp/lib/libmynd.so
 2 | #LINK_METIS = /path/to/metis/lib/libmetis.so
 3 | LINK_OPENBLAS = /path/to/openblas/lib/libopenblas.so
 4 | LINK_CUDA = -L/path/to/cuda/lib64 -lcudart -lstdc++ -lcusolver -lcublas
 5 | LINK_PANGULU = ../lib/libpangulu.a
 6 | 
 7 | all: pangulu_example.elf
 8 | 
 9 | pangulu_example.elf:example.c
10 | 	mpicc -O3 $< -DCALCULATE_TYPE_R64 -I../include $(LINK_PANGULU) $(LINK_CUDA) $(LINK_REORDERING) $(LINK_METIS) $(LINK_OPENBLAS)  -fopenmp -lpthread -lm -o $@
11 | 
12 | clean:
13 | 	rm -f *.elf
14 | 


--------------------------------------------------------------------------------
/examples/Trefethen_20b.mtx:
--------------------------------------------------------------------------------
  1 | %%MatrixMarket matrix coordinate integer symmetric
  2 | %-------------------------------------------------------------------------------
  3 | % UF Sparse Matrix Collection, Tim Davis
  4 | % http://www.cise.ufl.edu/research/sparse/matrices/JGD_Trefethen/Trefethen_20b
  5 | % name: JGD_Trefethen/Trefethen_20b
  6 | % [Diagonal matrices with primes, Nick Trefethen, Oxford Univ.]
  7 | % id: 2203
  8 | % date: 2008
  9 | % author: N. Trefethen
 10 | % ed: J.-G. Dumas
 11 | % fields: name title A id date author ed kind notes
 12 | % kind: combinatorial problem
 13 | %-------------------------------------------------------------------------------
 14 | % notes:
 15 | % Diagonal matrices with primes, Nick Trefethen, Oxford Univ.          
 16 | % From Jean-Guillaume Dumas' Sparse Integer Matrix Collection,         
 17 | % http://ljk.imag.fr/membres/Jean-Guillaume.Dumas/simc.html            
 18 | %                                                                      
 19 | % Problem 7 of the Hundred-dollar, Hundred-digit Challenge Problems,   
 20 | % SIAM News, vol 35, no. 1.                                            
 21 | %                                                                      
 22 | % 7. Let A be the 20,000 x 20,000 matrix whose entries are zero        
 23 | % everywhere except for the primes 2, 3, 5, 7, . . . , 224737 along the
 24 | % main diagonal and the number 1 in all the positions A(i,j) with      
 25 | % |i-j| = 1,2,4,8, . . . ,16384.  What is the (1,1) entry of inv(A)?   
 26 | %                                                                      
 27 | % http://www.siam.org/news/news.php?id=388                             
 28 | %                                                                      
 29 | % Filename in JGD collection: Trefethen/trefethen_20__19_minor.sms     
 30 | %-------------------------------------------------------------------------------
 31 | 19 19 83
 32 | 1 1 3
 33 | 2 1 1
 34 | 3 1 1
 35 | 5 1 1
 36 | 9 1 1
 37 | 17 1 1
 38 | 2 2 5
 39 | 3 2 1
 40 | 4 2 1
 41 | 6 2 1
 42 | 10 2 1
 43 | 18 2 1
 44 | 3 3 7
 45 | 4 3 1
 46 | 5 3 1
 47 | 7 3 1
 48 | 11 3 1
 49 | 19 3 1
 50 | 4 4 11
 51 | 5 4 1
 52 | 6 4 1
 53 | 8 4 1
 54 | 12 4 1
 55 | 5 5 13
 56 | 6 5 1
 57 | 7 5 1
 58 | 9 5 1
 59 | 13 5 1
 60 | 6 6 17
 61 | 7 6 1
 62 | 8 6 1
 63 | 10 6 1
 64 | 14 6 1
 65 | 7 7 19
 66 | 8 7 1
 67 | 9 7 1
 68 | 11 7 1
 69 | 15 7 1
 70 | 8 8 23
 71 | 9 8 1
 72 | 10 8 1
 73 | 12 8 1
 74 | 16 8 1
 75 | 9 9 29
 76 | 10 9 1
 77 | 11 9 1
 78 | 13 9 1
 79 | 17 9 1
 80 | 10 10 31
 81 | 11 10 1
 82 | 12 10 1
 83 | 14 10 1
 84 | 18 10 1
 85 | 11 11 37
 86 | 12 11 1
 87 | 13 11 1
 88 | 15 11 1
 89 | 19 11 1
 90 | 12 12 41
 91 | 13 12 1
 92 | 14 12 1
 93 | 16 12 1
 94 | 13 13 43
 95 | 14 13 1
 96 | 15 13 1
 97 | 17 13 1
 98 | 14 14 47
 99 | 15 14 1
100 | 16 14 1
101 | 18 14 1
102 | 15 15 53
103 | 16 15 1
104 | 17 15 1
105 | 19 15 1
106 | 16 16 59
107 | 17 16 1
108 | 18 16 1
109 | 17 17 61
110 | 18 17 1
111 | 19 17 1
112 | 18 18 67
113 | 19 18 1
114 | 19 19 71
115 | 


--------------------------------------------------------------------------------
/examples/example.c:
--------------------------------------------------------------------------------
  1 | typedef unsigned long long int sparse_pointer_t;
  2 | #define MPI_SPARSE_POINTER_T MPI_UNSIGNED_LONG_LONG
  3 | #define FMT_SPARSE_POINTER_T "%llu"
  4 | 
  5 | typedef unsigned int sparse_index_t;
  6 | #define MPI_SPARSE_INDEX_T MPI_UNSIGNED
  7 | #define FMT_SPARSE_INDEX_T "%u"
  8 | 
  9 | #if defined(CALCULATE_TYPE_R64)
 10 | typedef double sparse_value_t;
 11 | #elif defined(CALCULATE_TYPE_R32)
 12 | typedef float sparse_value_t;
 13 | #elif defined(CALCULATE_TYPE_CR64)
 14 | typedef double _Complex sparse_value_t;
 15 | typedef double sparse_value_real_t;
 16 | #define COMPLEX_MTX
 17 | #elif defined(CALCULATE_TYPE_CR32)
 18 | typedef float _Complex sparse_value_t;
 19 | typedef float sparse_value_real_t;
 20 | #define COMPLEX_MTX
 21 | #else
 22 | typedef double sparse_value_t;
 23 | #error [example.c Compile Error] Unknown value type. Set -DCALCULATE_TYPE_CR64 or -DCALCULATE_TYPE_R64 or -DCALCULATE_TYPE_CR32 or -DCALCULATE_TYPE_R32 in compile command line.
 24 | #endif
 25 | 
 26 | #include "../include/pangulu.h"
 27 | #include <sys/resource.h>
 28 | #include <getopt.h>
 29 | #include <stdio.h>
 30 | #include <mpi.h>
 31 | #include <math.h>
 32 | #include "mmio_highlevel.h"
 33 | 
 34 | #ifdef COMPLEX_MTX
 35 | sparse_value_real_t complex_fabs(sparse_value_t x)
 36 | {
 37 |     return sqrt(__real__(x) * __real__(x) + __imag__(x) * __imag__(x));
 38 | }
 39 | 
 40 | sparse_value_t complex_sqrt(sparse_value_t x)
 41 | {
 42 |     sparse_value_t y;
 43 |     __real__(y) = sqrt(complex_fabs(x) + __real__(x)) / sqrt(2);
 44 |     __imag__(y) = (sqrt(complex_fabs(x) - __real__(x)) / sqrt(2)) * (__imag__(x) > 0 ? 1 : __imag__(x) == 0 ? 0
 45 |                                                                                                             : -1);
 46 |     return y;
 47 | }
 48 | #endif
 49 | 
 50 | void read_command_params(int argc, char **argv, char *mtx_name, char *rhs_name, int *nb)
 51 | {
 52 |     int c;
 53 |     extern char *optarg;
 54 |     while ((c = getopt(argc, argv, "nb:f:r:")) != EOF)
 55 |     {
 56 |         switch (c)
 57 |         {
 58 |         case 'b':
 59 |             *nb = atoi(optarg);
 60 |             continue;
 61 |         case 'f':
 62 |             strcpy(mtx_name, optarg);
 63 |             continue;
 64 |         case 'r':
 65 |             strcpy(rhs_name, optarg);
 66 |             continue;
 67 |         }
 68 |     }
 69 |     if ((nb) == 0)
 70 |     {
 71 |         printf("Error : nb is 0\n");
 72 |         exit(1);
 73 |     }
 74 | }
 75 | 
 76 | int main(int ARGC, char **ARGV)
 77 | {
 78 |     // Step 1: Create varibles, initialize MPI environment.
 79 |     int provided = 0;
 80 |     int rank = 0, size = 0;
 81 |     int nb = 0;
 82 |     MPI_Init_thread(&ARGC, &ARGV, MPI_THREAD_MULTIPLE, &provided);
 83 |     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 84 |     MPI_Comm_size(MPI_COMM_WORLD, &size);
 85 |     sparse_index_t m = 0, n = 0, is_sym = 0;
 86 |     sparse_pointer_t nnz;
 87 |     sparse_pointer_t *colptr = NULL;
 88 |     sparse_index_t *rowidx = NULL;
 89 |     sparse_value_t *value = NULL;
 90 |     sparse_value_t *sol = NULL;
 91 |     sparse_value_t *rhs = NULL;
 92 | 
 93 |     // Step 2: Read matrix and rhs vectors.
 94 |     if (rank == 0)
 95 |     {
 96 |         char mtx_name[200] = {'\0'};
 97 |         char rhs_name[200] = {'\0'};
 98 |         read_command_params(ARGC, ARGV, mtx_name, rhs_name, &nb);
 99 | 
100 |         switch (mtx_name[strlen(mtx_name) - 1])
101 |         {
102 |         case 'x':
103 |             // mtx read (csc)
104 |             printf("Reading mtx matrix %s\n", mtx_name);
105 |             mmio_info(&m, &n, &nnz, &is_sym, mtx_name);
106 |             colptr = (sparse_pointer_t *)malloc(sizeof(sparse_pointer_t) * (n + 1));
107 |             rowidx = (sparse_index_t *)malloc(sizeof(sparse_index_t) * nnz);
108 |             value = (sparse_value_t *)malloc(sizeof(sparse_value_t) * nnz);
109 |             mmio_data_csc(colptr, rowidx, value, mtx_name);
110 |             printf("Read mtx done.\n");
111 |             break;
112 |         case 'd':
113 | 
114 |             // lid read
115 |             printf("Reading lid matrix %s\n", mtx_name);
116 |             FILE *lid_file = fopen(mtx_name, "r");
117 |             fread(&m, sizeof(sparse_index_t), 1, lid_file);
118 |             fread(&n, sizeof(sparse_index_t), 1, lid_file);
119 |             fread(&nnz, sizeof(sparse_pointer_t), 1, lid_file);
120 |             sparse_pointer_t *rowptr = (sparse_pointer_t *)malloc(sizeof(sparse_pointer_t) * (n + 1));
121 |             sparse_index_t *colidx = (sparse_index_t *)malloc(sizeof(sparse_index_t) * nnz);
122 |             sparse_value_t *value_csr = (sparse_value_t *)malloc(sizeof(sparse_value_t) * nnz);
123 |             fread(rowptr, sizeof(sparse_pointer_t), n + 1, lid_file);
124 |             fread(colidx, sizeof(sparse_index_t), nnz, lid_file);
125 |             fread(value_csr, sizeof(sparse_value_t), nnz, lid_file);
126 |             fclose(lid_file);
127 | 
128 |             colptr = (sparse_pointer_t *)malloc(sizeof(sparse_pointer_t) * (n + 1));
129 |             rowidx = (sparse_index_t *)malloc(sizeof(sparse_index_t) * nnz);
130 |             value = (sparse_value_t *)malloc(sizeof(sparse_value_t) * nnz);
131 |             memset(colptr, 0, sizeof(sparse_pointer_t) * (n + 1));
132 |             sparse_pointer_t *trans_aid = (sparse_pointer_t *)malloc(sizeof(sparse_pointer_t) * n);
133 |             memset(trans_aid, 0, sizeof(sparse_pointer_t) * n);
134 |             for (sparse_index_t row = 0; row < n; row++)
135 |             {
136 |                 for (sparse_pointer_t idx = rowptr[row]; idx < rowptr[row + 1]; idx++)
137 |                 {
138 |                     sparse_index_t col = colidx[idx];
139 |                     colptr[col + 1]++;
140 |                 }
141 |             }
142 |             for (sparse_index_t row = 0; row < n; row++)
143 |             {
144 |                 colptr[row + 1] += colptr[row];
145 |             }
146 |             memcpy(trans_aid, colptr, sizeof(sparse_pointer_t) * n);
147 |             for (sparse_index_t row = 0; row < n; row++)
148 |             {
149 |                 for (sparse_pointer_t idx = rowptr[row]; idx < rowptr[row + 1]; idx++)
150 |                 {
151 |                     sparse_index_t col = colidx[idx];
152 |                     rowidx[trans_aid[col]] = row;
153 |                     value[trans_aid[col]] = value_csr[idx];
154 |                     trans_aid[col]++;
155 |                 }
156 |             }
157 |             free(rowptr);
158 |             free(colidx);
159 |             free(value_csr);
160 |             free(trans_aid);
161 |             printf("Read lid done.\n");
162 | 
163 |             break;
164 |         }
165 | 
166 |         sol = (sparse_value_t *)malloc(sizeof(sparse_value_t) * n);
167 |         rhs = (sparse_value_t *)malloc(sizeof(sparse_value_t) * n);
168 |         memset(rhs, 0, sizeof(sparse_value_t) * n);
169 |         if (rhs_name[0])
170 |         {
171 |             FILE *rhs_file = fopen(rhs_name, "r");
172 |             if (!rhs_file)
173 |             {
174 |                 fprintf(stderr, "Error: Failed to open rhs file '%s'\n",
175 |                         rhs_name);
176 |                 exit(1);
177 |             }
178 | 
179 |             char line[512];
180 |             int found_data = 0;
181 |             while (fgets(line, sizeof(line), rhs_file))
182 |             {
183 |                 if (line[0] != '%')
184 |                 {
185 |                     found_data = 1;
186 |                     break;
187 |                 }
188 |             }
189 | 
190 |             if (!found_data)
191 |             {
192 |                 fprintf(stderr, "Error: File '%s' contains only comments or is empty\n", rhs_name);
193 |                 fclose(rhs_file);
194 |                 exit(1);
195 |             }
196 | 
197 |             int rhs_len;
198 |             if (sscanf(line, "%d", &rhs_len) != 1)
199 |             {
200 |                 fprintf(stderr, "Error: Failed to read vector dimension from '%s'\n", rhs_name);
201 |                 fclose(rhs_file);
202 |                 exit(1);
203 |             }
204 | 
205 |             if (rhs_len != n)
206 |             {
207 |                 fprintf(stderr, "Error: Vector dimension mismatch - expected %d, got %d\n", n, rhs_len);
208 |                 fclose(rhs_file);
209 |                 exit(1);
210 |             }
211 | 
212 |             for (int i = 0; i < n; i++)
213 |             {
214 |                 int read_success = 0;
215 | 
216 | #ifdef COMPLEX_MTX
217 |                 double real_part, imag_part;
218 |                 if (fscanf(rhs_file, "%le %le", &real_part, &imag_part) == 2)
219 |                 {
220 |                     __real__(rhs[i]) = real_part;
221 |                     __imag__(rhs[i]) = imag_part;
222 |                     read_success = 1;
223 |                 }
224 | #else
225 |                 if (fscanf(rhs_file, "%lf", &rhs[i]) == 1)
226 |                 {
227 |                     read_success = 1;
228 |                 }
229 | #endif
230 | 
231 |                 if (!read_success)
232 |                 {
233 |                     fprintf(stderr, "Error: Failed to read vector element %d from '%s'\n", i, rhs_name);
234 |                     fclose(rhs_file);
235 |                     exit(1);
236 |                 }
237 | 
238 |                 sol[i] = rhs[i];
239 |             }
240 | 
241 |             fclose(rhs_file);
242 |             printf("Successfully read rhs from '%s'\n", rhs_name);
243 |         }
244 |         else
245 |         {
246 |             if (!colptr || !value)
247 |             {
248 |                 fprintf(stderr, "Error: Invalid matrix data for rhs generation\n");
249 |                 exit(1);
250 |             }
251 | 
252 |             for(int i=0; i < n; i++){
253 |                 rhs[i] = 0;
254 |             }
255 |             for (int i = 0; i < n; i++)
256 |             {
257 |                 for (sparse_pointer_t j = colptr[i]; j < colptr[i + 1]; j++)
258 |                 {
259 |                     rhs[rowidx[j]] += value[j];
260 |                 }
261 |             }
262 |             for(int i=0; i < n; i++){
263 |                 sol[i] = rhs[i];
264 |             }
265 | 
266 |             printf("Successfully generated rhs from matrix\n");
267 |         }
268 | 
269 |         if((m != n) || m == 0){
270 |             printf("Matrix A is %d * %d. Exit.\n", m, n);
271 |             exit(1);
272 |         }
273 |     }
274 |     MPI_Bcast(&n, 1, MPI_SPARSE_INDEX_T, 0, MPI_COMM_WORLD);
275 |     MPI_Bcast(&nb, 1, MPI_INT, 0, MPI_COMM_WORLD);
276 |     MPI_Bcast(&nnz, 1, MPI_LONG_LONG_INT, 0, MPI_COMM_WORLD);
277 |     MPI_Barrier(MPI_COMM_WORLD);
278 | 
279 |     // Step 3: Initialize PanguLU solver.
280 |     pangulu_init_options init_options;
281 |     init_options.nb = nb;
282 |     init_options.gpu_kernel_warp_per_block = 4;
283 |     init_options.gpu_data_move_warp_per_block = 4;
284 |     init_options.nthread = 1;
285 |     init_options.reordering_nthread = 4;
286 |     init_options.sizeof_value = sizeof(sparse_value_t);
287 |     #ifdef COMPLEX_MTX
288 |     init_options.is_complex_matrix = 1;
289 |     #else
290 |     init_options.is_complex_matrix = 0;
291 |     #endif
292 |     init_options.mpi_recv_buffer_level = 0.5;
293 |     void *pangulu_handle;
294 |     pangulu_init(n, nnz, colptr, rowidx, value, &init_options, &pangulu_handle);
295 | 
296 |     // Step 4: Execute LU factorisation.
297 |     pangulu_gstrf_options gstrf_options;
298 |     pangulu_gstrf(&gstrf_options, &pangulu_handle);
299 | 
300 |     // Step 5: Execute triangle solve using factorize results.
301 |     pangulu_gstrs_options gstrs_options;
302 |     pangulu_gstrs(sol, &gstrs_options, &pangulu_handle);
303 |     MPI_Barrier(MPI_COMM_WORLD);
304 | 
305 |     // Step 6: Check the answer.
306 |     sparse_value_t *rhs_computed;
307 |     if (rank == 0)
308 |     {
309 |         // Step 6.1: Calculate rhs_computed = A * x.
310 |         rhs_computed = (sparse_value_t *)malloc(sizeof(sparse_value_t) * n);
311 |         memset(rhs_computed, 0, sizeof(sparse_value_t) * n);
312 |         for (int i = 0; i < n; i++)
313 |         {
314 |             for (sparse_pointer_t j = colptr[i]; j < colptr[i + 1]; j++)
315 |             {
316 |                 rhs_computed[rowidx[j]] += value[j] * sol[i];
317 |             }
318 |         }
319 | 
320 |         // Step 6.2: Calculate residual residual = rhs_comuted - rhs.
321 |         sparse_value_t *residual = rhs_computed;
322 |         for (int i = 0; i < n; i++)
323 |         {
324 |             residual[i] = rhs_computed[i] - rhs[i];
325 |         }
326 | 
327 |         sparse_value_t sum, c;
328 |         // Step 6.3: Calculte norm2 of residual.
329 |         sum = 0.0;
330 |         c = 0.0;
331 |         for (int i = 0; i < n; i++)
332 |         {
333 |             sparse_value_t num = residual[i] * residual[i];
334 |             sparse_value_t z = num - c;
335 |             sparse_value_t t = sum + z;
336 |             c = (t - sum) - z;
337 |             sum = t;
338 |         }
339 | #ifdef COMPLEX_MTX
340 |         sparse_value_real_t residual_norm2 = complex_fabs(complex_sqrt(sum));
341 | #else
342 |         sparse_value_t residual_norm2 = sqrt(sum);
343 | #endif
344 | 
345 |         // Step 6.4: Calculte norm2 of original rhs.
346 |         sum = 0.0;
347 |         c = 0.0;
348 |         for (int i = 0; i < n; i++)
349 |         {
350 |             sparse_value_t num = rhs[i] * rhs[i];
351 |             sparse_value_t z = num - c;
352 |             sparse_value_t t = sum + z;
353 |             c = (t - sum) - z;
354 |             sum = t;
355 |         }
356 | #ifdef COMPLEX_MTX
357 |         sparse_value_real_t rhs_norm2 = complex_fabs(complex_sqrt(sum));
358 | #else
359 |         sparse_value_t rhs_norm2 = sqrt(sum);
360 | #endif
361 | 
362 |         // Step 6.5: Calculate relative residual.
363 |         double relative_residual = residual_norm2 / rhs_norm2;
364 |         printf("|| Ax - b || / || b || = %le\n", relative_residual);
365 |     }
366 | 
367 |     // Step 7: Clean and finalize.
368 |     pangulu_finalize(&pangulu_handle);
369 |     if (rank == 0)
370 |     {
371 |         free(colptr);
372 |         free(rowidx);
373 |         free(value);
374 |         free(sol);
375 |         free(rhs);
376 |         free(rhs_computed);
377 |     }
378 |     MPI_Finalize();
379 | }
380 | 


--------------------------------------------------------------------------------
/examples/run.sh:
--------------------------------------------------------------------------------
 1 | mtx_path=$1
 2 | nb=$2
 3 | np=$3
 4 | 
 5 | if [ ! -f $mtx_path ];then
 6 |   echo "$mtx_path is not a file."
 7 |   exit
 8 | fi
 9 | 
10 | echo mpirun -np $np ./pangulu_example.elf -nb $nb -f $mtx_path
11 | 
12 | mpirun -np $np ./pangulu_example.elf -nb $nb -f $mtx_path
13 | 


--------------------------------------------------------------------------------
/include/pangulu.h:
--------------------------------------------------------------------------------
 1 | #ifndef PANGULU_H
 2 | #define PANGULU_H
 3 | 
 4 | #include "pangulu_interface_common.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C"
 8 | {
 9 | #endif // __cplusplus
10 | 
11 |     void pangulu_init(sparse_index_t pangulu_n, sparse_pointer_t pangulu_nnz, sparse_pointer_t *csc_colptr, sparse_index_t *csc_rowidx, sparse_value_t *csc_value, pangulu_init_options *init_options, void **pangulu_handle);
12 |     void pangulu_gstrf(pangulu_gstrf_options *gstrf_options, void **pangulu_handle);
13 |     void pangulu_gstrs(sparse_value_t *rhs, pangulu_gstrs_options *gstrs_options, void **pangulu_handle);
14 |     void pangulu_gssv(sparse_value_t *rhs, pangulu_gstrf_options *gstrf_options, pangulu_gstrs_options *gstrs_options, void **pangulu_handle);
15 |     void pangulu_finalize(void **pangulu_handle);
16 | 
17 | #ifdef __cplusplus
18 | }
19 | #endif // __cplusplus
20 | 
21 | #endif // PANGULU_H
22 | 


--------------------------------------------------------------------------------
/include/pangulu_interface_common.h:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | typedef struct pangulu_init_options
 4 | {
 5 |     int nthread;
 6 |     int nb;
 7 |     int gpu_kernel_warp_per_block;
 8 |     int gpu_data_move_warp_per_block;
 9 |     int reordering_nthread;
10 |     int sizeof_value;
11 |     int is_complex_matrix;
12 |     float mpi_recv_buffer_level;
13 | }pangulu_init_options;
14 | 
15 | typedef struct pangulu_gstrf_options
16 | {
17 | }pangulu_gstrf_options;
18 | 
19 | typedef struct pangulu_gstrs_options
20 | {
21 | }pangulu_gstrs_options;
22 | 


--------------------------------------------------------------------------------
/lib/Makefile:
--------------------------------------------------------------------------------
 1 | include ../make.inc
 2 | 
 3 | all : oclean
 4 | 
 5 | libs : libpangulu.so libpangulu.a
 6 | 
 7 | libpangulu.so:
 8 | 	$(MPICC) $(MPICCFLAGS) -shared -fPIC -o $@ ./pangulu*.o
 9 | libpangulu.a:
10 | 	ar -rv -o $@ ./pangulu*.o
11 | 	- ranlib $@
12 | 
13 | oclean: libs
14 | 	rm -f pangulu*.o
15 | 
16 | clean:
17 | 	rm -f libpangulu.so
18 | 	rm -f libpangulu.a
19 | 	


--------------------------------------------------------------------------------
/make.inc:
--------------------------------------------------------------------------------
 1 | COMPILE_LEVEL = -O3
 2 | CUDA_COMPILE_LEVEL =
 3 | 
 4 | #0201000,GPU_CUDA
 5 | CUDA_INC = -I/path/to/cuda/include
 6 | NVCC = nvcc $(CUDA_COMPILE_LEVEL) $(COMPILE_LEVEL)
 7 | NVCCFLAGS = $(PANGULU_FLAGS) -w -Xptxas -dlcm=cg -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_61,code=compute_61 $(CUDA_INC)
 8 | 
 9 | #general
10 | CC = gcc $(COMPILE_LEVEL)
11 | MPICC = mpicc $(COMPILE_LEVEL)
12 | OPENBLAS_INC = -I/path/to/openblas/include
13 | CFLAGS = $(OPENBLAS_INC) $(CUDA_INC) -fopenmp -lpthread -lm
14 | MPICCFLAGS = $(CFLAGS)
15 | REORDERING_INC = -I../reordering_omp/include
16 | #METIS_INC = -I/path/to/metis/include
17 | PANGULU_FLAGS = -DPANGULU_LOG_INFO -DCALCULATE_TYPE_R64 -DGPU_OPEN #-DMETIS #-DPANGULU_MC64 #-DPANGULU_PERF
18 | 


--------------------------------------------------------------------------------
/reordering_omp/Makefile:
--------------------------------------------------------------------------------
 1 | all: lib
 2 | 
 3 | lib: src
 4 | 	$(MAKE) -C lib
 5 | 
 6 | src: generate_mynd_types
 7 | 	$(MAKE) -C src
 8 | 
 9 | generate_mynd_types: clean
10 | 	echo "" > include/mynd_types.h
11 | 	echo "#define IDXTYPEWIDTH 32" >> include/mynd_types.h
12 | 	echo "#define REALTYPEWIDTH 64" >> include/mynd_types.h
13 | 
14 | clean:
15 | 	$(MAKE) -C lib clean
16 | 	- (rm include/mynd_types.h)
17 | 
18 | 


--------------------------------------------------------------------------------
/reordering_omp/include/mynd_omp.h:
--------------------------------------------------------------------------------
  1 | #ifndef MYND_OMP_H
  2 | #define MYND_OMP_H
  3 | 
  4 | #include "mynd_types.h"
  5 | #if defined(_MSC_VER)
  6 |   #define COMPILER_MSC
  7 | #endif
  8 | #if defined(__ICC)
  9 |   #define COMPILER_ICC
 10 | #endif
 11 | #if defined(__GNUC__)
 12 |   #define COMPILER_GCC
 13 | #endif
 14 | 
 15 | #ifndef _GKLIB_H_
 16 | #ifdef COMPILER_MSC
 17 | #include <limits.h>
 18 | 
 19 | typedef __int32 int32_t;
 20 | typedef __int64 int64_t;
 21 | #define PRId32       "I32d"
 22 | #define PRId64       "I64d"
 23 | #define SCNd32       "ld"
 24 | #define SCNd64       "I64d"
 25 | #define INT32_MIN    ((int32_t)_I32_MIN)
 26 | #define INT32_MAX    _I32_MAX
 27 | #define INT64_MIN    ((int64_t)_I64_MIN)
 28 | #define INT64_MAX    _I64_MAX
 29 | #else
 30 | #include <inttypes.h>
 31 | #endif
 32 | #endif
 33 | 
 34 | 
 35 | #if IDXTYPEWIDTH == 32
 36 |   typedef int32_t reordering_int_t;
 37 | 
 38 |   #define IDX_MAX   INT32_MAX
 39 |   #define IDX_MIN   INT32_MIN
 40 | 
 41 |   #define SCIDX  SCNd32
 42 |   #define PRIDX  PRId32
 43 | 
 44 |   #define strtoidx      strtol
 45 |   #define lyj_abs          abs
 46 | #elif IDXTYPEWIDTH == 64
 47 |   typedef int64_t reordering_int_t;
 48 | 
 49 |   #define IDX_MAX   INT64_MAX
 50 |   #define IDX_MIN   INT64_MIN
 51 | 
 52 |   #define SCIDX  SCNd64
 53 |   #define PRIDX  PRId64
 54 | 
 55 | #ifdef COMPILER_MSC
 56 |   #define strtoidx      _strtoi64
 57 | #else
 58 |   #define strtoidx      strtoll
 59 | #endif
 60 |   #define lyj_abs          labs
 61 | #else
 62 |   #error "Incorrect user-supplied value fo IDXTYPEWIDTH"
 63 | #endif
 64 | 
 65 | 
 66 | #if REALTYPEWIDTH == 32
 67 |   typedef float reordering_real_t;
 68 | 
 69 |   #define SCREAL         "f"
 70 |   #define PRREAL         "f"
 71 |   #define REAL_MAX       FLT_MAX
 72 |   #define REAL_MIN       FLT_MIN
 73 |   #define REAL_EPSILON   FLT_EPSILON
 74 | 
 75 |   #define rabs          fabsf
 76 |   #define REALEQ(x,y) ((rabs((x)-(y)) <= FLT_EPSILON))
 77 | 
 78 | #ifdef COMPILER_MSC
 79 |   #define strtoreal     (float)strtod
 80 | #else
 81 |   #define strtoreal     strtof
 82 | #endif
 83 | #elif REALTYPEWIDTH == 64
 84 |   typedef double reordering_real_t;
 85 | 
 86 |   #define SCREAL         "lf"
 87 |   #define PRREAL         "lf"
 88 |   #define REAL_MAX       DBL_MAX
 89 |   #define REAL_MIN       DBL_MIN
 90 |   #define REAL_EPSILON   DBL_EPSILON
 91 | 
 92 |   #define rabs          fabs
 93 |   #define REALEQ(x,y) ((rabs((x)-(y)) <= DBL_EPSILON))
 94 | 
 95 |   #define strtoreal     strtod
 96 | #else
 97 |   #error "Incorrect user-supplied value for REALTYPEWIDTH"
 98 | #endif
 99 | 
100 | #include <stdio.h>
101 | #include <stdbool.h>
102 | #include <string.h>
103 | #include <stdlib.h>
104 | #include <sys/time.h>
105 | 
106 | #ifdef __cplusplus
107 | extern "C"
108 | {
109 | #endif
110 | 
111 |     void mynd_ReorderGraph(reordering_int_t *nvtxs, reordering_int_t *nedges, reordering_int_t *xadj, reordering_int_t *vwgt, reordering_int_t *adjncy, reordering_int_t *adjwgt, 
112 |         reordering_int_t *treflect, reordering_int_t *reflect, reordering_int_t *compress, reordering_int_t *tcontrol, reordering_int_t *is_memery_manage_before, reordering_int_t nthreads);
113 |     void mynd_ReadGraph(char *filename, reordering_int_t *nvtxs, reordering_int_t *nedges, reordering_int_t **txadj, reordering_int_t **tvwgt, reordering_int_t **tadjncy, reordering_int_t **tadjwgt);
114 | 
115 | #ifdef __cplusplus
116 | }
117 | #endif
118 | 
119 | #endif
120 | 


--------------------------------------------------------------------------------
/reordering_omp/lib/makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | STATIC_LIB = libmynd.a
 3 | SHARED_LIB = libmynd.so
 4 | OBJECTS = $(wildcard *.o)
 5 | 
 6 | all: $(STATIC_LIB) $(SHARED_LIB)
 7 | 	$(RM) $(OBJECTS) 
 8 | 
 9 | $(SHARED_LIB): $(OBJECTS)
10 | 	$(CC) -shared $(OBJECTS) -o $@
11 | 
12 | $(STATIC_LIB): $(OBJECTS)
13 | 	ar rcs $@ $(OBJECTS)
14 | 
15 | clean:
16 | 	rm -f $(SHARED_LIB) $(STATIC_LIB) *.o
17 | 


--------------------------------------------------------------------------------
/reordering_omp/src/control.h:
--------------------------------------------------------------------------------
 1 | #ifndef CONTROL_H
 2 | #define CONTROL_H
 3 | 
 4 | #include "typedef.h"
 5 | reordering_int_t control;
 6 | 
 7 | #define ALL_Time                    7
 8 | #define PRINTTIMEGENERAL            7
 9 | 
10 | #define NESTEDBISECTION_Time        3
11 | #define BISECTIONBEST_Time          3
12 | #define COARSEN_Time                3
13 | #define REORDERBISECTION_Time       3
14 | #define REFINE2WAYNODE_Time         3
15 | #define SPLITGRAPHREORDER_Time      3
16 | #define PRINTTIMEPHASES             3
17 | 
18 | #define MATCH_Time                  1
19 | #define CREATCOARSENGRAPH_Time      1
20 | #define PARTITIOBINF2WAY            1 
21 | #define FM2WAYCUTBALANCE_Time       1
22 | #define FM2WAYCUTREFINE_Time        1
23 | #define REORDERINF2WAY_Time         1
24 | #define FMNODEBALANCE_Time          1
25 | #define FM1SIDENODEREFINE_Time      1
26 | #define FM2SIDENODEREFINE_Time      1
27 | #define PRINTTIMESTEPS              1
28 | 
29 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/define.h:
--------------------------------------------------------------------------------
  1 | #ifndef DEFINE_H
  2 | #define DEFINE_H
  3 | 
  4 | #include <stdint.h>
  5 | #include "struct.h"
  6 | #include "mkqsort.h"
  7 | 
  8 | #define IDX_MAX   INT64_MAX
  9 | #define IDX_MIN   INT64_MIN
 10 | 
 11 | #define SCIDX  SCNd64
 12 | #define PRIDX  PRId64
 13 | 
 14 | #define NUM_OPTIONS     40
 15 | #define PTYPE_RB        0
 16 | #define PTYPE_KWAY      1
 17 | 
 18 | #define OBJTYPE_CUT     0
 19 | #define OBJTYPE_VOL     1
 20 | #define OBJTYPE_NODE    2
 21 | 
 22 | #define CTYPE_RM        0
 23 | #define CTYPE_SHEM      1
 24 | 
 25 | #define OP_PMETIS       0
 26 | #define OP_KMETIS       1
 27 | #define OP_OMETIS       2
 28 | 
 29 | #define CTYPE_RM        0
 30 | #define CTYPE_SHEM      1
 31 | 
 32 | #define IPTYPE_GROW     0
 33 | #define IPTYPE_RANDOM   1
 34 | #define IPTYPE_EDGE     2
 35 | #define IPTYPE_NODE     3
 36 | #define IPTYPE_METISRB  4
 37 | 
 38 | 
 39 | #define RTYPE_FM        0
 40 | #define RTYPE_GREEDY    1
 41 | #define RTYPE_SEP2SIDED 2
 42 | #define RTYPE_SEP1SIDED 3
 43 | 
 44 | #define PMETIS_DEFAULT_UFACTOR          1
 45 | #define MCPMETIS_DEFAULT_UFACTOR        10
 46 | #define PartGraphKway_DEFAULT_UFACTOR   30
 47 | #define OMETIS_DEFAULT_UFACTOR          200
 48 | 
 49 | #define OPTION_PTYPE        0
 50 | #define OPTION_OBJTYPE      1
 51 | #define OPTION_CTYPE        2
 52 | #define OPTION_IPTYPE       3
 53 | #define OPTION_RTYPE        4
 54 | #define OPTION_DBGLVL       5
 55 | #define OPTION_NITER        6
 56 | #define OPTION_NCUTS        7
 57 | #define OPTION_SEED         8
 58 | #define OPTION_NO2HOP       9
 59 | #define OPTION_MINCONN      10
 60 | #define OPTION_CONTIG       11
 61 | #define OPTION_COMPRESS     12
 62 | #define OPTION_CCORDER      13
 63 | #define OPTION_PFACTOR      14
 64 | #define OPTION_NSEPS        15
 65 | #define OPTION_UFACTOR      16
 66 | #define OPTION_NUMBERING    17
 67 | 
 68 | #define OPTION_HELP         18
 69 | #define OPTION_TPWGTS       19
 70 | #define OPTION_NCOMMON      20
 71 | #define OPTION_NOOUTPUT     21
 72 | #define OPTION_BALANCE      22
 73 | #define OPTION_GTYPE        23
 74 | #define OPTION_UBVEC        24
 75 | 
 76 | #define GETOPTION(options, idx, defval) ((options) == NULL || (options)[idx] == -1 ? defval : (options)[idx]) 
 77 | 
 78 | #define COMPUTE_UBFACTORS(ufactor) (1.0 + 0.001 * (ufactor))
 79 | 
 80 | #define lyj_max(a, b) ((a) >= (b) ? (a) : (b))
 81 | #define lyj_min(a, b) ((a) <= (b) ? (a) : (b))
 82 | #define lyj_swap(a, b, z) (z = a, a = b, b = z)
 83 | #define m_gt_n(m,n) ((m)>(n))
 84 | 
 85 | #define MAKECSR(i, n, a) \
 86 |    do { \
 87 |      for (i=1; i<n; i++) a[i] += a[i-1]; \
 88 |      for (i=n; i>0; i--) a[i] = a[i-1]; \
 89 |      a[0] = 0; \
 90 |    } while(0) 
 91 | 
 92 | #define SHIFTCSR(i, n, a) \
 93 |    do { \
 94 |      for (i=n; i>0; i--) a[i] = a[i-1]; \
 95 |      a[0] = 0; \
 96 |    } while(0) 
 97 | 
 98 | void ikvsorti(size_t n, ikv_t *base)
 99 | {
100 | #define ikey_lt(a, b) ((a)->key < (b)->key)
101 |     GK_MKQSORT(ikv_t, base, n, ikey_lt);
102 | #undef ikey_lt
103 | }
104 | 
105 | #define CONTROL_COMMAND(a, flag, cmd) if ((a)&(flag)) (cmd);
106 | 
107 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/makefile:
--------------------------------------------------------------------------------
 1 | CC = gcc
 2 | CFLAGS = -c -Wall -fPIC -fopenmp -O3
 3 | 
 4 | SOURCES = $(wildcard *.c)
 5 | OBJECTS = $(SOURCES:.c=.o)
 6 | 
 7 | all: $(OBJECTS)
 8 | 
 9 | %.o: %.c
10 | 	$(CC) $(CFLAGS) $< -o $@
11 | 	mv $@ ../lib
12 | 
13 | clean:
14 | 	rm -f $(OBJECTS)


--------------------------------------------------------------------------------
/reordering_omp/src/mkqsort.h:
--------------------------------------------------------------------------------
  1 | #ifndef MKQSORT_H
  2 | #define MKQSORT_H
  3 | 
  4 | /* Swap two items pointed to by A and B using temporary buffer t. */
  5 | #define _GKQSORT_SWAP(a, b, t) ((void)((t = *a), (*a = *b), (*b = t)))
  6 | 
  7 | /* Discontinue quicksort algorithm when partition gets below this size.
  8 |    This particular magic number was chosen to work best on a Sun 4/260. */
  9 | #define _GKQSORT_MAX_THRESH 4
 10 | 
 11 | /* The next 4 #defines implement a very fast in-line stack abstraction. */
 12 | #define _GKQSORT_STACK_SIZE	    (8 * sizeof(size_t))
 13 | #define _GKQSORT_PUSH(top, low, high) (((top->_lo = (low)), (top->_hi = (high)), ++top))
 14 | #define	_GKQSORT_POP(low, high, top)  ((--top, (low = top->_lo), (high = top->_hi)))
 15 | #define	_GKQSORT_STACK_NOT_EMPTY	    (_stack < _top)
 16 | 
 17 | 
 18 | /* The main code starts here... */
 19 | #define GK_MKQSORT(GKQSORT_TYPE,GKQSORT_BASE,GKQSORT_NELT,GKQSORT_LT)   \
 20 | {									\
 21 |   GKQSORT_TYPE *const _base = (GKQSORT_BASE);				\
 22 |   const size_t _elems = (GKQSORT_NELT);					\
 23 |   GKQSORT_TYPE _hold;							\
 24 | 									\
 25 |   if (_elems == 0)                                                      \
 26 |     return;                                                             \
 27 |                                                                         \
 28 |   /* Don't declare two variables of type GKQSORT_TYPE in a single	\
 29 |    * statement: eg `TYPE a, b;', in case if TYPE is a pointer,		\
 30 |    * expands to `type* a, b;' wich isn't what we want.			\
 31 |    */									\
 32 | 									\
 33 |   if (_elems > _GKQSORT_MAX_THRESH) {					\
 34 |     GKQSORT_TYPE *_lo = _base;						\
 35 |     GKQSORT_TYPE *_hi = _lo + _elems - 1;				\
 36 |     struct {								\
 37 |       GKQSORT_TYPE *_hi; GKQSORT_TYPE *_lo;				\
 38 |     } _stack[_GKQSORT_STACK_SIZE], *_top = _stack + 1;			\
 39 | 									\
 40 |     while (_GKQSORT_STACK_NOT_EMPTY) {					\
 41 |       GKQSORT_TYPE *_left_ptr; GKQSORT_TYPE *_right_ptr;		\
 42 | 									\
 43 |       /* Select median value from among LO, MID, and HI. Rearrange	\
 44 |          LO and HI so the three values are sorted. This lowers the	\
 45 |          probability of picking a pathological pivot value and		\
 46 |          skips a comparison for both the LEFT_PTR and RIGHT_PTR in	\
 47 |          the while loops. */						\
 48 | 									\
 49 |       GKQSORT_TYPE *_mid = _lo + ((_hi - _lo) >> 1);			\
 50 | 									\
 51 |       if (GKQSORT_LT (_mid, _lo))					\
 52 |         _GKQSORT_SWAP (_mid, _lo, _hold);				\
 53 |       if (GKQSORT_LT (_hi, _mid))					\
 54 |         _GKQSORT_SWAP (_mid, _hi, _hold);				\
 55 |       else								\
 56 |         goto _jump_over;						\
 57 |       if (GKQSORT_LT (_mid, _lo))					\
 58 |         _GKQSORT_SWAP (_mid, _lo, _hold);				\
 59 |   _jump_over:;								\
 60 | 									\
 61 |       _left_ptr  = _lo + 1;						\
 62 |       _right_ptr = _hi - 1;						\
 63 | 									\
 64 |       /* Here's the famous ``collapse the walls'' section of quicksort.	\
 65 |          Gotta like those tight inner loops!  They are the main reason	\
 66 |          that this algorithm runs much faster than others. */		\
 67 |       do {								\
 68 |         while (GKQSORT_LT (_left_ptr, _mid))				\
 69 |          ++_left_ptr;							\
 70 | 									\
 71 |         while (GKQSORT_LT (_mid, _right_ptr))				\
 72 |           --_right_ptr;							\
 73 | 									\
 74 |         if (_left_ptr < _right_ptr) {					\
 75 |           _GKQSORT_SWAP (_left_ptr, _right_ptr, _hold);			\
 76 |           if (_mid == _left_ptr)					\
 77 |             _mid = _right_ptr;						\
 78 |           else if (_mid == _right_ptr)					\
 79 |             _mid = _left_ptr;						\
 80 |           ++_left_ptr;							\
 81 |           --_right_ptr;							\
 82 |         }								\
 83 |         else if (_left_ptr == _right_ptr) {				\
 84 |           ++_left_ptr;							\
 85 |           --_right_ptr;							\
 86 |           break;							\
 87 |         }								\
 88 |       } while (_left_ptr <= _right_ptr);				\
 89 | 									\
 90 |      /* Set up pointers for next iteration.  First determine whether	\
 91 |         left and right partitions are below the threshold size.  If so,	\
 92 |         ignore one or both.  Otherwise, push the larger partition's	\
 93 |         bounds on the stack and continue sorting the smaller one. */	\
 94 | 									\
 95 |       if (_right_ptr - _lo <= _GKQSORT_MAX_THRESH) {			\
 96 |         if (_hi - _left_ptr <= _GKQSORT_MAX_THRESH)			\
 97 |           /* Ignore both small partitions. */				\
 98 |           _GKQSORT_POP (_lo, _hi, _top);				\
 99 |         else								\
100 |           /* Ignore small left partition. */				\
101 |           _lo = _left_ptr;						\
102 |       }									\
103 |       else if (_hi - _left_ptr <= _GKQSORT_MAX_THRESH)			\
104 |         /* Ignore small right partition. */				\
105 |         _hi = _right_ptr;						\
106 |       else if (_right_ptr - _lo > _hi - _left_ptr) {			\
107 |         /* Push larger left partition indices. */			\
108 |         _GKQSORT_PUSH (_top, _lo, _right_ptr);				\
109 |         _lo = _left_ptr;						\
110 |       }									\
111 |       else {								\
112 |         /* Push larger right partition indices. */			\
113 |         _GKQSORT_PUSH (_top, _left_ptr, _hi);				\
114 |         _hi = _right_ptr;						\
115 |       }									\
116 |     }									\
117 |   }									\
118 | 									\
119 |   /* Once the BASE array is partially sorted by quicksort the rest	\
120 |      is completely sorted using insertion sort, since this is efficient	\
121 |      for partitions below MAX_THRESH size. BASE points to the		\
122 |      beginning of the array to sort, and END_PTR points at the very	\
123 |      last element in the array (*not* one beyond it!). */		\
124 | 									\
125 |   {									\
126 |     GKQSORT_TYPE *const _end_ptr = _base + _elems - 1;			\
127 |     GKQSORT_TYPE *_tmp_ptr = _base;					\
128 |     register GKQSORT_TYPE *_run_ptr;					\
129 |     GKQSORT_TYPE *_thresh;						\
130 | 									\
131 |     _thresh = _base + _GKQSORT_MAX_THRESH;				\
132 |     if (_thresh > _end_ptr)						\
133 |       _thresh = _end_ptr;						\
134 | 									\
135 |     /* Find smallest element in first threshold and place it at the	\
136 |        array's beginning.  This is the smallest array element,		\
137 |        and the operation speeds up insertion sort's inner loop. */	\
138 | 									\
139 |     for (_run_ptr = _tmp_ptr + 1; _run_ptr <= _thresh; ++_run_ptr)	\
140 |       if (GKQSORT_LT (_run_ptr, _tmp_ptr))				\
141 |         _tmp_ptr = _run_ptr;						\
142 | 									\
143 |     if (_tmp_ptr != _base)						\
144 |       _GKQSORT_SWAP (_tmp_ptr, _base, _hold);				\
145 | 									\
146 |     /* Insertion sort, running from left-hand-side			\
147 |      * up to right-hand-side.  */					\
148 | 									\
149 |     _run_ptr = _base + 1;						\
150 |     while (++_run_ptr <= _end_ptr) {					\
151 |       _tmp_ptr = _run_ptr - 1;						\
152 |       while (GKQSORT_LT (_run_ptr, _tmp_ptr))				\
153 |         --_tmp_ptr;							\
154 | 									\
155 |       ++_tmp_ptr;							\
156 |       if (_tmp_ptr != _run_ptr) {					\
157 |         GKQSORT_TYPE *_trav = _run_ptr + 1;				\
158 |         while (--_trav >= _run_ptr) {					\
159 |           GKQSORT_TYPE *_hi; GKQSORT_TYPE *_lo;				\
160 |           _hold = *_trav;						\
161 | 									\
162 |           for (_hi = _lo = _trav; --_lo >= _tmp_ptr; _hi = _lo)		\
163 |             *_hi = *_lo;						\
164 |           *_hi = _hold;							\
165 |         }								\
166 |       }									\
167 |     }									\
168 |   }									\
169 | 									\
170 | }
171 | 
172 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_coarsen.c:
--------------------------------------------------------------------------------
  1 | #ifndef COARSEN_H
  2 | #define COARSEN_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | graph_t *mynd_CoarsenGraph(graph_t *graph, reordering_int_t Coarsen_Threshold)
  7 | {
  8 | 	reordering_int_t i, eqewgts, level, maxvwgt, cnvtxs;
  9 | 
 10 | 	/* determine if the weights on the edges are all the same */
 11 | 	for (eqewgts = 1, i = 1; i < graph->nedges; i++) 
 12 | 	{
 13 | 		if (graph->adjwgt[0] != graph->adjwgt[i]) 
 14 | 		{
 15 | 			eqewgts = 0;
 16 | 			break;
 17 | 		}
 18 | 	}
 19 | 
 20 | 	/* set the maximum allowed coarsest vertex weight */
 21 | 	for (i = 0; i < 1; i++)
 22 | 		maxvwgt = 1.5 * graph->tvwgt[i] / Coarsen_Threshold;
 23 | 
 24 | 	level = 0;
 25 | 
 26 | 	do 
 27 | 	{
 28 | 		// printf("level=%"PRIDX" graph->nvtxs=%"PRIDX"\n",level,graph->nvtxs);
 29 | 		/* allocate memory for cmap, if it has not already been done due to
 30 | 			multiple cuts */
 31 | 		if (graph->match == NULL)
 32 | 			graph->match = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "CoarsenGraph: graph->match");
 33 | 		if (graph->cmap == NULL)
 34 | 			graph->cmap = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "CoarsenGraph: graph->cmap");
 35 | 		if (graph->where == NULL)
 36 | 			graph->where  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "CoarsenGraph: graph->where");
 37 | 		/* determine which matching scheme you will use */
 38 | 
 39 | 		// printf("CoarsenGraph 0\n");
 40 | 		CONTROL_COMMAND(control, MATCH_Time, mynd_gettimebegin(&start_match, &end_match, &time_match));
 41 | 		if (eqewgts || graph->nedges == 0)
 42 | 			cnvtxs = mynd_Match_RM(graph, maxvwgt);
 43 | 		else
 44 | 			cnvtxs = mynd_Match_SHEM(graph, maxvwgt);
 45 | 		CONTROL_COMMAND(control, MATCH_Time, mynd_gettimeend(&start_match, &end_match, &time_match));
 46 | 		// printf("CoarsenGraph 1\n");
 47 | 		// printf("cnvtxs=%"PRIDX"\n",cnvtxs);
 48 | 		// exam_num(graph->match,graph->nvtxs);
 49 | 
 50 | 		CONTROL_COMMAND(control, CREATCOARSENGRAPH_Time, mynd_gettimebegin(&start_createcoarsengraph, &end_createcoarsengraph, &time_createcoarsengraph));
 51 | 		// CreateCoarseGraph(graph, cnvtxs);
 52 | 		// CreateCoarseGraph_S(graph, cnvtxs);
 53 | 		// CreateCoarseGraph_BST(graph, cnvtxs);
 54 | 		// CreateCoarseGraph_BST_2(graph, cnvtxs);
 55 | 		// mynd_CreateCoarseGraph_HT(graph, cnvtxs);
 56 | 		mynd_CreateCoarseGraph_HT_2(graph, cnvtxs);
 57 | 		CONTROL_COMMAND(control, CREATCOARSENGRAPH_Time, mynd_gettimeend(&start_createcoarsengraph, &end_createcoarsengraph, &time_createcoarsengraph));
 58 | 		// printf("CoarsenGraph 2\n");
 59 | 		// printf("CreateCoarseGraph=%10.3"PRREAL"\n",time_createcoarsengraph);
 60 | 
 61 | 		graph = graph->coarser;
 62 | 		eqewgts = 0;
 63 | 		level++;
 64 | 
 65 | 		//  sort for adjncy adjwgt
 66 | 		// for(reordering_int_t z = 0;z < graph->nvtxs;z++)
 67 | 		// {
 68 | 		// 	for(reordering_int_t y = graph->xadj[z];y < graph->xadj[z + 1];y++)
 69 | 		// 	{
 70 | 		// 		reordering_int_t t = y;
 71 | 		// 		for(reordering_int_t x = y + 1;x < graph->xadj[z + 1];x++)
 72 | 		// 			if(graph->adjncy[x] < graph->adjncy[t]) t = x;
 73 | 		// 		reordering_int_t temp;
 74 | 		// 		temp = graph->adjncy[t],graph->adjncy[t] = graph->adjncy[y], graph->adjncy[y] = temp;
 75 | 		// 		temp = graph->adjwgt[t],graph->adjwgt[t] = graph->adjwgt[y], graph->adjwgt[y] = temp;
 76 | 		// 	}
 77 | 		// }
 78 | 
 79 | 		// exam_nvtxs_nedges(graph);
 80 |         // exam_xadj(graph);
 81 |         // exam_vwgt(graph);
 82 |         // exam_adjncy_adjwgt(graph);
 83 | 
 84 | 	} while (graph->nvtxs > Coarsen_Threshold && 
 85 | 			graph->nvtxs < 0.85 * graph->finer->nvtxs && 
 86 | 			graph->nedges > graph->nvtxs / 2);
 87 | 
 88 | 	return graph;
 89 | }
 90 | 
 91 | /*************************************************************************/
 92 | /*! This function takes a graph and creates a sequence of nlevels coarser 
 93 |     graphs, where nlevels is an input parameter.
 94 |  */
 95 | /*************************************************************************/
 96 | graph_t *mynd_CoarsenGraphNlevels_metis(graph_t *graph, reordering_int_t Coarsen_Threshold, reordering_int_t nlevels)
 97 | {
 98 | 	reordering_int_t i, eqewgts, level, maxvwgt, cnvtxs;
 99 | 
100 | 	/* determine if the weights on the edges are all the same */
101 | 	for (eqewgts = 1, i = 1; i < graph->nedges; i++) 
102 | 	{
103 | 		if (graph->adjwgt[0] != graph->adjwgt[i]) 
104 | 		{
105 | 			eqewgts = 0;
106 | 			break;
107 | 		}
108 | 	}
109 | 
110 | 	/* set the maximum allowed coarsest vertex weight */
111 | 	for (i = 0; i < 1; i++)
112 | 		maxvwgt = 1.5 * graph->tvwgt[i] / Coarsen_Threshold;
113 | 
114 | 	for (level = 0; level < nlevels; level++) 
115 | 	{
116 | 		// printf("level=%"PRIDX" graph->nvtxs=%"PRIDX"\n",level,graph->nvtxs);
117 | 		/* allocate memory for cmap, if it has not already been done due to
118 | 			multiple cuts */
119 | 		if (graph->match == NULL)
120 | 			graph->match = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "CoarsenGraphNlevels_metis: graph->match");
121 | 		if (graph->cmap == NULL)
122 | 			graph->cmap = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "CoarsenGraphNlevels_metis: graph->cmap");
123 | 		if (graph->where == NULL)
124 | 			graph->where  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "CoarsenGraphNlevels_metis: graph->where");
125 | 		
126 | 		// printf("CoarsenGraphNlevels 0\n");
127 |     	/* determine which matching scheme you will use */
128 | 		CONTROL_COMMAND(control, MATCH_Time, mynd_gettimebegin(&start_match, &end_match, &time_match));
129 |         if (eqewgts || graph->nedges == 0)
130 | 			cnvtxs = mynd_Match_RM(graph, maxvwgt);
131 |         else
132 | 			cnvtxs = mynd_Match_SHEM(graph, maxvwgt);
133 | 		CONTROL_COMMAND(control, MATCH_Time, mynd_gettimeend(&start_match, &end_match, &time_match));
134 | 		// printf("CoarsenGraphNlevels 1\n");
135 | 		// printf("cnvtxs=%"PRIDX"\n",cnvtxs);
136 | 
137 | 		CONTROL_COMMAND(control, CREATCOARSENGRAPH_Time, mynd_gettimebegin(&start_createcoarsengraph, &end_createcoarsengraph, &time_createcoarsengraph));
138 | 		// CreateCoarseGraph(graph, cnvtxs);
139 | 		// CreateCoarseGraph_S(graph, cnvtxs);
140 | 		// CreateCoarseGraph_BST(graph, cnvtxs);
141 | 		// CreateCoarseGraph_BST_2(graph, cnvtxs);
142 | 		// mynd_CreateCoarseGraph_HT(graph, cnvtxs);
143 | 		mynd_CreateCoarseGraph_HT_2(graph, cnvtxs);
144 | 		// mynd_CreateCoarseGraph_HT_2_time(graph, cnvtxs);
145 | 		CONTROL_COMMAND(control, CREATCOARSENGRAPH_Time, mynd_gettimeend(&start_createcoarsengraph, &end_createcoarsengraph, &time_createcoarsengraph));
146 | 
147 | 		// printf("CoarsenGraphNlevels 2\n");
148 | 		// printf("CreateCoarseGraph=%10.3"PRREAL"\n",time_createcoarsengraph);
149 | 
150 | 		graph = graph->coarser;
151 | 		eqewgts = 0;
152 | 		// if(level == 0)
153 | 		// 	printf("level=0\n");
154 | 
155 | 		//  sort for adjncy adjwgt
156 | 		// for(reordering_int_t z = 0;z < graph->nvtxs;z++)
157 | 		// {
158 | 		// 	for(reordering_int_t y = graph->xadj[z];y < graph->xadj[z + 1];y++)
159 | 		// 	{
160 | 		// 		reordering_int_t t = y;
161 | 		// 		for(reordering_int_t x = y + 1;x < graph->xadj[z + 1];x++)
162 | 		// 			if(graph->adjncy[x] < graph->adjncy[t]) t = x;
163 | 		// 		reordering_int_t temp;
164 | 		// 		temp = graph->adjncy[t],graph->adjncy[t] = graph->adjncy[y], graph->adjncy[y] = temp;
165 | 		// 		temp = graph->adjwgt[t],graph->adjwgt[t] = graph->adjwgt[y], graph->adjwgt[y] = temp;
166 | 		// 	}
167 | 		// }
168 | 
169 | 		if (graph->nvtxs < Coarsen_Threshold || 
170 | 			graph->nvtxs > 0.85 * graph->finer->nvtxs || 
171 | 			graph->nedges < graph->nvtxs / 2)
172 | 		break; 
173 | 	}
174 | 
175 | 	return graph;
176 | }
177 | 
178 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_common.c:
--------------------------------------------------------------------------------
  1 | #ifndef COMMON_H
  2 | #define COMMON_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | /*************************************************************************
  7 | * This function returns the log2(x)
  8 | **************************************************************************/
  9 | reordering_int_t lyj_log2(reordering_int_t a) 
 10 | {
 11 |     if (a <= 0) 
 12 | 	{
 13 |         fprintf(stderr, "lyj_log2: Input must be greater than 0.\n");
 14 |         return -1;
 15 |     }
 16 | 
 17 |     reordering_int_t i = 0;
 18 |     while (a > 1) 
 19 | 	{
 20 |         a >>= 1;
 21 |         i++;
 22 |     }
 23 | 
 24 |     return i;
 25 | }
 26 | 
 27 | void mynd_set_value_int(reordering_int_t n, reordering_int_t val, reordering_int_t *src)
 28 | {
 29 | 	reordering_int_t i;
 30 | 	for (i = 0; i < n; i++)
 31 |     	src[i] = val;
 32 | }
 33 | 
 34 | void mynd_set_value_double(reordering_int_t n, reordering_real_t val, reordering_real_t *src)
 35 | {
 36 | 	reordering_int_t i;
 37 | 	for (i = 0; i < n; i++)
 38 |     	src[i] = val;
 39 | }
 40 | 
 41 | void mynd_copy_double(reordering_int_t n, reordering_real_t *src, reordering_real_t *dst)
 42 | {
 43 | 	for (reordering_int_t i = 0; i < n; i++)
 44 |     	dst[i] = src[i];
 45 | }
 46 | 
 47 | void mynd_copy_int(reordering_int_t n, reordering_int_t *src, reordering_int_t *dst)
 48 | {
 49 | 	for (reordering_int_t i = 0; i < n; i++)
 50 |     	dst[i] = src[i];
 51 | }
 52 | 
 53 | reordering_int_t mynd_sum_int(reordering_int_t n, reordering_int_t *src, reordering_int_t ncon)
 54 | {
 55 | 	reordering_int_t sum = 0;
 56 | 	for(reordering_int_t i = 0;i < n;i++)
 57 | 		sum += src[i];
 58 | 	return sum;
 59 | }
 60 | 
 61 | void mynd_select_sort(reordering_int_t *num, reordering_int_t length)
 62 | {
 63 | 	for(reordering_int_t i = 0;i < length;i++)
 64 | 	{
 65 | 		reordering_int_t t = i;
 66 | 		for(reordering_int_t j = i + 1;j < length;j++)
 67 | 			if(num[j] < num[t]) t = j;
 68 | 		reordering_int_t z;
 69 | 		lyj_swap(num[t], num[i],z);
 70 | 		// printf("i=%d t=%d num: ",i,t);
 71 | 		// for(reordering_int_t j = 0;j < length;j++)
 72 | 		// 	printf("%d ",num[j]);
 73 | 		// printf("\n");
 74 | 	}
 75 | }
 76 | 
 77 | void mynd_select_sort_val(reordering_int_t *num, reordering_int_t length)
 78 | {
 79 | 	for(reordering_int_t i = 0;i < length;i++)
 80 | 	{
 81 | 		reordering_int_t t = i;
 82 | 		for(reordering_int_t j = i + 1;j < length;j++)
 83 | 			if(num[j] < num[t]) t = j;
 84 | 		reordering_int_t z;
 85 | 		lyj_swap(num[t], num[i], z);
 86 | 		lyj_swap(num[t + length], num[i + length],z);
 87 | 	}
 88 | }
 89 | //	USE_GKRAND ???
 90 | void mynd_gk_randinit(uint64_t seed)
 91 | {
 92 | #ifdef USE_GKRAND
 93 |   mt[0] = seed;
 94 |   for (mti=1; mti<NN; mti++) 
 95 |     mt[mti] = (6364136223846793005ULL * (mt[mti-1] ^ (mt[mti-1] >> 62)) + mti);
 96 | #else
 97 |   srand((unsigned int) seed);
 98 | #endif
 99 | }
100 | 
101 | /*************************************************************************/
102 | /*! Initializes the generator */ 
103 | /**************************************************************************/
104 | void mynd_isrand(reordering_int_t seed)
105 | {
106 |   mynd_gk_randinit((uint64_t) seed);
107 | }
108 | 
109 | /*************************************************************************/
110 | /*! This function initializes the random number generator 
111 |   */
112 | /*************************************************************************/
113 | void mynd_InitRandom(reordering_int_t seed)
114 | {
115 | 	mynd_isrand((seed == -1 ? 4321 : seed));
116 | }
117 | 
118 | /* generates a random number on [0, 2^64-1]-interval */
119 | uint64_t mynd_gk_randint64(void)
120 | {
121 | #ifdef USE_GKRAND
122 | #else
123 |   return (uint64_t)(((uint64_t) rand()) << 32 | ((uint64_t) rand()));
124 | #endif
125 | }
126 | 
127 | /* generates a random number on [0, 2^32-1]-interval */
128 | uint32_t mynd_gk_randint32(void)
129 | {
130 | #ifdef USE_GKRAND
131 | #else
132 |   return (uint32_t)rand();
133 | #endif
134 | }
135 | 
136 | /*************************************************************************/
137 | /*! Returns a random number */ 
138 | /**************************************************************************/
139 | reordering_int_t mynd_irand()
140 | {
141 |   if (sizeof(reordering_int_t) <= sizeof(int32_t)) 
142 |     return (reordering_int_t)mynd_gk_randint32();
143 |   else 
144 |     return (reordering_int_t)mynd_gk_randint64(); 
145 | }
146 | 
147 | reordering_int_t mynd_rand_count()
148 | {
149 | 	static int ccnt = 0;  
150 | 	ccnt++;   
151 | 	return ccnt;
152 | }
153 | 
154 | /*************************************************************************/
155 | /*! Returns a random number between [0, max) */ 
156 | /**************************************************************************/
157 | reordering_int_t mynd_irandInRange(reordering_int_t max) 
158 | {
159 | 	// reordering_int_t t = rand_count(); 
160 | 	// if(t % 10000 == 0)  printf("ccnt=%d\n",t);
161 | 	return (reordering_int_t)((mynd_irand())%max); 
162 | }
163 | 
164 | /*************************************************************************/
165 | /*! Randomly permutes the elements of an array p[]. 
166 |     flag == 1, p[i] = i prior to permutation, 
167 |     flag == 0, p[] is not initialized. */
168 | /**************************************************************************/
169 | void mynd_irandArrayPermute(reordering_int_t n, reordering_int_t *p, reordering_int_t nshuffles, reordering_int_t flag)
170 | {
171 | 	reordering_int_t i, u, v;
172 | 	reordering_int_t tmp;
173 | 
174 | 	if (flag == 1) 
175 | 	{
176 | 		for (i = 0; i < n; i++)
177 | 			p[i] = (reordering_int_t)i;
178 | 	}
179 | 
180 | 	if (n < 10) 
181 | 	{
182 | 		for (i = 0; i < n; i++) 
183 | 		{
184 | 			v = mynd_irandInRange(n);
185 | 			u = mynd_irandInRange(n);
186 | 			lyj_swap(p[v], p[u], tmp);
187 | 		}
188 | 	}
189 | 	else 
190 | 	{
191 | 		for (i = 0; i < nshuffles; i++) 
192 | 		{
193 | 			v = mynd_irandInRange(n - 3);
194 | 			u = mynd_irandInRange(n - 3);
195 | 			lyj_swap(p[v + 0], p[u + 2], tmp);
196 | 			lyj_swap(p[v + 1], p[u + 3], tmp);
197 | 			lyj_swap(p[v + 2], p[u + 0], tmp);
198 | 			lyj_swap(p[v + 3], p[u + 1], tmp);
199 | 		}
200 | 	}
201 | }
202 | 
203 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_compressgraph.c:
--------------------------------------------------------------------------------
  1 | #ifndef COMPRESSGRAPH_H
  2 | #define COMPRESSGRAPH_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | graph_t *mynd_Compress_Graph(reordering_int_t nvtxs, reordering_int_t *xadj, reordering_int_t *adjncy, reordering_int_t *vwgt, reordering_int_t *cptr, reordering_int_t *cind)
  7 | {
  8 |     reordering_int_t i, ii, iii, j, jj, k, l, cnvtxs, cnedges;
  9 |     reordering_int_t *cxadj, *cvwgt, *cadjncy, *cadjwgt, *mark, *map;
 10 |     ikv_t *keys;
 11 |     graph_t *graph = NULL;
 12 | 
 13 |     // printf("Compress_Graph 0\n");
 14 | 
 15 |     mark = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "Compress_Graph: mark");
 16 |     map  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "Compress_Graph: map");
 17 |     keys = (ikv_t *)mynd_check_malloc(sizeof(ikv_t) * nvtxs, "Compress_Graph: keys");
 18 |     mynd_set_value_int(nvtxs, -1, mark);
 19 |     mynd_set_value_int(nvtxs, -1, map);
 20 | 
 21 |     /* Compute a key for each adjacency list */
 22 |     for (i = 0; i < nvtxs; i++) 
 23 |     {
 24 |         k = 0;
 25 |         for (j = xadj[i]; j < xadj[i + 1]; j++)
 26 |             k += adjncy[j];
 27 |         keys[i].key = k + i; /* Add the diagonal entry as well */
 28 |         keys[i].val = i;
 29 |     }
 30 | 
 31 |     mynd_ikvsorti(nvtxs, keys);
 32 | 
 33 |     // printf("Compress_Graph 1\n");
 34 | 
 35 |     l = cptr[0] = 0;
 36 |     for (cnvtxs = i = 0; i < nvtxs; i++) 
 37 |     {
 38 |         ii = keys[i].val;
 39 |         if (map[ii] == -1) 
 40 |         {
 41 |             mark[ii] = i;  /* Add the diagonal entry */
 42 |             for (j = xadj[ii]; j < xadj[ii + 1]; j++) 
 43 |                 mark[adjncy[j]] = i;
 44 | 
 45 |             map[ii]   = cnvtxs;
 46 |             cind[l++] = ii;
 47 | 
 48 |             for (j = i + 1; j < nvtxs; j++) 
 49 |             {
 50 |                 iii = keys[j].val;
 51 | 
 52 |                 if (keys[i].key != keys[j].key || xadj[ii + 1] - xadj[ii] != xadj[iii + 1] - xadj[iii])
 53 |                     break; /* Break if keys or degrees are different */
 54 | 
 55 |                 if (map[iii] == -1) 
 56 |                 { /* Do a comparison if iii has not been mapped */ 
 57 |                     for (jj = xadj[iii]; jj < xadj[iii + 1]; jj++) 
 58 |                     {
 59 |                         if (mark[adjncy[jj]] != i)
 60 |                             break;
 61 |                     }
 62 | 
 63 |                     if (jj == xadj[iii + 1]) 
 64 |                     { /* Identical adjacency structure */
 65 |                         map[iii]  = cnvtxs;
 66 |                         cind[l++] = iii;
 67 |                     }
 68 |                 }
 69 |             }
 70 | 
 71 |             cptr[++cnvtxs] = l;
 72 |         }
 73 |     }
 74 | 
 75 |     // printf("Compress_Graph 2\n");
 76 | 
 77 |     if (cnvtxs < 0.85 * nvtxs) 
 78 |     {
 79 |         // printf("compress !!!\n");
 80 |         graph = mynd_CreateGraph();
 81 | 
 82 |         cnedges = 0;
 83 |         for (i = 0; i < cnvtxs; i++) 
 84 |         {
 85 |             ii = cind[cptr[i]];
 86 |             cnedges += xadj[ii + 1] - xadj[ii];
 87 |         }
 88 | 
 89 |         // printf("compress 0 !!!\n");
 90 | 
 91 |         // cxadj   = graph->xadj   = imalloc(cnvtxs+1, "CompressGraph: xadj");
 92 |         // cvwgt   = graph->vwgt   = ismalloc(cnvtxs, 0, "CompressGraph: vwgt");
 93 |         // cadjncy = graph->adjncy = imalloc(cnedges, "CompressGraph: adjncy");
 94 |         //         graph->adjwgt = ismalloc(cnedges, 1, "CompressGraph: adjwgt");
 95 |         cxadj   = graph->xadj   = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * (cnvtxs + 1), "Compress_Graph: graph->xadj");
 96 |         cvwgt   = graph->vwgt   = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * cnvtxs, "Compress_Graph: graph->vwgt");
 97 |         cadjncy = graph->adjncy = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * cnedges, "Compress_Graph: graph->adjncy");
 98 |         cadjwgt = graph->adjwgt = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * cnedges, "Compress_Graph: graph->adjwgt");
 99 |         mynd_set_value_int(cnvtxs, 0, cvwgt);
100 |         mynd_set_value_int(cnedges, 1, cadjwgt);
101 | 
102 |         // printf("compress 1 !!!\n");
103 | 
104 |         mynd_set_value_int(nvtxs, -1, mark);
105 |         l = cxadj[0] = 0;
106 |         for (i = 0; i < cnvtxs; i++) 
107 |         {
108 |             mark[i] = i;
109 |             for (j = cptr[i]; j < cptr[i + 1]; j++) 
110 |             {
111 |                 ii = cind[j];
112 | 
113 |                 cvwgt[i] += (vwgt == NULL ? 1 : vwgt[ii]);
114 | 
115 |                 for (jj = xadj[ii]; jj < xadj[ii + 1]; jj++) 
116 |                 {
117 |                     k = map[adjncy[jj]];
118 |                     if (mark[k] != i) 
119 |                     {
120 |                         mark[k] = i;
121 |                         cadjncy[l++] = k;
122 |                     }
123 |                 }
124 |             }
125 |             cxadj[i + 1] = l;
126 |         }
127 | 
128 |         // printf("compress 2 !!!\n");
129 | 
130 |         graph->nvtxs  = cnvtxs;
131 |         graph->nedges = l;
132 |         // graph->ncon   = 1;
133 |         // printf("sizeof(reordering_int_t) * cnedges=%"PRIDX" sizeof(reordering_int_t) * graph->nedges=%"PRIDX"\n",sizeof(reordering_int_t) * cnedges,sizeof(reordering_int_t) * graph->nedges);
134 |         cadjncy = graph->adjncy = (reordering_int_t *)mynd_check_realloc(cadjncy, sizeof(reordering_int_t) * graph->nedges, sizeof(reordering_int_t) * cnedges, "Compress_Graph: adjncy");
135 | 	    cadjwgt = graph->adjwgt = (reordering_int_t *)mynd_check_realloc(cadjwgt, sizeof(reordering_int_t) * graph->nedges, sizeof(reordering_int_t) * cnedges, "Compress_Graph: adjwgt");
136 |     
137 | 
138 |         mynd_SetupGraph_tvwgt(graph);
139 |         mynd_SetupGraph_label(graph);
140 |     }
141 | 
142 |     // printf("Compress_Graph 3\n");
143 | 
144 |     // printf("%"PRIDX"\n",mark[0]);
145 |     // for(reordering_int_t i = 0;i < nvtxs;i++)
146 |     //     printf("%"PRIDX" ",mark[i]);
147 |     
148 |     mynd_check_free(keys, sizeof(ikv_t) * nvtxs, "Compress_Graph: keys");
149 |     // printf("Compress_Graph 3.1\n");
150 |     mynd_check_free(map, sizeof(reordering_int_t) * nvtxs, "Compress_Graph: map");
151 |     // printf("Compress_Graph 3.2\n");
152 |     // printf("mark=%p \n",mark);
153 |     mynd_check_free(mark, sizeof(reordering_int_t) * nvtxs, "Compress_Graph: mark");
154 | 
155 |     // printf("Compress_Graph 4\n");
156 | 
157 |     return graph;
158 | }
159 | 
160 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_hashtable.c:
--------------------------------------------------------------------------------
  1 | #ifndef HASHTABLE_H
  2 | #define HASHTABLE_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | //  Hash Table Version 1.0
  7 | void mynd_hash_table_Init(hash_table_t *hash, reordering_int_t size)
  8 | {
  9 |     hash->nownodes = 0;
 10 |     hash->maxnodes = size;
 11 | 
 12 |     hash->hashelement = (hashelement_t *)mynd_check_malloc(sizeof(hashelement_t) * size, "mynd_hash_table_Init: hashelement");
 13 | 
 14 |     for(int i = 0;i < size;i++)
 15 |         hash->hashelement[i].val = -1;
 16 | }
 17 | 
 18 | hash_table_t *mynd_hash_table_Create(reordering_int_t size)
 19 | {
 20 |     hash_table_t *hash;
 21 | 
 22 |     hash = (hash_table_t *)mynd_check_malloc(sizeof(hash_table_t), "mynd_hash_table_Create: hash");
 23 |     mynd_hash_table_Init(hash, size);
 24 | 
 25 |     return hash;
 26 | }
 27 | 
 28 | void mynd_hashelement_Free(hash_table_t *hash)
 29 | {
 30 |     if (hash == NULL) return;
 31 |     mynd_check_free(hash->hashelement, sizeof(hashelement_t) * hash->maxnodes, "mynd_hashelement_Free: hash->hashelement");
 32 |     hash->nownodes = 0;
 33 |     hash->maxnodes = 0;
 34 | }
 35 | 
 36 | void mynd_hash_table_Destroy(hash_table_t *hash)
 37 | {
 38 |     if (hash == NULL) return;
 39 | 	mynd_hashelement_Free(hash);
 40 | 	mynd_check_free(hash, sizeof(hash_table_t), "hash_table_Destroy: hash");
 41 | }
 42 | 
 43 | reordering_int_t mynd_hash_table_Length(hash_table_t *hash)
 44 | {
 45 | 	return hash->nownodes;
 46 | }
 47 | 
 48 | reordering_int_t mynd_hashFunction(reordering_int_t val, reordering_int_t size)
 49 | {
 50 |     return val % size;
 51 | }
 52 | 
 53 | reordering_int_t mynd_Insert_hashelement(hashelement_t *element, reordering_int_t size, reordering_int_t val, reordering_int_t key, reordering_int_t index)
 54 | {
 55 |     reordering_int_t start_index = index;
 56 |     do
 57 |     {
 58 |         if(element[index].val == -1)
 59 |         {
 60 |             element[index].val = val;
 61 |             element[index].key = key;
 62 |             return 1;
 63 |         }
 64 |         else if(element[index].val == val)
 65 |         {
 66 |             element[index].key += key;
 67 |             return 0;
 68 |         }
 69 | 
 70 |         index++;
 71 |         if(index >= size)
 72 |             index = 0;
 73 |     } while (index != start_index);
 74 | 
 75 |     return 0;
 76 | }
 77 | 
 78 | void mynd_hash_table_Insert(hash_table_t *hash, reordering_int_t val, reordering_int_t key)
 79 | {
 80 |     reordering_int_t index = mynd_hashFunction(val, hash->maxnodes);
 81 |     
 82 |     hash->nownodes += mynd_Insert_hashelement(hash->hashelement,hash->maxnodes, val, key, index);
 83 | 
 84 | 	return ;
 85 | }
 86 | 
 87 | void mynd_Traversal_hashelement(hashelement_t *element, reordering_int_t size, reordering_int_t *dst1, reordering_int_t *dst2, reordering_int_t ptr) 
 88 | {
 89 |     if (element != NULL) 
 90 | 	{
 91 |         for(reordering_int_t i = 0;i < size;i++)
 92 |         {
 93 |             reordering_int_t t = element[i].val;
 94 |             if(t != -1)
 95 |             {
 96 |                 dst1[ptr] = t;
 97 | 			    dst2[ptr] = element[i].key;
 98 |                 ptr++;
 99 |             }
100 |         }
101 |     }
102 | }
103 | 
104 | void mynd_hash_table_Traversal(hash_table_t *hash, reordering_int_t *dst1, reordering_int_t *dst2)
105 | {
106 |     hashelement_t *element = hash->hashelement;
107 | 
108 |     mynd_Traversal_hashelement(element, hash->maxnodes, dst1, dst2, 0);
109 | }
110 | 
111 | 
112 | //  Hash Table Version 2.0
113 | void mynd_hash_table_Init2(hash_table2_t *hash, reordering_int_t size)
114 | {
115 |     hash->nownodes = 0;
116 |     hash->maxnodes = size;
117 | 
118 |     hash->hashelement = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * size, "hash_table_Init2: hashelement");
119 | 
120 |     for(int i = 0;i < size;i++)
121 |         hash->hashelement[i] = -1;
122 | }
123 | 
124 | hash_table2_t *mynd_hash_table_Create2(reordering_int_t size)
125 | {
126 |     hash_table2_t *hash;
127 | 
128 |     hash = (hash_table2_t *)mynd_check_malloc(sizeof(hash_table2_t), "hash_table_Create2: hash");
129 | 
130 |     mynd_hash_table_Init2(hash, size);
131 | 
132 |     return hash;
133 | }
134 | 
135 | void mynd_hashelement_Free2(hash_table2_t *hash)
136 | {
137 |     if (hash == NULL) return;
138 |     mynd_check_free(hash->hashelement, sizeof(reordering_int_t) * hash->maxnodes, "hashelement_Free2: hash->hashelement");
139 |     hash->nownodes = 0;
140 |     hash->maxnodes = 0;
141 | }
142 | 
143 | void mynd_hash_table_Destroy2(hash_table2_t *hash)
144 | {
145 |     if (hash == NULL) return;
146 | 	mynd_hashelement_Free2(hash);
147 | 	mynd_check_free(hash, sizeof(hash_table2_t), "hash_table_Destroy2: hash");
148 | }
149 | 
150 | reordering_int_t mynd_hash_table_Length2(hash_table2_t *hash)
151 | {
152 | 	return hash->nownodes;
153 | }
154 | 
155 | void mynd_hash_table_Reset2(hash_table2_t *hash, reordering_int_t *src)
156 | {
157 |     for(reordering_int_t i = 0;i < hash->nownodes;i++)
158 |         hash->hashelement[src[i]] = -1;
159 |     hash->nownodes = 0;
160 | }
161 | 
162 | reordering_int_t mynd_Insert_hashelement2(reordering_int_t *element, reordering_int_t val, reordering_int_t key)
163 | {
164 |     if(element[val] == -1)
165 |     {
166 |         element[val] = key;
167 |         return 1;
168 |     }
169 | 
170 |     return 0;
171 | }
172 | 
173 | //  0 --> Already Exist
174 | //  1 --> Be Inserting
175 | reordering_int_t mynd_hash_table_Insert2(hash_table2_t *hash, reordering_int_t val, reordering_int_t key)
176 | {
177 |     reordering_int_t flag = 0;
178 |     
179 |     flag = mynd_Insert_hashelement2(hash->hashelement, val, key);
180 | 
181 |     hash->nownodes += flag;
182 | 
183 | 	return flag;
184 | }
185 | 
186 | reordering_int_t mynd_hash_table_Find2(hash_table2_t *hash, reordering_int_t val)
187 | {
188 |     if(hash->hashelement[val] == -1)
189 |         return -1;
190 |     else 
191 |         return hash->hashelement[val];
192 | }
193 | 
194 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_ikvsorti.c:
--------------------------------------------------------------------------------
 1 | #ifndef IKVSORTI_H
 2 | #define IKVSORTI_H
 3 | 
 4 | #include "mynd_functionset.h"
 5 | 
 6 | void mynd_ikvsorti(size_t n, ikv_t *base)
 7 | {
 8 | #define ikey_lt(a, b) ((a)->key < (b)->key)
 9 |     GK_MKQSORT(ikv_t, base, n, ikey_lt);
10 | #undef ikey_lt
11 | }
12 | 
13 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_initialpartition.c:
--------------------------------------------------------------------------------
  1 | #ifndef INITIALPARTITION_H
  2 | #define INITIALPARTITION_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | //  InitSeparator + GrowBisectionNode
  7 | void mynd_ReorderBisection(graph_t *graph, reordering_int_t niparts)
  8 | {
  9 |     reordering_real_t ntpwgts[2] = {0.5, 0.5};
 10 | 
 11 |     /* this is required for the cut-based part of the refinement */
 12 |     // Setup2WayBalMultipliers(graph, ntpwgts);
 13 | 
 14 |     // GrowBisectionNode(graph, ntpwgts, niparts);
 15 |     reordering_int_t i, j, k, nvtxs, drain, nleft, first, last, pwgts[2], oneminpwgt, 
 16 |         onemaxpwgt, bestcut=0, inbfs;
 17 |     reordering_int_t *xadj, *vwgt, *adjncy, *where, *bndind;
 18 |     reordering_int_t *queue, *touched, *bestwhere;
 19 | 
 20 |     nvtxs  = graph->nvtxs;
 21 |     xadj   = graph->xadj;
 22 |     vwgt   = graph->vwgt;
 23 |     adjncy = graph->adjncy;
 24 | 
 25 |     onemaxpwgt = 1.2000499 * graph->tvwgt[0] * 0.5;
 26 |     oneminpwgt = (1.0 / 1.2000499) * graph->tvwgt[0] * 0.5;
 27 | 
 28 |     /* Allocate refinement memory. Allocate sufficient memory for both edge and node */
 29 |     graph->where  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: graph->where");
 30 |     graph->pwgts  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * 3, "ReorderBisection: graph->pwgts");
 31 |     graph->bndptr = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: graph->bndptr");
 32 |     graph->bndind = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: graph->bndind");
 33 |     graph->id     = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: graph->id");
 34 |     graph->ed     = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: graph->ed");
 35 |     graph->nrinfo = (nrinfo_t *)mynd_check_malloc(sizeof(nrinfo_t) * nvtxs, "ReorderBisection: graph->nrinfo");
 36 | 
 37 |     bestwhere = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: bestwhere");
 38 |     queue     = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: queue");
 39 |     touched   = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "ReorderBisection: touched");
 40 |   
 41 |     where  = graph->where;
 42 |     bndind = graph->bndind;
 43 |     // printf("ReorderBisection 0\n");
 44 |     for (inbfs = 0; inbfs < niparts; inbfs++) 
 45 |     {
 46 |         mynd_set_value_int(nvtxs, 1, where);
 47 |         mynd_set_value_int(nvtxs, 0, touched);
 48 | 
 49 |         pwgts[1] = graph->tvwgt[0];
 50 |         pwgts[0] = 0;
 51 | 
 52 |         queue[0] = mynd_irandInRange(nvtxs);
 53 |         touched[queue[0]] = 1;
 54 |         first = 0; last = 1;
 55 |         nleft = nvtxs-1;
 56 |         drain = 0;
 57 | 
 58 |         /* Start the BFS from queue to get a partition */
 59 |         for (;;) 
 60 |         {
 61 |             if (first == last) 
 62 |             { 
 63 |                 /* Empty. Disconnected graph! */
 64 |                 if (nleft == 0 || drain)
 65 |                     break;
 66 |   
 67 |                 k = mynd_irandInRange(nleft);
 68 |                 for (i = 0; i < nvtxs; i++) 
 69 |                 { 
 70 |                     /* select the kth untouched vertex */
 71 |                     if (touched[i] == 0) 
 72 |                     {
 73 |                         if (k == 0)
 74 |                             break;
 75 |                         else
 76 |                             k--;
 77 |                     }
 78 |                 }
 79 | 
 80 |                 queue[0]   = i;
 81 |                 touched[i] = 1;
 82 |                 first      = 0; 
 83 |                 last       = 1;
 84 |                 nleft--;
 85 |             }
 86 | 
 87 |             i = queue[first++];
 88 |             if (pwgts[1] - vwgt[i] < oneminpwgt) 
 89 |             {
 90 |                 drain = 1;
 91 |                 continue;
 92 |             }
 93 | 
 94 |             where[i] = 0;
 95 |             pwgts[0] += vwgt[i];
 96 |             pwgts[1] -= vwgt[i];
 97 |             if (pwgts[1] <= onemaxpwgt)
 98 |                 break;
 99 | 
100 |             drain = 0;
101 |             for (j = xadj[i]; j < xadj[i + 1]; j++) 
102 |             {
103 |                 k = adjncy[j];
104 |                 if (touched[k] == 0) 
105 |                 {
106 |                 queue[last++] = k;
107 |                 touched[k] = 1;
108 |                 nleft--;
109 |                 }
110 |             }
111 |         }
112 |         // printf("ReorderBisection 1\n");
113 |         // printf("ReorderBisection 1 end ccnt=%"PRIDX"\n",rand_count());
114 |         // exam_where(graph);
115 | 
116 |         /*************************************************************
117 |         * Do some partition refinement 
118 |         **************************************************************/
119 |         CONTROL_COMMAND(control, PARTITIOBINF2WAY, mynd_gettimebegin(&start_partitioninf2way, &end_partitioninf2way, &time_partitioninf2way));
120 | 		mynd_Compute_Partition_Informetion_2way(graph);
121 | 		CONTROL_COMMAND(control, PARTITIOBINF2WAY, mynd_gettimeend(&start_partitioninf2way, &end_partitioninf2way, &time_partitioninf2way));
122 |         // printf("ReorderBisection 2\n");
123 |         // exam_where(graph);
124 |         CONTROL_COMMAND(control, FM2WAYCUTBALANCE_Time, mynd_gettimebegin(&start_fm2waycutbalance, &end_fm2waycutbalance, &time_fm2waycutbalance));
125 | 		mynd_Balance2Way(graph, ntpwgts);
126 | 		CONTROL_COMMAND(control, FM2WAYCUTBALANCE_Time, mynd_gettimeend(&start_fm2waycutbalance, &end_fm2waycutbalance, &time_fm2waycutbalance));
127 |         // printf("ReorderBisection 3\n");
128 |         // printf("ReorderBisection 3 end ccnt=%"PRIDX"\n",rand_count());
129 |         // exam_where(graph);
130 |         CONTROL_COMMAND(control, FM2WAYCUTREFINE_Time, mynd_gettimebegin(&start_fm2waycutrefine, &end_fm2waycutrefine, &time_fm2waycutrefine));
131 | 		mynd_FM_2WayCutRefine(graph, ntpwgts, 4);
132 | 		CONTROL_COMMAND(control, FM2WAYCUTREFINE_Time, mynd_gettimeend(&start_fm2waycutrefine, &end_fm2waycutrefine, &time_fm2waycutrefine));
133 |         // printf("ReorderBisection 4\n");
134 |         // printf("ReorderBisection 4 end ccnt=%"PRIDX"\n",rand_count());
135 |         // exam_where(graph);
136 | 
137 |         /* Construct and refine the vertex separator */
138 |         for (i = 0; i < graph->nbnd; i++) 
139 |         {
140 |             j = bndind[i];
141 |             if (xadj[j + 1] - xadj[j] > 0) /* ignore islands */
142 |                 where[j] = 2;
143 |         }
144 | 
145 |         // printf("ReorderBisection 5\n");
146 |         // exam_where(graph);
147 | 
148 |         CONTROL_COMMAND(control, REORDERINF2WAY_Time, mynd_gettimebegin(&start_reorderinf2way, &end_reorderinf2way, &time_reorderinf2way));
149 | 		mynd_Compute_Reorder_Informetion_2way(graph); 
150 | 		CONTROL_COMMAND(control, REORDERINF2WAY_Time, mynd_gettimeend(&start_reorderinf2way, &end_reorderinf2way, &time_reorderinf2way));
151 |         // printf("ReorderBisection 6\n");
152 |         // exam_where(graph);
153 |         CONTROL_COMMAND(control, FM2SIDENODEREFINE_Time, mynd_gettimebegin(&start_fm2sidenoderefine, &end_fm2sidenoderefine, &time_fm2sidenoderefine));
154 | 		mynd_FM_2WayNodeRefine2Sided(graph, 1);
155 | 		CONTROL_COMMAND(control, FM2SIDENODEREFINE_Time, mynd_gettimeend(&start_fm2sidenoderefine, &end_fm2sidenoderefine, &time_fm2sidenoderefine));
156 |         // printf("ReorderBisection 7\n");
157 |         // printf("ReorderBisection 7 end ccnt=%"PRIDX"\n",rand_count());
158 |         // exam_where(graph);
159 |         CONTROL_COMMAND(control, FM1SIDENODEREFINE_Time, mynd_gettimebegin(&start_fm1sidenoderefine, &end_fm1sidenoderefine, &time_fm1sidenoderefine));
160 | 		mynd_FM_2WayNodeRefine1Sided(graph, 4);
161 | 		CONTROL_COMMAND(control, FM1SIDENODEREFINE_Time, mynd_gettimeend(&start_fm1sidenoderefine, &end_fm1sidenoderefine, &time_fm1sidenoderefine));
162 |         // printf("ReorderBisection 8\n");
163 |         // printf("ReorderBisection 8 end ccnt=%"PRIDX"\n",rand_count());
164 |         // exam_where(graph);
165 | 
166 |         /*
167 |         printf("ISep: [%"PRIDX" %"PRIDX" %"PRIDX" %"PRIDX"] %"PRIDX"\n", 
168 |             inbfs, graph->pwgts[0], graph->pwgts[1], graph->pwgts[2], bestcut); 
169 |         */
170 |         // printf("inbfs=%"PRIDX"\n",inbfs);
171 |         if (inbfs == 0 || bestcut > graph->mincut) 
172 |         {
173 |             bestcut = graph->mincut;
174 |             mynd_copy_int(nvtxs, where, bestwhere);
175 |         }
176 |     }
177 | 
178 |     graph->mincut = bestcut;
179 |     mynd_copy_int(nvtxs, bestwhere, where);
180 |     // printf("ReorderBisection 9\n");
181 |     mynd_check_free(touched, sizeof(reordering_int_t) * nvtxs, "ReorderBisection: touched");
182 |     mynd_check_free(queue, sizeof(reordering_int_t) * nvtxs, "ReorderBisection: queue");
183 |     mynd_check_free(bestwhere, sizeof(reordering_int_t) * nvtxs, "ReorderBisection: bestwhere");
184 |     //     exam_where(graph);
185 | }
186 | 
187 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_priorityqueue.c:
--------------------------------------------------------------------------------
  1 | #ifndef PRIORITYQUEUE_H
  2 | #define PRIORITYQUEUE_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | //	priority queue
  7 | /*************************************************************************/
  8 | /*! This function initializes the data structures of the priority queue */
  9 | /**************************************************************************/
 10 | void mynd_priority_queue_Init(priority_queue_t *queue, reordering_int_t maxnodes)
 11 | {
 12 | 	queue->nownodes = 0;
 13 | 	queue->maxnodes = maxnodes;
 14 | 	queue->heap     = (node_t *)mynd_check_malloc(sizeof(node_t) * maxnodes, "priority_queue_Init: heap");
 15 | 	queue->locator  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * maxnodes, "priority_queue_Init: locator");
 16 | 	for(reordering_int_t i = 0;i < maxnodes;i++)
 17 | 		queue->locator[i] = -1;
 18 | }
 19 | 
 20 | /*************************************************************************/
 21 | /*! This function creates and initializes a priority queue */
 22 | /**************************************************************************/
 23 | priority_queue_t *mynd_priority_queue_Create(reordering_int_t maxnodes)
 24 | {
 25 | 	priority_queue_t *queue; 
 26 | 
 27 | 	queue = (priority_queue_t *)mynd_check_malloc(sizeof(priority_queue_t), "mynd_priority_queue_Create: queue");
 28 | 	mynd_priority_queue_Init(queue, maxnodes);
 29 | 
 30 | 	return queue;
 31 | }
 32 | 
 33 | /*************************************************************************/
 34 | /*! This function resets the priority queue */
 35 | /**************************************************************************/
 36 | void mynd_priority_queue_Reset(priority_queue_t *queue)
 37 | {
 38 | 	reordering_int_t i;
 39 | 	reordering_int_t *locator = queue->locator;
 40 | 	node_t *heap    = queue->heap;
 41 | 
 42 | 	for (i = queue->nownodes - 1; i >= 0; i--)
 43 | 		locator[heap[i].val] = -1;
 44 | 	queue->nownodes = 0;
 45 | }
 46 | 
 47 | /*************************************************************************/
 48 | /*! This function frees the internal datastructures of the priority queue */
 49 | /**************************************************************************/
 50 | void mynd_priority_queue_Free(priority_queue_t *queue)
 51 | {
 52 | 	if (queue == NULL) return;
 53 | 	mynd_check_free(queue->locator, sizeof(reordering_int_t) * queue->maxnodes, "priority_queue_Free: queue->locator");
 54 | 	mynd_check_free(queue->heap, sizeof(node_t) * queue->maxnodes, "priority_queue_Free: queue->heap");
 55 | 	queue->maxnodes = 0;
 56 | }
 57 | 
 58 | /*************************************************************************/
 59 | /*! This function frees the internal datastructures of the priority queue 
 60 |     and the queue itself */
 61 | /**************************************************************************/
 62 | void mynd_priority_queue_Destroy(priority_queue_t *queue)
 63 | {
 64 | 	if (queue == NULL) return;
 65 | 	mynd_priority_queue_Free(queue);
 66 | 	mynd_check_free(queue, sizeof(priority_queue_t), "mynd_priority_queue_Destroy: queue");
 67 | }
 68 | 
 69 | /*************************************************************************/
 70 | /*! This function returns the length of the queue */
 71 | /**************************************************************************/
 72 | reordering_int_t mynd_priority_queue_Length(priority_queue_t *queue)
 73 | {
 74 | 	return queue->nownodes;
 75 | }
 76 | 
 77 | /*************************************************************************/
 78 | /*! This function adds an item in the priority queue */
 79 | /**************************************************************************/
 80 | reordering_int_t mynd_priority_queue_Insert(priority_queue_t *queue, reordering_int_t node, reordering_int_t key)
 81 | {
 82 | 	reordering_int_t i, j;
 83 | 	reordering_int_t *locator=queue->locator;
 84 | 	node_t *heap = queue->heap;
 85 | 
 86 | 	i = queue->nownodes++;
 87 | 	while (i > 0) 
 88 | 	{
 89 | 		j = (i - 1) >> 1;
 90 | 		if (m_gt_n(key, heap[j].key)) 
 91 | 		{
 92 | 			heap[i] = heap[j];
 93 | 			locator[heap[i].val] = i;
 94 | 			i = j;
 95 | 		}
 96 | 		else
 97 | 			break;
 98 | 	}
 99 |   
100 | 	heap[i].key   = key;
101 | 	heap[i].val   = node;
102 | 	locator[node] = i;
103 | 
104 | 	return 0;
105 | }
106 | 
107 | /*************************************************************************/
108 | /*! This function deletes an item from the priority queue */
109 | /**************************************************************************/
110 | reordering_int_t mynd_priority_queue_Delete(priority_queue_t *queue, reordering_int_t node)
111 | {
112 | 	reordering_int_t i, j, nownodes;
113 | 	reordering_int_t newkey, oldkey;
114 | 	reordering_int_t *locator = queue->locator;
115 | 	node_t *heap = queue->heap;
116 | 
117 | 	i = locator[node];
118 | 	locator[node] = -1;
119 | 
120 | 	if (--queue->nownodes > 0 && heap[queue->nownodes].val != node) 
121 | 	{
122 | 		node   = heap[queue->nownodes].val;
123 | 		newkey = heap[queue->nownodes].key;
124 | 		oldkey = heap[i].key;
125 | 
126 | 		if (m_gt_n(newkey, oldkey)) 
127 | 		{ /* Filter-up */
128 | 			while (i > 0) 
129 | 			{
130 | 				j = (i - 1) >> 1;
131 | 				if (m_gt_n(newkey, heap[j].key)) 
132 | 				{
133 | 					heap[i] = heap[j];
134 | 					locator[heap[i].val] = i;
135 | 					i = j;
136 | 				}
137 | 				else
138 | 					break;
139 | 			}
140 | 		}
141 | 		else 
142 | 		{ /* Filter down */
143 | 			nownodes = queue->nownodes;
144 | 			while ((j = (i << 1) + 1) < nownodes) 
145 | 			{
146 | 				if (m_gt_n(heap[j].key, newkey)) 
147 | 				{
148 | 					if (j + 1 < nownodes && m_gt_n(heap[j + 1].key, heap[j].key))
149 | 						j++;
150 | 					heap[i] = heap[j];
151 | 					locator[heap[i].val] = i;
152 | 					i = j;
153 | 				}
154 | 				else if (j + 1 < nownodes && m_gt_n(heap[j + 1].key, newkey)) 
155 | 				{
156 | 					j++;
157 | 					heap[i] = heap[j];
158 | 					locator[heap[i].val] = i;
159 | 					i = j;
160 | 				}
161 | 				else
162 | 					break;
163 | 			}
164 | 		}
165 | 
166 | 		heap[i].key   = newkey;
167 | 		heap[i].val   = node;
168 | 		locator[node] = i;
169 | 	}
170 | 
171 | 	return 0;
172 | }
173 | 
174 | /*************************************************************************/
175 | /*! This function updates the key values associated for a particular item */ 
176 | /**************************************************************************/
177 | void mynd_priority_queue_Update(priority_queue_t *queue, reordering_int_t node, reordering_int_t newkey)
178 | {
179 | 	reordering_int_t i, j, nownodes;
180 | 	reordering_int_t oldkey;
181 | 	reordering_int_t *locator = queue->locator;
182 | 	node_t *heap = queue->heap;
183 | 
184 | 	oldkey = heap[locator[node]].key;
185 | 
186 | 	i = locator[node];
187 | 
188 | 	if (m_gt_n(newkey, oldkey)) 
189 | 	{ /* Filter-up */
190 | 		while (i > 0) 
191 | 		{
192 | 			j = (i - 1) >> 1;
193 | 			if (m_gt_n(newkey, heap[j].key)) 
194 | 			{
195 | 				heap[i] = heap[j];
196 | 				locator[heap[i].val] = i;
197 | 				i = j;
198 | 			}
199 | 			else
200 | 				break;
201 | 		}
202 | 	}
203 | 	else 
204 | 	{ /* Filter down */
205 | 		nownodes = queue->nownodes;
206 | 		while ((j = (i << 1) + 1) < nownodes) 
207 | 		{
208 | 			if (m_gt_n(heap[j].key, newkey)) 
209 | 			{
210 | 				if (j + 1 < nownodes && m_gt_n(heap[j + 1].key, heap[j].key))
211 | 					j++;
212 | 				heap[i] = heap[j];
213 | 				locator[heap[i].val] = i;
214 | 				i = j;
215 | 			}
216 | 			else if (j + 1 < nownodes && m_gt_n(heap[j + 1].key, newkey)) 
217 | 			{
218 | 				j++;
219 | 				heap[i] = heap[j];
220 | 				locator[heap[i].val] = i;
221 | 				i = j;
222 | 			}
223 | 			else
224 | 				break;
225 | 		}
226 | 	}
227 | 
228 | 	heap[i].key   = newkey;
229 | 	heap[i].val   = node;
230 | 	locator[node] = i;
231 | 
232 | 	return;
233 | }
234 | 
235 | /*************************************************************************/
236 | /*! This function returns the item at the top of the queue and removes
237 |     it from the priority queue */
238 | /**************************************************************************/
239 | reordering_int_t mynd_priority_queue_GetTop(priority_queue_t *queue)
240 | {
241 | 	reordering_int_t i, j;
242 | 	reordering_int_t *locator;
243 | 	node_t *heap;
244 | 	reordering_int_t vtx, node;
245 | 	reordering_int_t key;
246 | 
247 | 	if (queue->nownodes == 0)
248 | 		return -1;
249 | 
250 | 	queue->nownodes--;
251 | 
252 | 	heap    = queue->heap;
253 | 	locator = queue->locator;
254 | 
255 | 	vtx = heap[0].val;
256 | 	locator[vtx] = -1;
257 | 
258 | 	if ((i = queue->nownodes) > 0) 
259 | 	{
260 | 		key  = heap[i].key;
261 | 		node = heap[i].val;
262 | 		i = 0;
263 | 		while ((j = 2 * i + 1) < queue->nownodes) 
264 | 		{
265 | 			if (m_gt_n(heap[j].key, key)) 
266 | 			{
267 | 				if (j + 1 < queue->nownodes && m_gt_n(heap[j + 1].key, heap[j].key))
268 | 					j = j+1;
269 | 				heap[i] = heap[j];
270 | 				locator[heap[i].val] = i;
271 | 				i = j;
272 | 			}
273 | 			else if (j + 1 < queue->nownodes && m_gt_n(heap[j + 1].key, key)) 
274 | 			{
275 | 				j = j + 1;
276 | 				heap[i] = heap[j];
277 | 				locator[heap[i].val] = i;
278 | 				i = j;
279 | 			}
280 | 			else
281 | 				break;
282 | 		}
283 | 
284 | 		heap[i].key   = key;
285 | 		heap[i].val   = node;
286 | 		locator[node] = i;
287 | 	}
288 | 
289 | 	return vtx;
290 | }
291 | 
292 | /*************************************************************************/
293 | /*! This function returns the item at the top of the queue. The item is not
294 |     deleted from the queue. */
295 | /**************************************************************************/
296 | reordering_int_t mynd_priority_queue_SeeTopVal(priority_queue_t *queue)
297 | {
298 |   return (queue->nownodes == 0 ? -1 : queue->heap[0].val);
299 | }
300 | 
301 | /*************************************************************************/
302 | /*! This function returns the item at the top of the queue. The item is not
303 |     deleted from the queue. */
304 | /**************************************************************************/
305 | reordering_int_t mynd_priority_queue_SeeTopKey(priority_queue_t *queue)
306 | {
307 |   return (queue->nownodes == 0 ? -1 : queue->heap[0].key);
308 | }
309 | 
310 | void mynd_exam_priority_queue(priority_queue_t *queue)
311 | {
312 | 	printf("nownodes=%"PRIDX" maxnodes=%"PRIDX"\n",queue->nownodes,queue->maxnodes);
313 | 	printf("key:");
314 | 	for(reordering_int_t i = 0;i < queue->nownodes;i++)
315 | 		printf("%"PRIDX" ",queue->heap[i].key);
316 | 	printf("\n");
317 | 	printf("val:");
318 | 	for(reordering_int_t i = 0;i < queue->nownodes;i++)
319 | 		printf("%"PRIDX" ",queue->heap[i].val);
320 | 	printf("\n");
321 | }
322 | 
323 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_queue.c:
--------------------------------------------------------------------------------
 1 | #ifndef QUEUE_H
 2 | #define QUEUE_H
 3 | 
 4 | #include "mynd_functionset.h"
 5 | 
 6 | //	common queue
 7 | reordering_int_t mynd_init_queue(reordering_int_t ptr, reordering_int_t *bndptr, reordering_int_t nvtxs)
 8 | {
 9 | 	mynd_set_value_int(nvtxs, -1, bndptr);
10 | 	return 0;
11 | }
12 | 
13 | /*************************************************************************/
14 | /*! Execution process:	(n -> empty)
15 | 		nbnd:	 5
16 | 		bndind:	 6  4  5  9  2  n  n  n  n  n
17 | 		bndptr:	-1 -1  4 -1  1  2  0 -1 -1  3
18 | 	aftering insert 8
19 | 		nbnd:	 6
20 | 		bndind:	 6  4  5  9  2  8  n  n  n  n
21 | 		bndptr:	-1 -1  4 -1  1  2  0 -1  5  3
22 |  */
23 | /**************************************************************************/
24 | 																//  /\		//
25 | reordering_int_t mynd_insert_queue(reordering_int_t nbnd, reordering_int_t *bndptr, reordering_int_t *bndind, reordering_int_t vertex)// /  \		//
26 | {													//				||
27 | 	bndind[nbnd]   = vertex;						//	bndind[5] = 8
28 | 	bndptr[vertex] = nbnd;							//	bndptr[8] = 5
29 | 	nbnd ++;										//	nbnd      = 6
30 | 
31 | 	return nbnd;
32 | }
33 | 
34 | /*************************************************************************/
35 | /*! Execution process:	(n -> empty)
36 | 		nbnd:	 6
37 | 		bndind:	 6  4  5  9  2  8  n  n  n  n
38 | 		bndptr:	-1 -1  4 -1  1  2  0 -1  5  3
39 | 	aftering delete 4
40 | 		nbnd:	 5
41 | 		bndind:	 6  8  5  9  2  n  n  n  n  n
42 | 		bndptr:	-1 -1  4 -1 -1  2  0 -1  1  3
43 |  */
44 | /**************************************************************************/
45 | 																//  /\		//
46 | reordering_int_t mynd_delete_queue(reordering_int_t nbnd, reordering_int_t *bndptr, reordering_int_t *bndind, reordering_int_t vertex)// /  \		//
47 | {													//				||
48 | 	bndind[bndptr[vertex]]   = bndind[nbnd - 1];	//	bndind[1] = 8
49 | 	bndptr[bndind[nbnd - 1]] = bndptr[vertex];		//	bndptr[8] = 1
50 | 	bndptr[vertex] = -1;							//	bndptr[4] = -1
51 | 	nbnd --;										//	nbnd      = 5
52 | 
53 | 	return nbnd;
54 | }
55 | 
56 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_read.c:
--------------------------------------------------------------------------------
  1 | #ifndef READ_H
  2 | #define READ_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | /*************************************************************************
  7 | * This function checks if a file exists
  8 | **************************************************************************/
  9 | reordering_int_t mynd_Is_file_exists(char *fname)
 10 | {
 11 |     struct stat status;
 12 | 
 13 |     if (stat(fname, &status) == -1)
 14 |         return 0;
 15 | 
 16 |     return S_ISREG(status.st_mode);
 17 | }
 18 | 
 19 | /*************************************************************************
 20 | * This function opens a file
 21 | **************************************************************************/
 22 | FILE * mynd_check_fopen(char *fname, char *mode, const char *message)
 23 | {
 24 |     FILE *fp = fopen(fname, mode);
 25 |     if (fp != NULL)
 26 |         return fp;
 27 | 
 28 |     char *error_message = (char *)mynd_check_malloc(sizeof(char) * 1024, "check_fopen: error_message");
 29 | 	sprintf(error_message, "Failed on check_fopen()\n\tfile: %s, mode: %s, [ %s ].", fname, mode, message);
 30 |     mynd_error_exit(error_message);
 31 | 
 32 |     return NULL;
 33 | }
 34 | 
 35 | /*************************************************************************/
 36 | /*! This function is the GKlib implementation of glibc's getline()
 37 |     function.
 38 |     \returns -1 if the EOF has been reached, otherwise it returns the 
 39 |              number of bytes read.
 40 | */
 41 | /*************************************************************************/
 42 | ssize_t  mynd_check_getline(char **lineptr, reordering_int_t *n, FILE *stream)
 43 | {
 44 |     reordering_int_t i;
 45 |     reordering_int_t ch;
 46 | 
 47 |     /* Check whether the file stream reaches the end of the file, and if it does, return -1 */
 48 |     if (feof(stream))
 49 |         return -1;  
 50 | 
 51 |     /* Initial memory allocation if *lineptr is NULL */
 52 |     if (*lineptr == NULL || *n == 0) {
 53 |         *n = 1024;
 54 |         *lineptr = (char *)mynd_check_malloc(sizeof(char) * (*n), "check_getline: lineptr");
 55 |     }
 56 | 
 57 |     /* get into the main loop */
 58 |     i = 0;
 59 |     /* The getc function is used to read characters from the file stream until the end of the file is reached */
 60 |     while ((ch = getc(stream)) != EOF) {
 61 |         (*lineptr)[i++] = (char)ch;
 62 | 
 63 |         /* reallocate memory if reached at the end of the buffer. The +1 is for '\0' */
 64 |         if (i+1 == *n) { 
 65 |             *n = 2*(*n);
 66 |             *lineptr = (char *)mynd_check_realloc(*lineptr, (*n)*sizeof(char), (*n)*sizeof(char)/2, "check_getline: lineptr");
 67 |         }
 68 |         
 69 |         if (ch == '\n')
 70 |             break;
 71 |     }
 72 |     (*lineptr)[i] = '\0';
 73 | 
 74 |     return (i == 0 ? -1 : i);
 75 | }
 76 | 
 77 | /*************************************************************************
 78 | * This function closes a file
 79 | **************************************************************************/
 80 | void  mynd_check_fclose(FILE *fp)
 81 | {
 82 | 	fclose(fp);
 83 | }
 84 | 
 85 | /*************************************************************************/
 86 | /*! This function reads in a sparse graph */
 87 | /*************************************************************************/
 88 | /*
 89 | params->filename = graphfile
 90 | 
 91 | */
 92 | void mynd_ReadGraph(char *filename, reordering_int_t *nvtxs, reordering_int_t *nedges, reordering_int_t **txadj, reordering_int_t **tvwgt, reordering_int_t **tadjncy, reordering_int_t **tadjwgt)
 93 | {
 94 |     reordering_int_t i, k, l, fmt, nfields, readew, readvw, readvs, edge, ewgt;
 95 |     reordering_int_t *xadj, *adjncy, *vwgt, *adjwgt;
 96 | 	reordering_int_t *vsize;
 97 |     char *line = NULL, fmtstr[256], *curstr, *newstr;
 98 |     reordering_int_t lnlen = 0;
 99 |     FILE *fpin;
100 |     graph_t *graph;
101 | 
102 |     if (!mynd_Is_file_exists(filename)) 
103 |     {
104 |         // char *error_message = (char *)malloc(sizeof(char) * 128);   //!!!
105 |         char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
106 | 		sprintf(error_message, "File %s does not exist!", filename);
107 |         mynd_error_exit(error_message);
108 |         // errexit("File %s does not exist!\n", params->filename);
109 |     }
110 | 	
111 |     graph = mynd_CreateGraph();
112 | 
113 |     fpin = mynd_check_fopen(filename, "r", "ReadGRaph: Graph");
114 | 	
115 |     /* Skip comment lines until you get to the first valid line */
116 |     do {
117 |         if ( mynd_check_getline(&line, &lnlen, fpin) == -1) 
118 |         {
119 |             char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
120 | 			sprintf(error_message, "Premature end of input file: file: %s.", filename);
121 |             mynd_error_exit(error_message);
122 |         }
123 |     } while (line[0] == '%');
124 | 
125 |     fmt = 0;
126 |     nfields = sscanf(line, "%"PRIDX" %"PRIDX" %"PRIDX"", &(graph->nvtxs), &(graph->nedges), &fmt);
127 | 	
128 | 	if (nfields < 2) 
129 | 	{
130 | 		char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
131 | 		sprintf(error_message, "The input file does not specify the number of vertices and edges.");
132 | 		mynd_error_exit(error_message);
133 | 	}
134 | 	
135 | 	if (graph->nvtxs <= 0 || graph->nedges <= 0) 
136 | 	{
137 | 		char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
138 | 		sprintf(error_message, "The supplied nvtxs: %"PRIDX" and nedges: %"PRIDX" must be positive.", graph->nvtxs, graph->nedges);
139 | 		mynd_error_exit(error_message);
140 | 	}
141 | 	
142 | 	if (fmt > 111) 
143 | 	{
144 | 		char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
145 | 		sprintf(error_message, "Cannot read this type of file format [fmt=%"PRIDX"]!", fmt);
146 | 		mynd_error_exit(error_message);
147 | 	}
148 | 	
149 | 	sprintf(fmtstr, "%03"PRIDX"", fmt%1000);
150 | 	readvs = (fmtstr[0] == '1');
151 | 	readvw = (fmtstr[1] == '1');
152 | 	readew = (fmtstr[2] == '1');
153 | 	// printf("readvs=%"PRIDX" readvw=%"PRIDX" readew=%"PRIDX"\n",readvs, readvw, readew);
154 | 	
155 | 	// if (ncon > 0 && !readvw) 
156 | 	// {
157 | 	// 	char *error_message = (char *)mynd_check_malloc(sizeof(char) * 1024, "ReadGraph: error_message");
158 | 	// 	sprintf(error_message, 
159 | 	// 	"------------------------------------------------------------------------------\n"
160 | 	// 	"***  I detected an error in your input file  ***\n\n"
161 | 	// 	"You specified ncon=%d, but the fmt parameter does not specify vertex weights\n" 
162 | 	// 	"Make sure that the fmt parameter is set to either 10 or 11.\n"
163 | 	// 	"------------------------------------------------------------------------------\n", ncon);
164 | 	// 	mynd_error_exit(error_message);
165 | 	// }
166 | 
167 | 	graph->nedges *=2;
168 | 	// ncon = graph->ncon = (ncon == 0 ? 1 : ncon);
169 | 
170 | 	// xadj   = graph->xadj   = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * (graph->nvtxs + 1), "ReadGraph: xadj");
171 | 	// adjncy = graph->adjncy = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nedges, "ReadGraph: adjncy");
172 | 	// vwgt   = graph->vwgt   = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs, "ReadGraph: vwgt");
173 | 	// adjwgt = graph->adjwgt = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nedges,"ReadGraph: adjwgt");
174 | 	xadj   = *txadj   = (reordering_int_t *)malloc(sizeof(reordering_int_t) * (graph->nvtxs + 1));
175 | 	adjncy = *tadjncy = (reordering_int_t *)malloc(sizeof(reordering_int_t) * graph->nedges);
176 | 	vwgt   = *tvwgt   = (reordering_int_t *)malloc(sizeof(reordering_int_t) * graph->nvtxs);
177 | 	adjwgt = *tadjwgt = (reordering_int_t *)malloc(sizeof(reordering_int_t) * graph->nedges);
178 | 
179 | 	// vsize  = graph->vsize  = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * graph->nvtxs,"ReadGraph: vsize");
180 | 
181 | 	mynd_set_value_int(graph->nvtxs + 1, 0, xadj);
182 | 	mynd_set_value_int(graph->nvtxs, 1, vwgt);
183 | 	mynd_set_value_int(graph->nedges, 1, adjwgt);
184 | 	// mynd_set_value_int(graph->nvtxs, 1, vsize);
185 | 
186 | 	/*----------------------------------------------------------------------
187 | 	* Read the sparse graph file
188 | 	*---------------------------------------------------------------------*/
189 | 	for (xadj[0] = 0, k = 0, i = 0; i < graph->nvtxs; i++) 
190 | 	{
191 | 		do {
192 | 			if ( mynd_check_getline(&line, &lnlen, fpin) == -1) 
193 | 			{
194 | 				char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
195 | 				sprintf(error_message, "Premature end of input file while reading vertex %"PRIDX".", i + 1);
196 | 				mynd_error_exit(error_message);
197 | 			}
198 | 		} while (line[0] == '%');
199 | 
200 | 		curstr = line;
201 | 		newstr = NULL;
202 | 
203 |     	/* Read vertex sizes */
204 | 		if (readvs) 
205 | 		{
206 | 			vsize[i] = strtoll(curstr, &newstr, 10);
207 | 			if (newstr == curstr)
208 | 			{
209 | 				char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
210 | 				sprintf(error_message, "The line for vertex %"PRIDX" does not have vsize information.", i + 1);
211 | 				mynd_error_exit(error_message);
212 | 			}
213 | 			if (vsize[i] < 0)
214 | 			{
215 | 				char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
216 | 				sprintf(error_message, "The size for vertex %"PRIDX" must be >= 0.", i + 1);
217 | 				mynd_error_exit(error_message);
218 | 			}
219 | 
220 | 			curstr = newstr;
221 | 		}
222 | 
223 | 
224 | 		/* Read vertex weights */
225 | 		if (readvw) 
226 | 		{
227 | 			for (l=0; l<1; l++) 
228 | 			{
229 | 				vwgt[i+l] = strtoll(curstr, &newstr, 10);
230 | 				if (newstr == curstr)
231 | 				{
232 | 					char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
233 | 					sprintf(error_message, "The line for vertex %"PRIDX" does not have enough weights for the %"PRIDX" constraint.", i + 1, l);
234 | 					mynd_error_exit(error_message);
235 | 				}
236 | 				if (vwgt[i + l] < 0)
237 | 				{
238 | 					char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
239 | 					sprintf(error_message, "The weight vertex %"PRIDX" and constraint %"PRIDX" must be >= 0.", i + 1, l);
240 | 					mynd_error_exit(error_message);
241 | 				}
242 | 
243 | 				curstr = newstr;
244 | 			}
245 | 		}
246 | 
247 | 		while (1) 
248 | 		{
249 | 			edge = strtoll(curstr, &newstr, 10);
250 | 			if (newstr == curstr)
251 | 				break; /* End of line */
252 | 			curstr = newstr;
253 | 
254 | 			// jiusuo
255 | 			// edge++;
256 | 
257 | 			if (edge < 1 || edge > graph->nvtxs)
258 | 			{
259 | 				char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
260 | 				sprintf(error_message, "Edge %"PRIDX" for vertex %"PRIDX" is out of bounds.", edge, i + 1);
261 | 				mynd_error_exit(error_message);
262 | 			}
263 | 
264 | 			ewgt = 1;
265 | 			if (readew) 
266 | 			{
267 | 				ewgt = strtoll(curstr, &newstr, 10);
268 | 				if (newstr == curstr)
269 | 				{
270 | 					char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
271 | 					sprintf(error_message, "Premature end of line for vertex %"PRIDX".", i + 1);
272 | 					mynd_error_exit(error_message);
273 | 				}
274 | 				if (ewgt <= 0)
275 | 				{
276 | 					char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
277 | 					sprintf(error_message, "The weight (%"PRIDX") for edge (%"PRIDX", %"PRIDX") must be positive.", ewgt, i + 1, edge);
278 | 					mynd_error_exit(error_message);
279 | 				}
280 | 
281 | 				curstr = newstr;
282 | 			}
283 | 
284 | 			if (k == graph->nedges)
285 | 			{
286 | 				char *error_message = (char *)mynd_check_malloc(sizeof(char) * 128, "ReadGraph: error_message");
287 | 				sprintf(error_message, "There are more edges in the file than the %"PRIDX" specified.", graph->nedges / 2);
288 | 				mynd_error_exit(error_message);
289 | 			}
290 | 
291 | 			adjncy[k] = edge-1;
292 | 			adjwgt[k] = ewgt;
293 | 			k++;
294 | 		}
295 | 
296 |     	xadj[i + 1] = k;
297 | 	}
298 | 
299 | 	mynd_check_fclose(fpin);
300 | 
301 | 	if (k != graph->nedges) 
302 | 	{
303 | 		printf("------------------------------------------------------------------------------\n");
304 | 		printf("***  I detected an error in your input file  ***\n\n");
305 | 		printf("In the first line of the file, you specified that the graph contained\n"
306 | 			"%"PRIDX" edges. However, I only found %"PRIDX" edges in the file.\n", 
307 | 			graph->nedges / 2, k / 2);
308 | 		if (2 * k == graph->nedges) 
309 | 		{
310 | 			printf("\n *> I detected that you specified twice the number of edges that you have in\n");
311 | 			printf("    the file. Remember that the number of edges specified in the first line\n");
312 | 			printf("    counts each edge between vertices v and u only once.\n\n");
313 | 		}
314 | 		printf("Please specify the correct number of edges in the first line of the file.\n");
315 | 		printf("------------------------------------------------------------------------------\n");
316 | 		exit(0);
317 | 	}
318 | 
319 | 	// printf("lnlen=%zu sizeof(char) * lnlen=%zu\n",lnlen, sizeof(char) * lnlen);
320 | 
321 | 	mynd_check_free(line, sizeof(char) * lnlen, "ReadGraph: line");
322 | 	nvtxs[0]  = graph->nvtxs;
323 | 	nedges[0] = graph->nedges;
324 | 	
325 | 	// return graph;
326 | }
327 | 
328 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_searchtree.c:
--------------------------------------------------------------------------------
  1 | #ifndef SEARCHTREE_H
  2 | #define SEARCHTREE_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | //  Binary Search Tree Version 1.0
  7 | void mynd_binary_search_tree_Init(binary_search_tree_t *tree)
  8 | {
  9 |     tree->nownodes = 0;
 10 | 	tree->treenode = NULL;
 11 | }
 12 | 
 13 | binary_search_tree_t *mynd_binary_search_tree_Create()
 14 | {
 15 |     binary_search_tree_t *tree;
 16 | 
 17 |     tree = (binary_search_tree_t *)mynd_check_malloc(sizeof(binary_search_tree_t), "binary_search_tree_Create: tree");
 18 |     mynd_binary_search_tree_Init(tree);
 19 | 
 20 |     return tree;
 21 | }
 22 | 
 23 | void mynd_Free_Treenode(treenode_t *node)
 24 | {
 25 | 	if(node != NULL)
 26 | 	{
 27 | 		mynd_Free_Treenode(node->left);
 28 | 		mynd_Free_Treenode(node->right);
 29 | 		mynd_check_free(node, sizeof(treenode_t), "Free_Treenode: node");
 30 | 	}
 31 | }
 32 | 
 33 | void mynd_binary_search_tree_Free(binary_search_tree_t *tree)
 34 | {
 35 | 	if (tree == NULL) return;
 36 | 	mynd_Free_Treenode(tree->treenode);
 37 | 	// mynd_check_free(tree->locator);
 38 | 	tree->nownodes = 0;
 39 | }
 40 | 
 41 | void mynd_binary_search_tree_Destroy(binary_search_tree_t *tree)
 42 | {
 43 | 	if (tree == NULL) return;
 44 | 	mynd_binary_search_tree_Free(tree);
 45 | 	mynd_check_free(tree, sizeof(binary_search_tree_t), "binary_search_tree_Destroy: tree");
 46 | }
 47 | 
 48 | reordering_int_t mynd_binary_search_tree_Length(binary_search_tree_t *tree)
 49 | {
 50 | 	return tree->nownodes;
 51 | }
 52 | 
 53 | treenode_t *mynd_Create_TreeNode(reordering_int_t val, reordering_int_t key)
 54 | {
 55 | 	treenode_t *newnode = (treenode_t *)mynd_check_malloc(sizeof(treenode_t), "Create_TreeNode: newnode");
 56 |     
 57 | 	newnode->val = val;
 58 | 	newnode->key = key;
 59 |     newnode->left = newnode->right = NULL;
 60 |     
 61 | 	return newnode;
 62 | }
 63 | 
 64 | treenode_t *mynd_Insert_TreeNode(treenode_t *node, reordering_int_t val, reordering_int_t key, reordering_int_t *nownodes)
 65 | {
 66 | 	// if empty
 67 |     if (node == NULL) 
 68 | 	{
 69 | 		node = mynd_Create_TreeNode(val, key);
 70 | 		(*nownodes)++;
 71 | 		return node;
 72 | 	}
 73 | 
 74 |     // if less than
 75 |     if (val < node->val)
 76 |         node->left = mynd_Insert_TreeNode(node->left, val, key, nownodes);
 77 |     // if greater than
 78 |     else if (val > node->val)
 79 |         node->right = mynd_Insert_TreeNode(node->right, val, key, nownodes);
 80 | 	
 81 | 	//	if equal
 82 | 	else
 83 | 		node->key += key;
 84 | 
 85 |     return node;
 86 | }
 87 | 
 88 | void mynd_binary_search_tree_Insert(binary_search_tree_t *tree, reordering_int_t val, reordering_int_t key)
 89 | {
 90 | 	treenode_t *root = tree->treenode;
 91 | 	
 92 | 	root = mynd_Insert_TreeNode(root, val, key, &tree->nownodes);
 93 | 	tree->treenode = root;
 94 | 
 95 | 	return ;
 96 | }
 97 | 
 98 | reordering_int_t mynd_InorderTraversal_TreeNode(treenode_t *root, reordering_int_t *dst1, reordering_int_t *dst2, reordering_int_t *ptr) 
 99 | {
100 |     if (root != NULL) 
101 | 	{
102 |         *ptr = mynd_InorderTraversal_TreeNode(root->left,dst1,dst2,ptr);
103 | 		//	do operation
104 |         // printf("root->val=%"PRIDX" root->key=%"PRIDX" ", root->val,root->key);
105 | 		// if(dst != NULL) 
106 | 		// {
107 | 			dst1[*ptr] = root->val;
108 | 			dst2[*ptr] = root->key;
109 | 			// printf("root->val=%"PRIDX" dst[ptr]=%"PRIDX" ptr=%"PRIDX"\n",root->val, dst[*ptr], *ptr);
110 | 			(*ptr) ++;
111 | 		// }
112 | 
113 |         *ptr = mynd_InorderTraversal_TreeNode(root->right,dst1,dst2,ptr);
114 |     }
115 | 
116 | 	return *ptr;
117 | }
118 | 
119 | void mynd_binary_search_tree_Traversal(binary_search_tree_t *tree, reordering_int_t *dst1, reordering_int_t *dst2)
120 | {
121 | 	treenode_t *root = tree->treenode;
122 | 	reordering_int_t ptr = 0;
123 | 
124 | 	mynd_InorderTraversal_TreeNode(root, dst1, dst2, &ptr);
125 | }
126 | 
127 | 
128 | 
129 | //  Binary Search Tree Version 2.0
130 | void mynd_binary_search_tree_Init2(binary_search_tree2_t *tree, reordering_int_t size)
131 | {
132 |     tree->nownodes = 0;
133 | 	tree->maxnodes = size;
134 | 	tree->treenode = (treenode2_t *)mynd_check_malloc(sizeof(treenode2_t) * size, "binary_search_tree2_Init: tree->treenode");
135 | 
136 | 	for(reordering_int_t i = 0;i < size;i++)
137 | 	{
138 | 		tree->treenode[i].val = -1;
139 | 		tree->treenode[i].key = 0;
140 | 	}
141 | }
142 | 
143 | binary_search_tree2_t *mynd_binary_search_tree_Create2(reordering_int_t size)
144 | {
145 |     binary_search_tree2_t *tree;
146 | 
147 |     tree = (binary_search_tree2_t *)mynd_check_malloc(sizeof(binary_search_tree2_t), "binary_search_tree2_Create: tree");
148 |     
149 | 	mynd_binary_search_tree_Init2(tree, size);
150 | 
151 |     return tree;
152 | }
153 | 
154 | void mynd_exam_binary_search_tree2(binary_search_tree2_t *tree)
155 | {
156 | 	printf("val:");
157 | 	for(reordering_int_t i = 0;i < tree->maxnodes;i++)
158 | 	{
159 | 		printf("%"PRIDX" ",tree->treenode[i].val);
160 | 	}
161 | 	printf("\n");
162 | 	printf("key:");
163 | 	for(reordering_int_t i = 0;i < tree->maxnodes;i++)
164 | 	{
165 | 		printf("%"PRIDX" ",tree->treenode[i].key);
166 | 	}
167 | 	printf("\n");
168 | }
169 | 
170 | void mynd_exam_binary_search_tree2_flag(binary_search_tree2_t *tree)
171 | {
172 | 	reordering_int_t flag = 0;
173 | 	printf("val:");
174 | 	for(reordering_int_t i = 0;i < tree->maxnodes;i++)
175 | 	{
176 | 		printf("%"PRIDX" ",tree->treenode[i].val);
177 | 		if(tree->treenode[i].val != -1)
178 | 			flag = 1;
179 | 	}
180 | 	printf("\n");
181 | 	if(flag == 1)
182 | 		printf("flag=1\n");
183 | 	printf("key:");
184 | 	for(reordering_int_t i = 0;i < tree->maxnodes;i++)
185 | 	{
186 | 		printf("%"PRIDX" ",tree->treenode[i].key);
187 | 		if(tree->treenode[i].key != 0)
188 | 			flag = 2;
189 | 	}
190 | 	printf("\n");
191 | 	if(flag == 2)
192 | 		printf("flag=2\n");
193 | }
194 | 
195 | void mynd_binary_search_tree_Free2(binary_search_tree2_t *tree)
196 | {
197 | 	if (tree == NULL) return;
198 | 	mynd_check_free(tree->treenode, sizeof(treenode2_t) * tree->maxnodes, "binary_search_tree_Free2: tree->treenode");
199 | 	tree->nownodes = 0;
200 | 	tree->maxnodes = 0;
201 | }
202 | 
203 | void mynd_binary_search_tree_Destroy2(binary_search_tree2_t *tree)
204 | {
205 | 	if (tree == NULL) return;
206 | 	mynd_binary_search_tree_Free2(tree);
207 | 	mynd_check_free(tree, sizeof(binary_search_tree2_t), "binary_search_tree_Destroy2: tree");
208 | }
209 | 
210 | reordering_int_t mynd_binary_search_tree_Length2(binary_search_tree2_t *tree)
211 | {
212 | 	return tree->nownodes;
213 | }
214 | 
215 | void mynd_Insert_TreeNode2(binary_search_tree2_t *tree, reordering_int_t val, reordering_int_t key)
216 | {
217 | 	reordering_int_t ptr = 0;
218 | 	treenode2_t *treenode = tree->treenode;
219 | 
220 | 	while (treenode[ptr].val != -1) 
221 | 	{
222 | 		if(ptr >= tree->maxnodes)
223 | 		{
224 | 			printf("mynd_check_realloc\n");
225 | 			treenode = tree->treenode = (treenode2_t *)mynd_check_realloc(treenode, sizeof(treenode2_t) * tree->maxnodes * 2, sizeof(reordering_int_t) * tree->maxnodes, "Insert_TreeNode2: treenode");
226 | 			for(reordering_int_t i = tree->maxnodes;i < tree->maxnodes * 2;i++)
227 | 			{
228 | 				treenode[i].val = -1;
229 | 				treenode[i].key = 0;
230 | 			}
231 | 			tree->maxnodes *= 2;
232 | 		}
233 | 
234 |         if (treenode[ptr].val < val) 
235 |             ptr = 2 * ptr + 2;
236 |         else if (treenode[ptr].val > val) 
237 |             ptr = 2 * ptr + 1; 
238 | 		else if(treenode[ptr].val == val) 
239 | 		{
240 | 			treenode[ptr].key += key;
241 | 			printf("Update: val=%"PRIDX" key=%"PRIDX" ptr=%"PRIDX"\n",val,treenode[ptr].key,ptr);
242 |             return ;
243 |         }
244 |     }
245 | 
246 | 	printf("Insert: val=%"PRIDX" key=%"PRIDX" ptr=%"PRIDX"\n",val,key,ptr);
247 |     treenode[ptr].val = val;
248 |     treenode[ptr].key = key;
249 |     tree->nownodes++;
250 | 
251 |     return ;
252 | }
253 | 
254 | void mynd_binary_search_tree_Insert2(binary_search_tree2_t *tree, reordering_int_t val, reordering_int_t key)
255 | {
256 | 	mynd_Insert_TreeNode2(tree, val, key);
257 | 	printf("\n");
258 | 	return ;
259 | }
260 | 
261 | void mynd_InorderTraversal_TreeNode2(binary_search_tree2_t *tree, treenode2_t *treenode, reordering_int_t maxnodes, reordering_int_t *dst1, reordering_int_t *dst2, reordering_int_t located, reordering_int_t *ptr) 
262 | {
263 | 	printf("InorderTraversal_TreeNode2 1 located=%"PRIDX"\n",located);
264 | 	mynd_exam_binary_search_tree2(tree);
265 | 	if(treenode[located].val == -1)
266 | 		return;
267 | 
268 | 	if (2 * located + 1 < maxnodes)
269 | 		mynd_InorderTraversal_TreeNode2(tree, treenode, maxnodes, dst1, dst2, 2 * located + 1, ptr);
270 | 	printf("located=%"PRIDX" ptr=%"PRIDX" val=%"PRIDX" key=%"PRIDX"\n",located,*ptr,treenode[located].val,treenode[located].key);
271 | 	mynd_exam_binary_search_tree2(tree);
272 | 	dst1[*ptr] = treenode[located].val;
273 | 	printf("1\n");
274 | 	mynd_exam_binary_search_tree2(tree);
275 | 	dst2[*ptr] = treenode[located].key;
276 | 	printf("2\n");
277 | 	mynd_exam_binary_search_tree2(tree);
278 | 	(*ptr)++;
279 | 	printf("3\n");
280 | 	mynd_exam_binary_search_tree2(tree);
281 | 	if(2 * located + 2 < maxnodes)
282 | 		mynd_InorderTraversal_TreeNode2(tree, treenode, maxnodes, dst1, dst2, 2 * located + 2, ptr);
283 | }
284 | 
285 | void mynd_binary_search_tree_Traversal2(binary_search_tree2_t *tree, reordering_int_t *dst1, reordering_int_t *dst2)
286 | {
287 | 	treenode2_t *treenode = tree->treenode;
288 | 
289 | 	reordering_int_t ptr = 0;
290 | 
291 | 	mynd_InorderTraversal_TreeNode2(tree, treenode, tree->maxnodes, dst1, dst2, 0, &ptr);
292 | }
293 | 
294 | void mynd_Reset_TreeNode2(treenode2_t *treenode, reordering_int_t maxnodes, reordering_int_t located) 
295 | {
296 | 	if(treenode[located].val == -1)
297 | 		return;
298 | 	else
299 | 	{
300 | 		// if (2 * located + 1 < maxnodes)
301 | 		// 	Reset_TreeNode2(treenode, maxnodes, located * 2 + 1);
302 | 		// treenode[located].val = -1;
303 | 		// treenode[located].key = 0;
304 | 		// if (2 * located + 2 < maxnodes)
305 | 		// 	Reset_TreeNode2(treenode, maxnodes, located * 2 + 2);
306 | 		
307 |         if (2 * located + 1 < maxnodes)
308 |             mynd_Reset_TreeNode2(treenode, maxnodes, 2 * located + 1);
309 |         treenode[located].val = -1;
310 |         treenode[located].key = 0;
311 |         if (2 * located + 2 < maxnodes)
312 |             mynd_Reset_TreeNode2(treenode, maxnodes, 2 * located + 2);
313 | 	}
314 | }
315 | 
316 | void mynd_binary_search_tree_Reset2(binary_search_tree2_t *tree)
317 | {
318 | 	treenode2_t *treenode = tree->treenode;
319 | 
320 | 	mynd_Reset_TreeNode2(treenode, tree->maxnodes, 0);
321 | }
322 | 
323 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_splitgraph.c:
--------------------------------------------------------------------------------
  1 | #ifndef SPLITGRAPH_H
  2 | #define SPLITGRAPH_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | void mynd_SplitGraphReorder(graph_t *graph, graph_t **sub_lgraph, graph_t **sub_rgraph, reordering_int_t level)
  7 | {
  8 |     reordering_int_t nvtxs;
  9 |     reordering_int_t *xadj, *vwgt, *adjncy, *adjwgt, *label, *where;
 10 |     reordering_int_t subnvtxs[3], subnedges[3];
 11 |     reordering_int_t *subxadj[2], *subvwgt[2], *subadjncy[2], *subadjwgt[2], *sublabel[2];
 12 |     reordering_int_t *rename;
 13 |     graph_t *lgraph, *rgraph;
 14 | 
 15 |     nvtxs  = graph->nvtxs;
 16 |     xadj   = graph->xadj;
 17 |     vwgt   = graph->vwgt;
 18 |     adjncy = graph->adjncy;
 19 |     adjwgt = graph->adjwgt;
 20 |     label  = graph->label;
 21 |     where  = graph->where;
 22 | 
 23 |     subnvtxs[0]  = subnvtxs[1]  = subnvtxs[2]  = 0;
 24 |     subnedges[0] = subnedges[1] = subnedges[2] = 0;
 25 | 
 26 |     //cmap --> rename
 27 |     rename = graph->cmap;
 28 |     // rename = (reordering_int_t *)mynd_check_malloc(sizeof(reordering_int_t) * nvtxs, "SplitGraphoRerder: rename");
 29 | 
 30 |     // printf("SplitGraphoRerder 0\n");
 31 |     for(reordering_int_t i = 0;i < nvtxs;i++)
 32 |     {
 33 |         reordering_int_t partition = where[i];
 34 |         if(partition != 2)
 35 |         {
 36 |             rename[i] = subnvtxs[partition];
 37 |             subnvtxs[partition] ++;
 38 |             for(reordering_int_t j = xadj[i];j < xadj[i + 1];j++)
 39 |             {
 40 |                 reordering_int_t k = adjncy[j];
 41 |                 if(partition == where[k])
 42 |                     subnedges[partition]++;
 43 |             }
 44 |         }
 45 |     }
 46 |     // exam_num(rename, nvtxs);
 47 |     // printf("subnvtxs[0]=%d subnvtxs[1]=%d subnedges[0]=%d subnedges[1]=%d\n",subnvtxs[0],subnvtxs[1],subnedges[0],subnedges[1]);
 48 | 
 49 |     lgraph = mynd_SetupSplitGraph(lgraph, subnvtxs[0], subnedges[0]);
 50 |     subxadj[0]   = lgraph->xadj;
 51 |     subvwgt[0]   = lgraph->vwgt;
 52 |     subadjncy[0] = lgraph->adjncy;
 53 |     subadjwgt[0] = lgraph->adjwgt;
 54 |     sublabel[0]  = lgraph->label;
 55 | 
 56 |     rgraph = mynd_SetupSplitGraph(rgraph, subnvtxs[1], subnedges[1]);
 57 |     subxadj[1]   = rgraph->xadj;
 58 |     subvwgt[1]   = rgraph->vwgt;
 59 |     subadjncy[1] = rgraph->adjncy;
 60 |     subadjwgt[1] = rgraph->adjwgt;
 61 |     sublabel[1]  = rgraph->label;
 62 | 
 63 |     // printf("SplitGraphoRerder 1\n");
 64 |     subnvtxs[0]  = subnvtxs[1]  = subnvtxs[2]  = 0;
 65 |     subnedges[0] = subnedges[1] = subnedges[2] = 0;
 66 |     subxadj[0][0] = subxadj[1][0] = 0;
 67 |     for(reordering_int_t i = 0;i < nvtxs;i++)
 68 |     {
 69 |         reordering_int_t partition = where[i];
 70 | 
 71 |         if(partition == 2)
 72 |             continue;
 73 |         
 74 |         reordering_int_t numedge = 0;
 75 |         reordering_int_t map = subnvtxs[partition];
 76 |         reordering_int_t ptr = subnedges[partition];
 77 |         subxadj[partition][map + 1] = subxadj[partition][map];
 78 |         // printf("subxadj[partition][map + 1]=%d\n", subxadj[partition][map + 1]);
 79 |         for(reordering_int_t j = xadj[i];j < xadj[i + 1];j++)
 80 |         {
 81 |             reordering_int_t k = adjncy[j];
 82 |             if(where[k] == partition)
 83 |             {
 84 |                 subadjncy[partition][ptr] = rename[k];
 85 |                 subadjwgt[partition][ptr] = adjwgt[j];
 86 |                 // printf("k=%d rename[k]=%d subadjncy[partition][ptr]=%d subadjwgt[partition][ptr]=%d ptr=%d numedge=%d map=%d\n",
 87 |                 //     k,rename[k],subadjncy[partition][ptr],subadjwgt[partition][ptr],ptr, numedge,map);
 88 |                 ptr++;
 89 |                 numedge++;
 90 |             }
 91 |         }
 92 |         // printf("numedge=%d map=%d\n",numedge,map);
 93 |         subxadj[partition][map + 1] += numedge;
 94 |         subvwgt[partition][map] = vwgt[i];
 95 |         sublabel[partition][map] = label[i];
 96 |         subnvtxs[partition]++;
 97 |         subnedges[partition] = ptr;
 98 |         // printf("subxadj[partition][map + 1]=%d subvwgt[partition][map]=%d sublabel[partition][map]=%d subnvtxs[partition]=%d subnedges[partition]=%d\n",
 99 |         //     subxadj[partition][map + 1],subvwgt[partition][map],sublabel[partition][map],subnvtxs[partition],subnedges[partition]);
100 |     }
101 | 
102 |     // printf("SplitGraphoRerder 2\n");
103 |     // printf("subnvtxs[0]=%d subnvtxs[1]=%d subnedges[0]=%d subnedges[1]=%d\n",subnvtxs[0],subnvtxs[1],subnedges[0],subnedges[1]);
104 |     lgraph->nvtxs  = subnvtxs[0];
105 |     lgraph->nedges = subnedges[0];
106 |     rgraph->nvtxs  = subnvtxs[1];
107 |     rgraph->nedges = subnedges[1];
108 | 
109 |     mynd_SetupGraph_tvwgt(lgraph);
110 |     mynd_SetupGraph_tvwgt(rgraph);
111 | 
112 |     *sub_lgraph = lgraph;
113 |     *sub_rgraph = rgraph;
114 | 
115 |     // mynd_check_free(rename,"SplitGraphReorder: rename");
116 | 
117 |     // printf("lgraph:\n");
118 |     // exam_nvtxs_nedges(lgraph);
119 |     // exam_xadj(lgraph);
120 |     // exam_vwgt(lgraph);
121 |     // exam_adjncy_adjwgt(lgraph);
122 |     // exam_label(lgraph);
123 | 
124 |     // printf("rgraph:\n");
125 |     // exam_nvtxs_nedges(rgraph);
126 |     // exam_xadj(rgraph);
127 |     // exam_vwgt(rgraph);
128 |     // exam_adjncy_adjwgt(rgraph);
129 |     // exam_label(rgraph);
130 | }
131 | 
132 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_timer.c:
--------------------------------------------------------------------------------
  1 | #ifndef TIMER_H
  2 | #define TIMER_H
  3 | 
  4 | #include "mynd_functionset.h"
  5 | 
  6 | /*reordering_real_t time_all = 0;
  7 | struct timeval start_all;
  8 | struct timeval end_all;
  9 | 
 10 | reordering_real_t time_nestedbisection = 0;
 11 | struct timeval start_nestedbisection;
 12 | struct timeval end_nestedbisection;
 13 | 
 14 | reordering_real_t time_bisectionbest = 0;
 15 | struct timeval start_bisectionbest;
 16 | struct timeval end_bisectionbest;
 17 | 
 18 | reordering_real_t time_coarsen = 0;
 19 | struct timeval start_coarsen;
 20 | struct timeval end_coarsen;
 21 | 
 22 | reordering_real_t time_reorderbisection = 0;
 23 | struct timeval start_reorderbisection;
 24 | struct timeval end_reorderbisection;
 25 | 
 26 | reordering_real_t time_refine2waynode = 0;
 27 | struct timeval start_refine2waynode;
 28 | struct timeval end_refine2waynode;
 29 | 
 30 | reordering_real_t time_splitgraphreorder = 0;
 31 | struct timeval start_splitgraphreorder;
 32 | struct timeval end_splitgraphreorder;
 33 | 
 34 | reordering_real_t time_match = 0;
 35 | struct timeval start_match;
 36 | struct timeval end_match;
 37 | 
 38 | reordering_real_t time_createcoarsengraph = 0;
 39 | struct timeval start_createcoarsengraph;
 40 | struct timeval end_createcoarsengraph;
 41 | 
 42 | reordering_real_t time_partitioninf2way = 0;
 43 | struct timeval start_partitioninf2way;
 44 | struct timeval end_partitioninf2way;
 45 | 
 46 | reordering_real_t time_fm2waycutbalance = 0;
 47 | struct timeval start_fm2waycutbalance;
 48 | struct timeval end_fm2waycutbalance;
 49 | 
 50 | reordering_real_t time_fm2waycutrefine = 0;
 51 | struct timeval start_fm2waycutrefine;
 52 | struct timeval end_fm2waycutrefine;
 53 | 
 54 | reordering_real_t time_reorderinf2way = 0;
 55 | struct timeval start_reorderinf2way;
 56 | struct timeval end_reorderinf2way;
 57 | 
 58 | reordering_real_t time_fmnodebalance = 0;
 59 | struct timeval start_fmnodebalance;
 60 | struct timeval end_fmnodebalance;
 61 | 
 62 | reordering_real_t time_fm1sidenoderefine = 0;
 63 | struct timeval start_fm1sidenoderefine;
 64 | struct timeval end_fm1sidenoderefine;
 65 | 
 66 | reordering_real_t time_fm2sidenoderefine = 0;
 67 | struct timeval start_fm2sidenoderefine;
 68 | struct timeval end_fm2sidenoderefine;
 69 | 
 70 | reordering_real_t time_malloc = 0;
 71 | struct timeval start_malloc;
 72 | struct timeval end_malloc;
 73 | 
 74 | reordering_real_t time_free = 0;
 75 | struct timeval start_free;
 76 | struct timeval end_free;*/
 77 | 
 78 | void mynd_Timer_Init()
 79 | {
 80 | 	time_all = 0;
 81 | 	time_nestedbisection = 0;
 82 | 	time_bisectionbest = 0;
 83 | 	time_coarsen = 0;
 84 | 	time_reorderbisection = 0;
 85 | 	time_refine2waynode = 0;
 86 | 	time_splitgraphreorder = 0;
 87 | 	time_match = 0;
 88 | 	time_createcoarsengraph = 0;
 89 | 	time_partitioninf2way = 0;
 90 | 	time_fm2waycutbalance = 0;
 91 | 	time_fm2waycutrefine = 0;
 92 | 	time_reorderinf2way = 0;
 93 | 	time_fmnodebalance = 0;
 94 | 	time_fm1sidenoderefine = 0;
 95 | 	time_fm2sidenoderefine = 0;
 96 | 	time_malloc = 0;
 97 | 	time_free = 0;
 98 | }
 99 | 
100 | void mynd_gettimebegin(struct timeval *start, struct timeval *end, reordering_real_t *time)
101 | {
102 | 	gettimeofday(start,NULL);
103 | }
104 | 
105 | void mynd_gettimeend(struct timeval *start, struct timeval *end, reordering_real_t *time)
106 | {
107 | 	gettimeofday(end,NULL);
108 | 	time[0] += (end[0].tv_sec - start[0].tv_sec) * 1000 + (end[0].tv_usec - start[0].tv_usec) / 1000.0;
109 | }
110 | 
111 | void mynd_PrintTimeGeneral()
112 | {
113 | 	printf("\nTiming Information -------------------------------------------------\n");
114 | 	printf("All:                 %10.3"PRREAL" ms\n", time_all);
115 | 	printf("-------------------------------------------------------------------\n");
116 | }
117 | 
118 | void mynd_PrintTimePhases()
119 | {
120 | 	printf("\nTiming Information -------------------------------------------------\n");
121 | 	printf("All:                 %10.3"PRREAL" ms\n", time_all);
122 | 	printf("    Nested Bisection:      %10.3"PRREAL" ms\n", time_nestedbisection);
123 | 	printf("        Bisection-Best:          %10.3"PRREAL" ms\n", time_bisectionbest);
124 | 	printf("            Coarsen:                   %10.3"PRREAL" ms\n", time_coarsen);
125 | 	printf("            Reorder Bisection:         %10.3"PRREAL" ms\n", time_reorderbisection);
126 | 	printf("            Refine 2way-Node:          %10.3"PRREAL" ms\n", time_refine2waynode);
127 | 	printf("        Reorder Split Graph:     %10.3"PRREAL" ms\n", time_splitgraphreorder);
128 | 	printf("-------------------------------------------------------------------\n");
129 | 
130 | }
131 | 
132 | void mynd_PrintTimeSteps()
133 | {
134 | 	printf("\nTiming Information -------------------------------------------------\n");
135 | 	printf("All:                 %10.3"PRREAL" ms\n", time_all);
136 | 	printf("    Nested Bisection:      %10.3"PRREAL" ms\n", time_nestedbisection);
137 | 	printf("        Bisection-Best:          %10.3"PRREAL" ms\n", time_bisectionbest);
138 | 	printf("            Coarsen:                   %10.3"PRREAL" ms\n", time_coarsen);
139 | 	printf("                Matching:                    %10.3"PRREAL" ms\n", time_match);
140 | 	printf("                Create Coarsen Graph:        %10.3"PRREAL" ms\n", time_createcoarsengraph);
141 | 	printf("            Reorder Bisection:         %10.3"PRREAL" ms\n", time_reorderbisection);
142 | 	printf("                Compute Partition Inf 2way:  %10.3"PRREAL" ms\n", time_partitioninf2way);
143 | 	printf("                FM 2way-Cut Balance:         %10.3"PRREAL" ms\n", time_fm2waycutbalance);
144 | 	printf("                FM 2way-Cut Refine:          %10.3"PRREAL" ms\n", time_fm2waycutrefine);
145 | 	printf("            Refine 2way-Node:          %10.3"PRREAL" ms\n", time_refine2waynode);
146 | 	printf("                Compute Reorder Inf 2way:    %10.3"PRREAL" ms\n", time_reorderinf2way);
147 | 	printf("                FM 2way-Node Balance:        %10.3"PRREAL" ms\n", time_fmnodebalance);
148 | 	printf("                FM 1Side-Node Refine:        %10.3"PRREAL" ms\n", time_fm1sidenoderefine);
149 | 	printf("                FM 2Side-Node Refine:        %10.3"PRREAL" ms\n", time_fm2sidenoderefine);
150 | 	printf("        Reorder Split Graph:     %10.3"PRREAL" ms\n", time_splitgraphreorder);
151 | 	printf("-------------------------------------------------------------------\n");
152 | 
153 | }
154 | 
155 | void mynd_PrintTime(reordering_int_t control)
156 | {
157 | 	//	001
158 | 	if(control & PRINTTIMESTEPS) 
159 | 		mynd_PrintTimeSteps();
160 | 	//	010
161 | 	else if(control & PRINTTIMEPHASES) 
162 | 		mynd_PrintTimePhases();
163 | 	//	100
164 | 	else if(control & PRINTTIMEGENERAL) 
165 | 		mynd_PrintTimeGeneral();
166 | }
167 | 
168 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/mynd_variables.c:
--------------------------------------------------------------------------------
 1 | #ifndef VARIABLES_H
 2 | #define VARIABLES_H
 3 | 
 4 | #include "mynd_functionset.h"
 5 | 
 6 | reordering_int_t control;
 7 | 
 8 | reordering_real_t time_all;
 9 | struct timeval start_all;
10 | struct timeval end_all;
11 | 
12 | reordering_real_t time_nestedbisection;
13 | struct timeval start_nestedbisection;
14 | struct timeval end_nestedbisection;
15 | 
16 | reordering_real_t time_bisectionbest;
17 | struct timeval start_bisectionbest;
18 | struct timeval end_bisectionbest;
19 | 
20 | reordering_real_t time_coarsen;
21 | struct timeval start_coarsen;
22 | struct timeval end_coarsen;
23 | 
24 | reordering_real_t time_reorderbisection;
25 | struct timeval start_reorderbisection;
26 | struct timeval end_reorderbisection;
27 | 
28 | reordering_real_t time_refine2waynode;
29 | struct timeval start_refine2waynode;
30 | struct timeval end_refine2waynode;
31 | 
32 | reordering_real_t time_splitgraphreorder;
33 | struct timeval start_splitgraphreorder;
34 | struct timeval end_splitgraphreorder;
35 | 
36 | reordering_real_t time_match;
37 | struct timeval start_match;
38 | struct timeval end_match;
39 | 
40 | reordering_real_t time_createcoarsengraph;
41 | struct timeval start_createcoarsengraph;
42 | struct timeval end_createcoarsengraph;
43 | 
44 | reordering_real_t time_partitioninf2way;
45 | struct timeval start_partitioninf2way;
46 | struct timeval end_partitioninf2way;
47 | 
48 | reordering_real_t time_fm2waycutbalance;
49 | struct timeval start_fm2waycutbalance;
50 | struct timeval end_fm2waycutbalance;
51 | 
52 | reordering_real_t time_fm2waycutrefine;
53 | struct timeval start_fm2waycutrefine;
54 | struct timeval end_fm2waycutrefine;
55 | 
56 | reordering_real_t time_reorderinf2way;
57 | struct timeval start_reorderinf2way;
58 | struct timeval end_reorderinf2way;
59 | 
60 | reordering_real_t time_fmnodebalance;
61 | struct timeval start_fmnodebalance;
62 | struct timeval end_fmnodebalance;
63 | 
64 | reordering_real_t time_fm1sidenoderefine;
65 | struct timeval start_fm1sidenoderefine;
66 | struct timeval end_fm1sidenoderefine;
67 | 
68 | reordering_real_t time_fm2sidenoderefine;
69 | struct timeval start_fm2sidenoderefine;
70 | struct timeval end_fm2sidenoderefine;
71 | 
72 | reordering_real_t time_malloc;
73 | struct timeval start_malloc;
74 | struct timeval end_malloc;
75 | 
76 | reordering_real_t time_free;
77 | struct timeval start_free;
78 | struct timeval end_free;
79 | 
80 | #endif


--------------------------------------------------------------------------------
/reordering_omp/src/typedef.h:
--------------------------------------------------------------------------------
  1 | #ifndef MYND_OMP_H
  2 | #define MYND_OMP_H
  3 | 
  4 | #include "../include/mynd_types.h"
  5 | 
  6 | /*--------------------------------------------------------------------------
  7 |  Specifies the width of the elementary data type that will hold information
  8 |  about vertices and their adjacency lists.
  9 | 
 10 |  Possible values:
 11 |    32 : Use 32 bit signed integers
 12 |    64 : Use 64 bit signed integers
 13 | 
 14 |  A width of 64 should be specified if the number of vertices or the total
 15 |  number of edges in the graph exceed the limits of a 32 bit signed integer
 16 |  i.e., 2^31-1.
 17 |  Proper use of 64 bit integers requires that the c99 standard datatypes
 18 |  int32_t and int64_t are supported by the compiler.
 19 |  GCC does provides these definitions in stdint.h, but it may require some
 20 |  modifications on other architectures.
 21 | --------------------------------------------------------------------------*/
 22 | // #define IDXTYPEWIDTH 64
 23 | 
 24 | 
 25 | /*--------------------------------------------------------------------------
 26 |  Specifies the data type that will hold floating-point style information.
 27 | 
 28 |  Possible values:
 29 |    32 : single precission floating point (float)
 30 |    64 : double precission floating point (double)
 31 | --------------------------------------------------------------------------*/
 32 | // #define REALTYPEWIDTH 64
 33 | 
 34 | /****************************************************************************
 35 | * In principle, nothing needs to be changed beyond this point, unless the
 36 | * int32_t and int64_t cannot be found in the normal places.
 37 | *****************************************************************************/
 38 | 
 39 | /* Uniform definitions for various compilers */
 40 | #if defined(_MSC_VER)
 41 |   #define COMPILER_MSC
 42 | #endif
 43 | #if defined(__ICC)
 44 |   #define COMPILER_ICC
 45 | #endif
 46 | #if defined(__GNUC__)
 47 |   #define COMPILER_GCC
 48 | #endif
 49 | 
 50 | /* Include c99 int definitions and need constants. When building the library,
 51 |  * these are already defined by GKlib; hence the test for _GKLIB_H_ */
 52 | #ifndef _GKLIB_H_
 53 | #ifdef COMPILER_MSC
 54 | #include <limits.h>
 55 | 
 56 | typedef __int32 int32_t;
 57 | typedef __int64 int64_t;
 58 | #define PRId32       "I32d"
 59 | #define PRId64       "I64d"
 60 | #define SCNd32       "ld"
 61 | #define SCNd64       "I64d"
 62 | #define INT32_MIN    ((int32_t)_I32_MIN)
 63 | #define INT32_MAX    _I32_MAX
 64 | #define INT64_MIN    ((int64_t)_I64_MIN)
 65 | #define INT64_MAX    _I64_MAX
 66 | #else
 67 | #include <inttypes.h>
 68 | #endif
 69 | #endif
 70 | 
 71 | 
 72 | /*------------------------------------------------------------------------
 73 | * Setup the basic datatypes
 74 | *-------------------------------------------------------------------------*/
 75 | #if IDXTYPEWIDTH == 32
 76 |   typedef int32_t reordering_int_t;
 77 | 
 78 |   #define IDX_MAX   INT32_MAX
 79 |   #define IDX_MIN   INT32_MIN
 80 | 
 81 |   #define SCIDX  SCNd32
 82 |   #define PRIDX  PRId32
 83 | 
 84 |   #define strtoidx      strtol
 85 |   #define lyj_abs          abs
 86 | #elif IDXTYPEWIDTH == 64
 87 |   typedef int64_t reordering_int_t;
 88 | 
 89 |   #define IDX_MAX   INT64_MAX
 90 |   #define IDX_MIN   INT64_MIN
 91 | 
 92 |   #define SCIDX  SCNd64
 93 |   #define PRIDX  PRId64
 94 | 
 95 | #ifdef COMPILER_MSC
 96 |   #define strtoidx      _strtoi64
 97 | #else
 98 |   #define strtoidx      strtoll
 99 | #endif
100 |   #define lyj_abs          labs
101 | #else
102 |   #error "Incorrect user-supplied value fo IDXTYPEWIDTH"
103 | #endif
104 | 
105 | 
106 | #if REALTYPEWIDTH == 32
107 |   typedef float reordering_real_t;
108 | 
109 |   #define SCREAL         "f"
110 |   #define PRREAL         "f"
111 |   #define REAL_MAX       FLT_MAX
112 |   #define REAL_MIN       FLT_MIN
113 |   #define REAL_EPSILON   FLT_EPSILON
114 | 
115 |   #define rabs          fabsf
116 |   #define REALEQ(x,y) ((rabs((x)-(y)) <= FLT_EPSILON))
117 | 
118 | #ifdef COMPILER_MSC
119 |   #define strtoreal     (float)strtod
120 | #else
121 |   #define strtoreal     strtof
122 | #endif
123 | #elif REALTYPEWIDTH == 64
124 |   typedef double reordering_real_t;
125 | 
126 |   #define SCREAL         "lf"
127 |   #define PRREAL         "lf"
128 |   #define REAL_MAX       DBL_MAX
129 |   #define REAL_MIN       DBL_MIN
130 |   #define REAL_EPSILON   DBL_EPSILON
131 | 
132 |   #define rabs          fabs
133 |   #define REALEQ(x,y) ((rabs((x)-(y)) <= DBL_EPSILON))
134 | 
135 |   #define strtoreal     strtod
136 | #else
137 |   #error "Incorrect user-supplied value for REALTYPEWIDTH"
138 | #endif
139 | 
140 | #endif


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
 1 | include ../make.inc
 2 | all:pangulu_host
 3 | 
 4 | src:=$(wildcard *.c)
 5 | pangulu_host : pangulu_platforms $(src:.c=.o) 
 6 | 
 7 | %.o:%.c
 8 | 	$(MPICC) $(MPICCFLAGS) $(METIS_INC) $(REORDERING_INC) $(PANGULU_FLAGS) -c $< -o $@ -fPIC
 9 | 	mv $@ ../lib
10 | 
11 | pangulu_platforms:
12 | 	cd .. && python3 build_helper.py generate_platform_helper
13 | 	cd .. && python3 build_helper.py compile_platform_code
14 | 
15 | clean:
16 | 	-(rm -f ../lib/pangulu*.o)
17 | 	-(rm -f ./pangulu*.o)
18 | 


--------------------------------------------------------------------------------
/src/pangulu.c:
--------------------------------------------------------------------------------
  1 | #include "pangulu_common.h"
  2 | 
  3 | #ifdef PANGULU_PERF
  4 | pangulu_stat_t global_stat;
  5 | #endif
  6 | 
  7 | int pangulu_gpu_kernel_warp_per_block;
  8 | int pangulu_gpu_data_move_warp_per_block;
  9 | int pangulu_gpu_shared_mem_size;
 10 | 
 11 | void pangulu_init(
 12 |     pangulu_exblock_idx pangulu_n, 
 13 |     pangulu_exblock_ptr pangulu_nnz, 
 14 |     pangulu_exblock_ptr *csc_colptr, 
 15 |     pangulu_exblock_idx *csc_rowidx, 
 16 |     calculate_type *csc_value, 
 17 |     pangulu_init_options *init_options, 
 18 |     void **pangulu_handle)
 19 | {
 20 |     pangulu_int32_t size, rank;
 21 |     pangulu_cm_rank(&rank);
 22 |     pangulu_cm_size(&size);
 23 |     pangulu_common *common = (pangulu_common *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_common));
 24 |     common->rank = rank;
 25 |     common->size = size;
 26 |     common->n = pangulu_n;
 27 | 
 28 |     if((!init_options->is_complex_matrix) != (sizeof(calculate_type) == sizeof(calculate_real_type))){
 29 |         if(rank == 0)printf(PANGULU_E_COMPLEX_MISMATCH);
 30 |         exit(1);
 31 |     }
 32 | 
 33 |     if(init_options->sizeof_value != sizeof(calculate_type)){
 34 |         if(rank == 0)printf(PANGULU_E_ELEM_SIZE_MISMATCH);
 35 |         exit(1);
 36 |     }
 37 | 
 38 |     if (rank == 0)
 39 |     {   
 40 |         if (init_options == NULL)
 41 |         {
 42 |             printf(PANGULU_E_OPTION_IS_NULLPTR);
 43 |             exit(1);
 44 |         }
 45 |         if (init_options->nb == 0)
 46 |         {
 47 |             printf(PANGULU_E_NB_IS_ZERO);
 48 |             exit(1);
 49 |         }
 50 |     }
 51 | 
 52 |     if(init_options->nb <= 0){
 53 |         common->nb = 256;
 54 |     }else{
 55 |         common->nb = init_options->nb;
 56 |     }
 57 | 
 58 |     common->sum_rank_size = size;
 59 |     
 60 |     if(init_options->nthread == 0){
 61 |         common->omp_thread = 1;
 62 |     }else{
 63 |         common->omp_thread = init_options->nthread;
 64 |     }
 65 | 
 66 |     common->basic_param = init_options->mpi_recv_buffer_level;
 67 | 
 68 |     if(init_options->reordering_nthread == 0){
 69 |         common->reordering_nthread = 4;
 70 |     }else{
 71 |         common->reordering_nthread = init_options->reordering_nthread;
 72 |     }
 73 | 
 74 |     if(init_options->gpu_data_move_warp_per_block == 0){
 75 |         pangulu_gpu_data_move_warp_per_block = 4;
 76 |     }else{
 77 |         pangulu_gpu_data_move_warp_per_block = init_options->gpu_data_move_warp_per_block;
 78 |     }
 79 | 
 80 |     if(init_options->gpu_kernel_warp_per_block == 0){
 81 |         pangulu_gpu_kernel_warp_per_block = 4;
 82 |     }else{
 83 |         pangulu_gpu_kernel_warp_per_block = init_options->gpu_kernel_warp_per_block;
 84 |     }
 85 |     
 86 |     pangulu_cm_bcast(&common->n, 1, MPI_PANGULU_EXBLOCK_IDX, 0);
 87 |     pangulu_cm_bcast(&common->nb, 1, MPI_PANGULU_INBLOCK_IDX, 0);
 88 | 
 89 |     pangulu_int64_t tmp_p = sqrt(common->sum_rank_size);
 90 |     while (((common->sum_rank_size) % tmp_p) != 0)
 91 |     {
 92 |         tmp_p--;
 93 |     }
 94 | 
 95 |     common->p = tmp_p;
 96 |     common->q = common->sum_rank_size / tmp_p;
 97 |     pangulu_origin_smatrix *origin_smatrix = (pangulu_origin_smatrix *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_origin_smatrix));
 98 |     pangulu_init_pangulu_origin_smatrix(origin_smatrix);
 99 | 
100 |     if (rank == 0)
101 |     {
102 |         origin_smatrix->row = pangulu_n;
103 |         origin_smatrix->column = pangulu_n;
104 |         origin_smatrix->columnpointer = csc_colptr;
105 |         origin_smatrix->rowindex = csc_rowidx;
106 |         origin_smatrix->nnz = pangulu_nnz;
107 |         origin_smatrix->value_csc = csc_value;
108 |         if (origin_smatrix->row == 0)
109 |         {
110 |             printf(PANGULU_E_ROW_IS_ZERO);
111 |             exit(1);
112 |         }
113 |     }
114 | 
115 |     pangulu_int32_t p = common->p;
116 |     pangulu_int32_t q = common->q;
117 |     pangulu_int32_t nb = common->nb;
118 |     pangulu_cm_sync();
119 |     pangulu_cm_bcast(&origin_smatrix->row, 1, MPI_PANGULU_INT64_T, 0);
120 |     common->n = origin_smatrix->row;
121 |     pangulu_int64_t n = common->n;
122 |     omp_set_num_threads(init_options->nthread);
123 | #if defined(OPENBLAS_CONFIG_H) || defined(OPENBLAS_VERSION)
124 |     openblas_set_num_threads(1);
125 | #endif
126 | 
127 |     if (rank == 0)
128 |     {
129 |         printf(PANGULU_I_BASIC_INFO);
130 |     }
131 | 
132 | #ifdef GPU_OPEN
133 |     int device_num;
134 |     pangulu_platform_get_device_num(&device_num, PANGULU_DEFAULT_PLATFORM);
135 |     pangulu_platform_set_default_device(rank%device_num, PANGULU_DEFAULT_PLATFORM);
136 | #endif
137 | 
138 |     pangulu_block_smatrix *block_smatrix = (pangulu_block_smatrix *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_block_smatrix));
139 |     pangulu_init_pangulu_block_smatrix(block_smatrix);
140 |     pangulu_block_common *block_common = (pangulu_block_common *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_block_common));
141 |     block_common->rank = rank;
142 |     block_common->p = p;
143 |     block_common->q = q;
144 |     block_common->nb = nb;
145 |     block_common->n = n;
146 |     block_common->block_length = PANGULU_ICEIL(n, nb);
147 |     block_common->sum_rank_size = common->sum_rank_size;
148 |     pangulu_aggregate_init();
149 | 
150 |     pangulu_origin_smatrix *reorder_matrix = (pangulu_origin_smatrix *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_origin_smatrix));
151 |     pangulu_init_pangulu_origin_smatrix(reorder_matrix);
152 | 
153 |     struct timeval time_start;
154 |     double elapsed_time;
155 |     
156 |     pangulu_cm_sync();
157 |     pangulu_time_start(&time_start);
158 |     pangulu_reordering(
159 |         block_smatrix,
160 |         origin_smatrix,
161 |         reorder_matrix,
162 |         common->reordering_nthread
163 |     );
164 |     pangulu_cm_sync();
165 |     elapsed_time = pangulu_time_stop(&time_start);
166 |     if (rank == 0){printf(PANGULU_I_TIME_REORDER);}
167 | 
168 | #ifdef PANGULU_PERF
169 |     if(rank == 0){
170 |         block_smatrix->A_rowsum_reordered = pangulu_malloc(__FILE__, __LINE__, sizeof(calculate_type) * n);
171 |         memset(block_smatrix->A_rowsum_reordered, 0, sizeof(calculate_type) * n);
172 |         for(pangulu_exblock_idx col = 0; col < n; col++){
173 |             for(pangulu_exblock_ptr idx = reorder_matrix->columnpointer[col]; idx < reorder_matrix->columnpointer[col+1]; idx++){
174 |                 block_smatrix->A_rowsum_reordered[reorder_matrix->rowindex[idx]]+=reorder_matrix->value_csc[idx];
175 |             }
176 |         }
177 |     }
178 |     pangulu_cm_sync();
179 | #endif
180 | 
181 |     pangulu_time_start(&time_start);
182 |     if (rank == 0)
183 |     {
184 |         pangulu_symbolic(block_common,
185 |                          block_smatrix,
186 |                          reorder_matrix);
187 |     }
188 |     pangulu_cm_sync();
189 |     elapsed_time = pangulu_time_stop(&time_start);
190 |     if (rank == 0){printf(PANGULU_I_TIME_SYMBOLIC);}
191 | 
192 |     pangulu_cm_sync();
193 |     pangulu_time_start(&time_start);
194 |     pangulu_preprocessing(
195 |         common,
196 |         block_common,
197 |         block_smatrix,
198 |         reorder_matrix,
199 |         init_options->nthread);
200 |     pangulu_cm_sync();
201 |     elapsed_time = pangulu_time_stop(&time_start);
202 |     if (rank == 0){printf(PANGULU_I_TIME_PRE);}
203 | 
204 |     pangulu_free(__FILE__, __LINE__, origin_smatrix);
205 |     origin_smatrix = NULL;
206 |     pangulu_free(__FILE__, __LINE__, reorder_matrix);
207 |     reorder_matrix = NULL;
208 | 
209 |     pangulu_cm_sync();
210 | 
211 |     (*pangulu_handle) = pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_handle_t));
212 |     (*(pangulu_handle_t **)pangulu_handle)->block_common = block_common;
213 |     (*(pangulu_handle_t **)pangulu_handle)->block_smatrix = block_smatrix;
214 |     (*(pangulu_handle_t **)pangulu_handle)->commmon = common;
215 | }
216 | 
217 | void pangulu_gstrf(pangulu_gstrf_options *gstrf_options, void **pangulu_handle)
218 | {
219 |     pangulu_block_common *block_common = (*(pangulu_handle_t **)pangulu_handle)->block_common;
220 |     pangulu_block_smatrix *block_smatrix = (*(pangulu_handle_t **)pangulu_handle)->block_smatrix;
221 |     pangulu_common *common = (*(pangulu_handle_t **)pangulu_handle)->commmon;
222 |     pangulu_int32_t size, rank;
223 |     pangulu_cm_rank(&rank);
224 |     pangulu_cm_size(&size);
225 |     if (rank == 0)
226 |     {
227 | #ifdef PANGULU_PERF
228 |         printf(PANGULU_W_PERF_MODE_ON);
229 | #endif
230 |         if (gstrf_options == NULL)
231 |         {
232 |             printf(PANGULU_E_GSTRF_OPTION_IS_NULLPTR);
233 |             exit(1);
234 |         }
235 |     }
236 | 
237 |     struct timeval time_start;
238 |     double elapsed_time;
239 | 
240 |     pangulu_cm_sync();
241 |     pangulu_time_start(&time_start);
242 |     pangulu_numeric(common,
243 |                     block_common,
244 |                     block_smatrix);
245 |     pangulu_cm_sync();
246 |     elapsed_time = pangulu_time_stop(&time_start);
247 | #ifdef PANGULU_PERF
248 |     long long flop_recvbuf;
249 |     MPI_Reduce(&global_stat.flop, &flop_recvbuf, 1, MPI_LONG_LONG, MPI_SUM, 0, MPI_COMM_WORLD);
250 |     global_stat.flop = flop_recvbuf;
251 | #endif
252 |     if (rank == 0){printf(PANGULU_I_TIME_NUMERICAL);}
253 | 
254 | #ifdef PANGULU_PERF
255 |     if(rank == 0){
256 |         printf(PANGULU_I_PERF_TABLE_HEAD);
257 |         fflush(stdout);
258 |     }
259 |     pangulu_cm_sync();
260 |     for(int i = 0; i < size; i++){
261 |         if(rank == i){
262 |             printf(PANGULU_I_PERF_TABLE_ROW);
263 |             fflush(stdout);
264 |         }
265 |         pangulu_cm_sync();
266 |         usleep(10);
267 |     }
268 | #endif
269 |     pangulu_cm_sync();
270 |     pangulu_log_memory_usage();
271 | #ifdef PANGULU_PERF
272 |     pangulu_numeric_check(common, block_common, block_smatrix);
273 |     pangulu_cm_sync();
274 | #endif
275 | }
276 | 
277 | void pangulu_gstrs(calculate_type *rhs, pangulu_gstrs_options *gstrs_options, void **pangulu_handle)
278 | {
279 |     pangulu_block_common *block_common = (*(pangulu_handle_t **)pangulu_handle)->block_common;
280 |     pangulu_block_smatrix *block_smatrix = (*(pangulu_handle_t **)pangulu_handle)->block_smatrix;
281 |     pangulu_common *common = (*(pangulu_handle_t **)pangulu_handle)->commmon;
282 |     pangulu_int32_t size, rank;
283 |     pangulu_cm_rank(&rank);
284 |     pangulu_cm_size(&size);
285 |     if (rank == 0)
286 |     {
287 |         if (gstrs_options == NULL)
288 |         {
289 |             printf(PANGULU_E_GSTRS_OPTION_IS_NULLPTR);
290 |             exit(1);
291 |         }
292 |     }
293 |     pangulu_int64_t vector_length = common->n;
294 |     pangulu_vector *x_vector = NULL;
295 |     pangulu_vector *b_vector = NULL;
296 |     pangulu_vector *answer_vector = NULL;
297 |     if (rank == 0)
298 |     {
299 |         x_vector = (pangulu_vector *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_vector));
300 |         b_vector = (pangulu_vector *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_vector));
301 |         answer_vector = (pangulu_vector *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_vector));
302 |         b_vector->row = common->n;
303 |         b_vector->value = rhs;
304 |         pangulu_init_pangulu_vector(x_vector, vector_length);
305 |         pangulu_init_pangulu_vector(answer_vector, vector_length);
306 |         pangulu_reorder_vector_b_tran(block_smatrix->row_perm, block_smatrix->metis_perm, block_smatrix->row_scale, b_vector, answer_vector);
307 |     }
308 |     pangulu_sptrsv_preprocessing(
309 |         block_common,
310 |         block_smatrix,
311 |         answer_vector);
312 |     struct timeval time_start;
313 |     double elapsed_time;
314 |     pangulu_cm_sync();
315 |     pangulu_time_start(&time_start);
316 |     pangulu_solve(block_common, block_smatrix, answer_vector);
317 |     pangulu_cm_sync();
318 |     elapsed_time = pangulu_time_stop(&time_start);
319 |     if (rank == 0)
320 |     {
321 |         printf(PANGULU_I_TIME_SPTRSV);
322 |         pangulu_reorder_vector_x_tran(block_smatrix, answer_vector, x_vector);
323 |         for (int i = 0; i < common->n; i++)
324 |         {
325 |             rhs[i] = x_vector->value[i];
326 |         }
327 |         pangulu_destroy_pangulu_vector(x_vector);
328 |         pangulu_destroy_pangulu_vector(answer_vector);
329 |         pangulu_free(__FILE__, __LINE__, b_vector);
330 |     }
331 | }
332 | 
333 | void pangulu_gssv(calculate_type *rhs, pangulu_gstrf_options *gstrf_options, pangulu_gstrs_options *gstrs_options, void **pangulu_handle)
334 | {
335 |     pangulu_gstrf(gstrf_options, pangulu_handle);
336 |     pangulu_gstrs(rhs, gstrs_options, pangulu_handle);
337 | }
338 | 
339 | void pangulu_finalize(void **pangulu_handle)
340 | {
341 |     pangulu_block_common *block_common = (*(pangulu_handle_t **)pangulu_handle)->block_common;
342 |     pangulu_block_smatrix *block_smatrix = (*(pangulu_handle_t **)pangulu_handle)->block_smatrix;
343 |     pangulu_common *common = (*(pangulu_handle_t **)pangulu_handle)->commmon;
344 | 
345 |     pangulu_destroy(block_common, block_smatrix);
346 | 
347 |     pangulu_free(__FILE__, __LINE__, block_common);
348 |     pangulu_free(__FILE__, __LINE__, block_smatrix);
349 |     pangulu_free(__FILE__, __LINE__, common);
350 |     pangulu_free(__FILE__, __LINE__, *(pangulu_handle_t **)pangulu_handle);
351 | }


--------------------------------------------------------------------------------
/src/pangulu_kernel_interface.c:
--------------------------------------------------------------------------------
  1 | #include "pangulu_common.h"
  2 | 
  3 | #ifdef PANGULU_PERF
  4 | void pangulu_getrf_flop(
  5 |     pangulu_inblock_idx nb,
  6 |     pangulu_storage_slot_t *opdst,
  7 |     int tid)
  8 | {
  9 |     pangulu_storage_slot_t *upper_diag;
 10 |     pangulu_storage_slot_t *lower_diag;
 11 |     if (opdst->is_upper)
 12 |     {
 13 |         upper_diag = opdst;
 14 |         lower_diag = opdst->related_block;
 15 |     }
 16 |     else
 17 |     {
 18 |         upper_diag = opdst->related_block;
 19 |         lower_diag = opdst;
 20 |     }
 21 |     for (pangulu_int32_t level = 0; level < nb; level++)
 22 |     {
 23 |         if (upper_diag->columnpointer[level] == upper_diag->columnpointer[level + 1])
 24 |         {
 25 |             continue;
 26 |         }
 27 | 
 28 |         global_stat.flop += (lower_diag->columnpointer[level + 1] - lower_diag->columnpointer[level]);
 29 | 
 30 |         for (pangulu_int32_t csc_idx = lower_diag->columnpointer[level]; csc_idx < lower_diag->columnpointer[level + 1]; csc_idx++)
 31 |         {
 32 |             pangulu_int32_t row = lower_diag->rowindex[csc_idx];
 33 |             pangulu_int32_t csr_idx_op2 = upper_diag->columnpointer[level];
 34 |             pangulu_int32_t csr_idx_op2_ub = upper_diag->columnpointer[level + 1];
 35 |             pangulu_int32_t csr_idx_opdst = upper_diag->columnpointer[row];
 36 |             pangulu_int32_t csr_idx_opdst_ub = upper_diag->columnpointer[row + 1];
 37 |             while (csr_idx_op2 < csr_idx_op2_ub && csr_idx_opdst < csr_idx_opdst_ub)
 38 |             {
 39 |                 if (upper_diag->rowindex[csr_idx_opdst] == upper_diag->rowindex[csr_idx_op2])
 40 |                 {
 41 |                     global_stat.flop += 2;
 42 |                     csr_idx_op2++;
 43 |                     csr_idx_opdst++;
 44 |                 }
 45 |                 while (csr_idx_op2 < csr_idx_op2_ub && upper_diag->rowindex[csr_idx_op2] < upper_diag->rowindex[csr_idx_opdst])
 46 |                 {
 47 |                     csr_idx_op2++;
 48 |                 }
 49 |                 while (csr_idx_opdst < csr_idx_opdst_ub && upper_diag->rowindex[csr_idx_opdst] < upper_diag->rowindex[csr_idx_op2])
 50 |                 {
 51 |                     csr_idx_opdst++;
 52 |                 }
 53 |             }
 54 |         }
 55 | 
 56 |         for (pangulu_int32_t csr_idx = upper_diag->columnpointer[level] + 1; csr_idx < upper_diag->columnpointer[level + 1]; csr_idx++)
 57 |         {
 58 |             pangulu_int32_t col = upper_diag->rowindex[csr_idx];
 59 |             pangulu_int32_t csc_idx_op1 = lower_diag->columnpointer[level];
 60 |             pangulu_int32_t csc_idx_op1_ub = lower_diag->columnpointer[level + 1];
 61 |             pangulu_int32_t csc_idx_opdst = lower_diag->columnpointer[col];
 62 |             pangulu_int32_t csc_idx_opdst_ub = lower_diag->columnpointer[col + 1];
 63 |             while (csc_idx_op1 < csc_idx_op1_ub && csc_idx_opdst < csc_idx_opdst_ub)
 64 |             {
 65 |                 if (lower_diag->rowindex[csc_idx_opdst] == lower_diag->rowindex[csc_idx_op1])
 66 |                 {
 67 |                     global_stat.flop += 2;
 68 |                     csc_idx_op1++;
 69 |                     csc_idx_opdst++;
 70 |                 }
 71 |                 while (csc_idx_op1 < csc_idx_op1_ub && lower_diag->rowindex[csc_idx_op1] < lower_diag->rowindex[csc_idx_opdst])
 72 |                 {
 73 |                     csc_idx_op1++;
 74 |                 }
 75 |                 while (csc_idx_opdst < csc_idx_opdst_ub && lower_diag->rowindex[csc_idx_opdst] < lower_diag->rowindex[csc_idx_op1])
 76 |                 {
 77 |                     csc_idx_opdst++;
 78 |                 }
 79 |             }
 80 |         }
 81 |     }
 82 | }
 83 | 
 84 | void pangulu_tstrf_flop(
 85 |     pangulu_inblock_idx nb,
 86 |     pangulu_storage_slot_t *opdst,
 87 |     pangulu_storage_slot_t *opdiag,
 88 |     int tid)
 89 | {
 90 |     if (opdiag->is_upper == 0)
 91 |     {
 92 |         opdiag = opdiag->related_block;
 93 |     }
 94 |     for (pangulu_int32_t rhs_id = 0; rhs_id < nb; rhs_id++)
 95 |     {
 96 |         for (pangulu_int32_t rhs_idx = opdst->rowpointer[rhs_id]; rhs_idx < opdst->rowpointer[rhs_id + 1]; rhs_idx++)
 97 |         {
 98 |             pangulu_int32_t mul_row = opdst->columnindex[rhs_idx];
 99 |             pangulu_int32_t lsum_idx = rhs_idx + 1;
100 |             pangulu_int32_t diag_idx = opdiag->columnpointer[mul_row];
101 |             global_stat.flop++;
102 |             while (lsum_idx < opdst->rowpointer[rhs_id + 1] && diag_idx < opdiag->columnpointer[mul_row + 1])
103 |             {
104 |                 if (opdiag->rowindex[diag_idx] == opdst->columnindex[lsum_idx])
105 |                 {
106 |                     global_stat.flop += 2;
107 |                     lsum_idx++;
108 |                     diag_idx++;
109 |                 }
110 |                 while (lsum_idx < opdst->rowpointer[rhs_id + 1] && opdst->columnindex[lsum_idx] < opdiag->rowindex[diag_idx])
111 |                 {
112 |                     lsum_idx++;
113 |                 }
114 |                 while (diag_idx < opdiag->columnpointer[mul_row + 1] && opdiag->rowindex[diag_idx] < opdst->columnindex[lsum_idx])
115 |                 {
116 |                     diag_idx++;
117 |                 }
118 |             }
119 |         }
120 |     }
121 | }
122 | 
123 | void pangulu_gessm_flop(
124 |     pangulu_inblock_idx nb,
125 |     pangulu_storage_slot_t *opdst,
126 |     pangulu_storage_slot_t *opdiag,
127 |     int tid)
128 | {
129 |     if (opdiag->is_upper == 1)
130 |     {
131 |         opdiag = opdiag->related_block;
132 |     }
133 |     for (pangulu_int32_t rhs_id = 0; rhs_id < nb; rhs_id++)
134 |     {
135 |         for (pangulu_int32_t rhs_idx = opdst->columnpointer[rhs_id]; rhs_idx < opdst->columnpointer[rhs_id + 1]; rhs_idx++)
136 |         {
137 |             pangulu_int32_t mul_row = opdst->rowindex[rhs_idx];
138 |             pangulu_int32_t lsum_idx = rhs_idx + 1;
139 |             pangulu_int32_t diag_idx = opdiag->columnpointer[mul_row];
140 |             while (lsum_idx < opdst->columnpointer[rhs_id + 1] && diag_idx < opdiag->columnpointer[mul_row + 1])
141 |             {
142 |                 if (opdiag->rowindex[diag_idx] == opdst->rowindex[lsum_idx])
143 |                 {
144 |                     global_stat.flop += 2;
145 |                     lsum_idx++;
146 |                     diag_idx++;
147 |                 }
148 |                 while (lsum_idx < opdst->columnpointer[rhs_id + 1] && opdst->rowindex[lsum_idx] < opdiag->rowindex[diag_idx])
149 |                 {
150 |                     lsum_idx++;
151 |                 }
152 |                 while (diag_idx < opdiag->columnpointer[mul_row + 1] && opdiag->rowindex[diag_idx] < opdst->rowindex[lsum_idx])
153 |                 {
154 |                     diag_idx++;
155 |                 }
156 |             }
157 |         }
158 |     }
159 | }
160 | 
161 | void pangulu_ssssm_flop(
162 |     pangulu_inblock_idx nb,
163 |     pangulu_storage_slot_t *opdst,
164 |     pangulu_storage_slot_t *op1,
165 |     pangulu_storage_slot_t *op2,
166 |     int tid)
167 | {
168 |     for (pangulu_int32_t col2 = 0; col2 < nb; col2++)
169 |     {
170 |         for (pangulu_int32_t idx2 = op2->columnpointer[col2]; idx2 < op2->columnpointer[col2 + 1]; idx2++)
171 |         {
172 |             pangulu_inblock_idx row2 = op2->rowindex[idx2];
173 |             global_stat.flop += 2 * (op1->columnpointer[row2 + 1] - op1->columnpointer[row2]);
174 |         }
175 |     }
176 | }
177 | 
178 | #endif
179 | 
180 | void pangulu_getrf_interface(
181 |     pangulu_inblock_idx nb,
182 |     pangulu_storage_slot_t *opdst,
183 |     int tid)
184 | {
185 | #ifdef PANGULU_PERF
186 |     global_stat.kernel_cnt++;
187 |     struct timeval start;
188 |     pangulu_time_start(&start);
189 | #endif
190 |     pangulu_platform_getrf(nb, opdst, tid, PANGULU_DEFAULT_PLATFORM);
191 |     pangulu_platform_synchronize(PANGULU_DEFAULT_PLATFORM);
192 | #ifdef PANGULU_PERF
193 |     global_stat.time_outer_kernel += pangulu_time_stop(&start);
194 |     pangulu_getrf_flop(nb, opdst, tid);
195 | #endif
196 | }
197 | 
198 | void pangulu_tstrf_interface(
199 |     pangulu_inblock_idx nb,
200 |     pangulu_storage_slot_t *opdst,
201 |     pangulu_storage_slot_t *opdiag,
202 |     int tid)
203 | {
204 | #ifdef PANGULU_PERF
205 |     global_stat.kernel_cnt++;
206 |     struct timeval start;
207 |     pangulu_time_start(&start);
208 | #endif
209 |     pangulu_platform_tstrf(nb, opdst, opdiag, tid, PANGULU_DEFAULT_PLATFORM);
210 |     pangulu_platform_synchronize(PANGULU_DEFAULT_PLATFORM);
211 | #ifdef PANGULU_PERF
212 |     global_stat.time_outer_kernel += pangulu_time_stop(&start);
213 |     pangulu_tstrf_flop(nb, opdst, opdiag, tid);
214 | #endif
215 | }
216 | 
217 | void pangulu_gessm_interface(
218 |     pangulu_inblock_idx nb,
219 |     pangulu_storage_slot_t *opdst,
220 |     pangulu_storage_slot_t *opdiag,
221 |     int tid)
222 | {
223 | #ifdef PANGULU_PERF
224 |     global_stat.kernel_cnt++;
225 |     struct timeval start;
226 |     pangulu_time_start(&start);
227 | #endif
228 |     pangulu_platform_gessm(nb, opdst, opdiag, tid, PANGULU_DEFAULT_PLATFORM);
229 |     pangulu_platform_synchronize(PANGULU_DEFAULT_PLATFORM);
230 | #ifdef PANGULU_PERF
231 |     global_stat.time_outer_kernel += pangulu_time_stop(&start);
232 |     pangulu_gessm_flop(nb, opdst, opdiag, tid);
233 | #endif
234 | }
235 | 
236 | void pangulu_ssssm_interface(
237 |     pangulu_inblock_idx nb,
238 |     pangulu_storage_slot_t *opdst,
239 |     pangulu_storage_slot_t *op1,
240 |     pangulu_storage_slot_t *op2,
241 |     int tid)
242 | {
243 | #ifdef PANGULU_PERF
244 |     global_stat.kernel_cnt++;
245 |     struct timeval start;
246 |     pangulu_time_start(&start);
247 | #endif
248 |     pangulu_platform_ssssm(nb, opdst, op1, op2, tid, PANGULU_DEFAULT_PLATFORM);
249 |     pangulu_platform_synchronize(PANGULU_DEFAULT_PLATFORM);
250 | #ifdef PANGULU_PERF
251 |     global_stat.time_outer_kernel += pangulu_time_stop(&start);
252 |     pangulu_ssssm_flop(nb, opdst, op1, op2, tid);
253 | #endif
254 | }
255 | 
256 | void pangulu_hybrid_batched_interface(
257 |     pangulu_inblock_idx nb,
258 |     pangulu_uint64_t ntask,
259 |     pangulu_task_t *tasks)
260 | {
261 | #ifdef PANGULU_PERF
262 |     global_stat.kernel_cnt++;
263 |     struct timeval start;
264 |     pangulu_time_start(&start);
265 | #endif
266 |     pangulu_platform_hybrid_batched(nb, ntask, tasks, PANGULU_DEFAULT_PLATFORM);
267 |     pangulu_platform_synchronize(PANGULU_DEFAULT_PLATFORM);
268 | #ifdef PANGULU_PERF
269 |     global_stat.time_outer_kernel += pangulu_time_stop(&start);
270 | 
271 |     for (pangulu_int64_t i = 0; i < ntask; i++)
272 |     {
273 |         switch (tasks[i].kernel_id)
274 |         {
275 |         case PANGULU_TASK_GETRF:
276 |             pangulu_getrf_flop(nb, tasks[i].opdst, 0);
277 |             break;
278 |         case PANGULU_TASK_TSTRF:
279 |             pangulu_tstrf_flop(nb, tasks[i].opdst, tasks[i].op1, 0);
280 |             break;
281 |         case PANGULU_TASK_GESSM:
282 |             pangulu_gessm_flop(nb, tasks[i].opdst, tasks[i].op1, 0);
283 |             break;
284 |         case PANGULU_TASK_SSSSM:
285 |             pangulu_ssssm_flop(nb, tasks[i].opdst, tasks[i].op1, tasks[i].op2, 0);
286 |             break;
287 |         }
288 |     }
289 | #endif
290 | }
291 | 
292 | void pangulu_ssssm_batched_interface(
293 |     pangulu_inblock_idx nb,
294 |     pangulu_uint64_t ntask,
295 |     pangulu_task_t *tasks)
296 | {
297 | #ifdef PANGULU_PERF
298 |     global_stat.kernel_cnt++;
299 |     struct timeval start;
300 |     pangulu_time_start(&start);
301 | #endif
302 |     pangulu_platform_hybrid_batched(nb, ntask, tasks, PANGULU_DEFAULT_PLATFORM);
303 |     pangulu_platform_synchronize(PANGULU_DEFAULT_PLATFORM);
304 | #ifdef PANGULU_PERF
305 |     global_stat.time_outer_kernel += pangulu_time_stop(&start);
306 | 
307 |     for (pangulu_int64_t i = 0; i < ntask; i++)
308 |     {
309 |         pangulu_ssssm_flop(nb, tasks[i].opdst, tasks[i].op1, tasks[i].op2, 0);
310 |     }
311 | #endif
312 | }
313 | 
314 | void pangulu_spmv_interface(
315 |     pangulu_inblock_idx nb,
316 |     pangulu_storage_slot_t *a,
317 |     calculate_type *x,
318 |     calculate_type *y)
319 | {
320 |     pangulu_platform_spmv(nb, a, x, y, PANGULU_DEFAULT_PLATFORM);
321 | }
322 | 
323 | void pangulu_vecadd_interface(
324 |     pangulu_int64_t length,
325 |     calculate_type *bval,
326 |     calculate_type *xval)
327 | {
328 |     pangulu_platform_vecadd(length, bval, xval, PANGULU_DEFAULT_PLATFORM);
329 | }
330 | 
331 | void pangulu_sptrsv_interface(
332 |     pangulu_inblock_idx nb,
333 |     pangulu_storage_slot_t *s,
334 |     calculate_type *xval,
335 |     pangulu_int64_t uplo)
336 | {
337 |     pangulu_platform_sptrsv(nb, s, xval, uplo, PANGULU_DEFAULT_PLATFORM);
338 | }
339 | 


--------------------------------------------------------------------------------
/src/pangulu_memory.c:
--------------------------------------------------------------------------------
 1 | #include "pangulu_common.h"
 2 | 
 3 | void *pangulu_malloc(const char* file, pangulu_int64_t line, pangulu_int64_t size)
 4 | {
 5 |     void *malloc_address = NULL;
 6 |     malloc_address = (void *)malloc(size);
 7 |     if (size != 0 && malloc_address == NULL)
 8 |     {
 9 |         printf(PANGULU_E_CPU_MEM);
10 |         exit(1);
11 |     }
12 |     return malloc_address;
13 | }
14 | 
15 | void *pangulu_realloc(const char* file, pangulu_int64_t line, void* oldptr, pangulu_int64_t size)
16 | {
17 |     void *realloc_address = NULL;
18 |     realloc_address = (void *)realloc(oldptr, size);
19 |     if (size != 0 && realloc_address == NULL)
20 |     {
21 |         printf(PANGULU_E_CPU_MEM);
22 |         exit(1);
23 |     }
24 |     return realloc_address;
25 | }
26 | 
27 | void pangulu_free(const char* file, pangulu_int64_t line, void* ptr){
28 |     if(ptr==NULL){
29 |         return;
30 |     }
31 |     free(ptr);
32 | }
33 | 
34 | void pangulu_origin_smatrix_add_csc(pangulu_origin_smatrix *a)
35 | {
36 |     pangulu_exblock_ptr nnzA = a->rowpointer[a->row];
37 |     pangulu_exblock_idx n = a->row;
38 |     char *now_malloc_space = (char *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_exblock_ptr) * (n + 1) + sizeof(pangulu_exblock_idx) * nnzA + sizeof(calculate_type) * nnzA);
39 | 
40 |     pangulu_exblock_ptr *rowpointer = (pangulu_exblock_ptr *)now_malloc_space;
41 |     pangulu_exblock_idx *columnindex = (pangulu_exblock_idx *)(now_malloc_space + sizeof(pangulu_exblock_ptr) * (n + 1));
42 |     calculate_type *value = (calculate_type *)(now_malloc_space + sizeof(pangulu_exblock_ptr) * (n + 1) + sizeof(pangulu_exblock_idx) * nnzA);
43 |     pangulu_exblock_ptr *csc_to_csr_index = (pangulu_exblock_ptr *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_exblock_ptr) * nnzA);
44 |     for (pangulu_exblock_ptr i = 0; i < nnzA; i++)
45 |     {
46 |         value[i] = 0.0;
47 |     }
48 | 
49 |     for (pangulu_exblock_idx i = 0; i < (n + 1); i++)
50 |     {
51 |         rowpointer[i] = 0;
52 |     }
53 |     for (pangulu_exblock_ptr i = 0; i < nnzA; i++)
54 |     {
55 |         rowpointer[a->columnindex[i] + 1]++;
56 |     }
57 |     for (pangulu_exblock_idx i = 0; i < n; i++)
58 |     {
59 |         rowpointer[i + 1] += rowpointer[i];
60 |     }
61 |     pangulu_exblock_ptr *index_rowpointer = (pangulu_exblock_ptr *)pangulu_malloc(__FILE__, __LINE__, sizeof(pangulu_exblock_ptr) * (n + 1));
62 |     for (pangulu_exblock_idx i = 0; i < n; i++)
63 |     {
64 |         index_rowpointer[i] = rowpointer[i];
65 |     }
66 |     for (pangulu_exblock_idx i = 0; i < n; i++)
67 |     {
68 |         for (pangulu_exblock_ptr j = a->rowpointer[i]; j < a->rowpointer[i + 1]; j++)
69 |         {
70 | 
71 |             pangulu_exblock_idx col = a->columnindex[j];
72 |             pangulu_exblock_ptr index = index_rowpointer[col];
73 |             columnindex[index] = i;
74 |             value[index] = a->value[j];
75 |             csc_to_csr_index[index] = j;
76 |             index_rowpointer[col]++;
77 |         }
78 |     }
79 |     a->columnpointer = rowpointer;
80 |     a->rowindex = columnindex;
81 |     a->value_csc = value;
82 |     a->csc_to_csr_index = csc_to_csr_index;
83 |     pangulu_free(__FILE__, __LINE__, index_rowpointer);
84 | }


--------------------------------------------------------------------------------
/src/pangulu_platform_helper.c:
--------------------------------------------------------------------------------
 1 | // Warning : Don't modify this file directly.
 2 | // This file is automatically generated by build_helper.py.
 3 | // This file will be regenerated after the next compilation.
 4 | // All changes will be lost.
 5 | #include "pangulu_common.h"
 6 | #include "./platforms/pangulu_platform_common.h"
 7 | void pangulu_platform_malloc(void** platform_address, size_t size, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_malloc(platform_address, size); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_malloc(platform_address, size); break;default: printf("No platform implementation for function pangulu_platform_0201000_malloc.\n"); break;}}
 8 | void pangulu_platform_malloc_pinned(void** platform_address, size_t size, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_malloc_pinned(platform_address, size); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_malloc_pinned(platform_address, size); break;default: printf("No platform implementation for function pangulu_platform_0201000_malloc_pinned.\n"); break;}}
 9 | void pangulu_platform_synchronize(pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_synchronize(); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_synchronize(); break;default: printf("No platform implementation for function pangulu_platform_0201000_synchronize.\n"); break;}}
10 | void pangulu_platform_memset(void* s, int c, size_t n, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_memset(s, c, n); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_memset(s, c, n); break;default: printf("No platform implementation for function pangulu_platform_0201000_memset.\n"); break;}}
11 | void pangulu_platform_create_stream(void** stream, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_create_stream(stream); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_create_stream(stream); break;default: printf("No platform implementation for function pangulu_platform_0201000_create_stream.\n"); break;}}
12 | void pangulu_platform_memcpy(void *dst, const void *src, size_t count, unsigned int kind, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_memcpy(dst, src, count, kind); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_memcpy(dst, src, count, kind); break;default: printf("No platform implementation for function pangulu_platform_0201000_memcpy.\n"); break;}}
13 | void pangulu_platform_memcpy_async(void *dst, const void *src, size_t count, unsigned int kind, void* stream, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_memcpy_async(dst, src, count, kind, stream); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_memcpy_async(dst, src, count, kind, stream); break;default: printf("No platform implementation for function pangulu_platform_0201000_memcpy_async.\n"); break;}}
14 | void pangulu_platform_free(void* devptr, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_free(devptr); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_free(devptr); break;default: printf("No platform implementation for function pangulu_platform_0201000_free.\n"); break;}}
15 | void pangulu_platform_get_device_num(int* device_num, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_get_device_num(device_num); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_get_device_num(device_num); break;default: printf("No platform implementation for function pangulu_platform_0201000_get_device_num.\n"); break;}}
16 | void pangulu_platform_set_default_device(int device_num, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_set_default_device(device_num); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_set_default_device(device_num); break;default: printf("No platform implementation for function pangulu_platform_0201000_set_default_device.\n"); break;}}
17 | void pangulu_platform_get_device_name(char* name, int device_num, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_get_device_name(name, device_num); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_get_device_name(name, device_num); break;default: printf("No platform implementation for function pangulu_platform_0201000_get_device_name.\n"); break;}}
18 | void pangulu_platform_get_device_memory_usage(size_t* used_byte, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_get_device_memory_usage(used_byte); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_get_device_memory_usage(used_byte); break;default: printf("No platform implementation for function pangulu_platform_0201000_get_device_memory_usage.\n"); break;}}
19 | void pangulu_platform_getrf(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, int tid, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_getrf(nb, opdst, tid); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_getrf(nb, opdst, tid); break;default: printf("No platform implementation for function pangulu_platform_0201000_getrf.\n"); break;}}
20 | void pangulu_platform_tstrf(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* opdiag, int tid, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_tstrf(nb, opdst, opdiag, tid); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_tstrf(nb, opdst, opdiag, tid); break;default: printf("No platform implementation for function pangulu_platform_0201000_tstrf.\n"); break;}}
21 | void pangulu_platform_gessm(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* opdiag, int tid, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_gessm(nb, opdst, opdiag, tid); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_gessm(nb, opdst, opdiag, tid); break;default: printf("No platform implementation for function pangulu_platform_0201000_gessm.\n"); break;}}
22 | void pangulu_platform_ssssm(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* op1, pangulu_storage_slot_t* op2, int tid, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_ssssm(nb, opdst, op1, op2, tid); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_ssssm(nb, opdst, op1, op2, tid); break;default: printf("No platform implementation for function pangulu_platform_0201000_ssssm.\n"); break;}}
23 | void pangulu_platform_ssssm_batched(pangulu_inblock_idx nb, pangulu_uint64_t ntask, pangulu_task_t* tasks, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_ssssm_batched(nb, ntask, tasks); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_ssssm_batched(nb, ntask, tasks); break;default: printf("No platform implementation for function pangulu_platform_0201000_ssssm_batched.\n"); break;}}
24 | void pangulu_platform_hybrid_batched(pangulu_inblock_idx nb, pangulu_uint64_t ntask, pangulu_task_t* tasks, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_hybrid_batched(nb, ntask, tasks); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_hybrid_batched(nb, ntask, tasks); break;default: printf("No platform implementation for function pangulu_platform_0201000_hybrid_batched.\n"); break;}}
25 | void pangulu_platform_spmv(pangulu_inblock_idx nb, pangulu_storage_slot_t* a, calculate_type* x, calculate_type* y, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_spmv(nb, a, x, y); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_spmv(nb, a, x, y); break;default: printf("No platform implementation for function pangulu_platform_0201000_spmv.\n"); break;}}
26 | void pangulu_platform_vecadd(pangulu_int64_t length, calculate_type *bval, calculate_type *xval, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_vecadd(length, bval, xval); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_vecadd(length, bval, xval); break;default: printf("No platform implementation for function pangulu_platform_0201000_vecadd.\n"); break;}}
27 | void pangulu_platform_sptrsv(pangulu_inblock_idx nb, pangulu_storage_slot_t *s, calculate_type* xval, pangulu_int64_t uplo, pangulu_platform_t platform){switch(platform){case PANGULU_PLATFORM_CPU_NAIVE : pangulu_platform_0100000_sptrsv(nb, s, xval, uplo); break;case PANGULU_PLATFORM_GPU_CUDA : pangulu_platform_0201000_sptrsv(nb, s, xval, uplo); break;default: printf("No platform implementation for function pangulu_platform_0201000_sptrsv.\n"); break;}}
28 | 


--------------------------------------------------------------------------------
/src/pangulu_sptrsv.c:
--------------------------------------------------------------------------------
  1 | #include "pangulu_common.h"
  2 | 
  3 | void pangulu_sptrsv_preprocessing(
  4 |     pangulu_block_common *bcommon,
  5 |     pangulu_block_smatrix *bsmatrix,
  6 |     pangulu_vector *reordered_rhs)
  7 | {
  8 |     pangulu_exblock_idx block_length = bcommon->block_length;
  9 |     pangulu_inblock_idx nb = bcommon->nb;
 10 |     pangulu_int32_t rank = bcommon->rank;
 11 |     pangulu_int32_t nproc = bcommon->sum_rank_size;
 12 |     pangulu_int16_t p = bcommon->p;
 13 |     pangulu_int16_t q = bcommon->q;
 14 |     bsmatrix->rhs = pangulu_malloc(__FILE__, __LINE__, sizeof(calculate_type) * block_length * nb);
 15 |     if (rank == 0)
 16 |     {
 17 |         memcpy(bsmatrix->rhs, reordered_rhs->value, sizeof(calculate_type) * reordered_rhs->row);
 18 |     }
 19 |     pangulu_cm_bcast(bsmatrix->rhs, block_length * nb, MPI_VAL_TYPE, 0);
 20 |     bsmatrix->spmv_acc = pangulu_malloc(__FILE__, __LINE__, sizeof(calculate_type) * nb);
 21 |     bsmatrix->recv_buf = pangulu_malloc(__FILE__, __LINE__, sizeof(calculate_type) * nb);
 22 | }
 23 | 
 24 | void pangulu_sptrsv_uplo(
 25 |     pangulu_block_common *bcommon,
 26 |     pangulu_block_smatrix *bsmatrix,
 27 |     pangulu_int32_t uplo)
 28 | {
 29 |     pangulu_task_queue_t *heap = bsmatrix->heap;
 30 |     pangulu_int32_t rank = bcommon->rank;
 31 |     pangulu_exblock_idx block_length = bcommon->block_length;
 32 |     pangulu_int32_t p = bcommon->p;
 33 |     pangulu_int32_t q = bcommon->q;
 34 |     pangulu_storage_t *storage = bsmatrix->storage;
 35 | 
 36 |     pangulu_uint64_t *diag_uniaddr = bsmatrix->diag_uniaddr;
 37 |     pangulu_exblock_ptr *nondiag_block_rowptr = bsmatrix->nondiag_block_rowptr;
 38 |     pangulu_exblock_idx *nondiag_block_colidx = bsmatrix->nondiag_block_colidx;
 39 |     pangulu_exblock_ptr *nondiag_block_csr_to_csc = bsmatrix->nondiag_block_csr_to_csc;
 40 |     pangulu_uint64_t *related_nondiag_uniaddr = bsmatrix->related_nondiag_uniaddr;
 41 |     pangulu_inblock_idx nb = bcommon->nb;
 42 |     calculate_type *rhs = bsmatrix->rhs;
 43 |     calculate_type *spmv_acc = bsmatrix->spmv_acc;
 44 |     calculate_type *recv_buf = bsmatrix->recv_buf;
 45 | 
 46 |     if (uplo == PANGULU_LOWER)
 47 |     {
 48 |         for (pangulu_exblock_idx brow = 0; brow < block_length; brow++)
 49 |         {
 50 |             calculate_type *target_rhs = rhs + brow * nb;
 51 |             memset(spmv_acc, 0, sizeof(calculate_type) * nb);
 52 |             for (pangulu_int64_t bidx_csr = nondiag_block_rowptr[brow]; bidx_csr < nondiag_block_rowptr[brow + 1]; bidx_csr++)
 53 |             {
 54 |                 pangulu_exblock_idx bcol = nondiag_block_colidx[bidx_csr];
 55 |                 if (((brow % p) != (rank / q)) || ((bcol % q) != (rank % q)))
 56 |                 {
 57 |                     continue;
 58 |                 }
 59 |                 if (bcol < brow)
 60 |                 {
 61 |                     calculate_type *local_rhs = rhs + bcol * nb;
 62 |                     pangulu_platform_spmv(nb, &storage->bins[0].slots[nondiag_block_csr_to_csc[bidx_csr]], local_rhs, spmv_acc, PANGULU_PLATFORM_CPU_NAIVE);
 63 |                 }
 64 |                 else if (bcol > brow)
 65 |                 {
 66 |                     break;
 67 |                 }
 68 |             }
 69 | 
 70 |             if (((brow % p) == (rank / q)) && ((brow % q) == (rank % q)))
 71 |             {
 72 |                 for (int fetch_rank = (rank / q) * q; fetch_rank < ((rank / q) + 1) * q; fetch_rank++)
 73 |                 {
 74 |                     if (fetch_rank == rank)
 75 |                     {
 76 |                         continue;
 77 |                     }
 78 |                     pangulu_cm_recv(recv_buf, sizeof(calculate_type) * nb, fetch_rank, brow, 10);
 79 | #pragma omp simd
 80 |                     for (pangulu_inblock_idx i = 0; i < nb; i++)
 81 |                     {
 82 |                         spmv_acc[i] += recv_buf[i];
 83 |                     }
 84 |                 }
 85 |                 for (pangulu_inblock_idx i = 0; i < nb; i++)
 86 |                 {
 87 |                     target_rhs[i] += spmv_acc[i];
 88 |                 }
 89 |                 pangulu_storage_slot_t *lower_diag = pangulu_storage_get_diag(block_length, storage, diag_uniaddr[brow]);
 90 |                 if (lower_diag->is_upper)
 91 |                 {
 92 |                     lower_diag = lower_diag->related_block;
 93 |                 }
 94 |                 pangulu_platform_sptrsv(nb, lower_diag, target_rhs, PANGULU_LOWER, PANGULU_PLATFORM_CPU_NAIVE);
 95 |                 pangulu_cm_bcast(target_rhs, nb, MPI_VAL_TYPE, rank);
 96 |             }
 97 | 
 98 |             int diagonal_rank = (brow % p) * q + (brow % q);
 99 |             if (diagonal_rank != rank)
100 |             {
101 |                 if ((diagonal_rank / q) == (rank / q))
102 |                 {
103 |                     pangulu_cm_isend(spmv_acc, sizeof(calculate_type) * nb, diagonal_rank, brow, 10);
104 |                 }
105 |                 pangulu_cm_bcast(target_rhs, nb, MPI_VAL_TYPE, diagonal_rank);
106 |             }
107 |         }
108 |     }
109 |     else if (uplo == PANGULU_UPPER)
110 |     {
111 |         for (pangulu_exblock_idx brow = block_length - 1; brow < block_length; brow--)
112 |         {
113 |             calculate_type *target_rhs = rhs + brow * nb;
114 |             memset(spmv_acc, 0, sizeof(calculate_type) * nb);
115 | 
116 |             for (pangulu_int64_t bidx_csr = nondiag_block_rowptr[brow + 1] - 1; bidx_csr >= (pangulu_int64_t)nondiag_block_rowptr[brow]; bidx_csr--)
117 |             {
118 |                 pangulu_exblock_idx bcol = nondiag_block_colidx[bidx_csr];
119 |                 if (((brow % p) != (rank / q)) || ((bcol % q) != (rank % q)))
120 |                 {
121 |                     continue;
122 |                 }
123 |                 if (bcol > brow)
124 |                 {
125 |                     calculate_type *local_rhs = rhs + bcol * nb;
126 |                     pangulu_platform_spmv(nb, &storage->bins[0].slots[nondiag_block_csr_to_csc[bidx_csr]], local_rhs, spmv_acc, PANGULU_PLATFORM_CPU_NAIVE);
127 |                 }
128 |                 else if (bcol < brow)
129 |                 {
130 |                     break;
131 |                 }
132 |             }
133 | 
134 |             if (((brow % p) == (rank / q)) && ((brow % q) == (rank % q)))
135 |             {
136 |                 for (int fetch_rank = (rank / q) * q; fetch_rank < ((rank / q) + 1) * q; fetch_rank++)
137 |                 {
138 |                     if (fetch_rank == rank)
139 |                     {
140 |                         continue;
141 |                     }
142 |                     pangulu_cm_recv(recv_buf, sizeof(calculate_type) * nb, fetch_rank, 0, 10);
143 | #pragma omp simd
144 |                     for (pangulu_inblock_idx i = 0; i < nb; i++)
145 |                     {
146 |                         spmv_acc[i] += recv_buf[i];
147 |                     }
148 |                 }
149 | 
150 |                 for (pangulu_inblock_idx i = 0; i < nb; i++)
151 |                 {
152 |                     target_rhs[i] += spmv_acc[i];
153 |                 }
154 |                 pangulu_storage_slot_t *upper_diag = pangulu_storage_get_diag(block_length, storage, diag_uniaddr[brow]);
155 |                 if (upper_diag->is_upper == 0)
156 |                 {
157 |                     upper_diag = upper_diag->related_block;
158 |                 }
159 |                 pangulu_platform_sptrsv(nb, upper_diag, target_rhs, PANGULU_UPPER, PANGULU_PLATFORM_CPU_NAIVE);
160 |                 pangulu_cm_bcast(target_rhs, nb, MPI_VAL_TYPE, rank);
161 |             }
162 | 
163 |             int diagonal_rank = (brow % p) * q + (brow % q);
164 |             if (diagonal_rank != rank)
165 |             {
166 |                 if (diagonal_rank / q == rank / q)
167 |                 {
168 |                     pangulu_cm_isend(spmv_acc, sizeof(calculate_type) * nb, diagonal_rank, 0, 10);
169 |                 }
170 |                 pangulu_cm_bcast(target_rhs, nb, MPI_VAL_TYPE, diagonal_rank);
171 |             }
172 |         }
173 |     }
174 | }
175 | 
176 | void pangulu_solve(
177 |     pangulu_block_common *block_common,
178 |     pangulu_block_smatrix *block_smatrix,
179 |     pangulu_vector *result)
180 | {
181 |     int rank = block_common->rank;
182 |     pangulu_cm_sync();
183 |     pangulu_sptrsv_uplo(block_common, block_smatrix, PANGULU_LOWER);
184 |     pangulu_cm_sync();
185 |     pangulu_sptrsv_uplo(block_common, block_smatrix, PANGULU_UPPER);
186 |     pangulu_cm_sync();
187 |     if (block_common->rank == 0)
188 |     {
189 |         memcpy(result->value, block_smatrix->rhs, sizeof(calculate_type) * result->row);
190 |     }
191 | }


--------------------------------------------------------------------------------
/src/pangulu_strings.h:
--------------------------------------------------------------------------------
  1 | #ifdef PANGULU_LOG_INFO
  2 | #ifndef PANGULU_LOG_WARNING
  3 | #define PANGULU_LOG_WARNING
  4 | #endif
  5 | #endif
  6 | 
  7 | #ifdef PANGULU_LOG_WARNING
  8 | #ifndef PANGULU_LOG_ERROR
  9 | #define PANGULU_LOG_ERROR
 10 | #endif
 11 | #endif
 12 | 
 13 | #ifdef PANGULU_LOG_ERROR
 14 | #define PANGULU_E_NB_IS_ZERO "[PanguLU Error] Block size 'nb' is zero.\n"
 15 | #define PANGULU_E_INVALID_HEAP_SELECT "[PanguLU Error] Invalid heap comparison strategy selected.\n"
 16 | #define PANGULU_E_HEAP_FULL "[PanguLU Error] Task heap is full. Cannot insert new task.\n"
 17 | #define PANGULU_E_HEAP_EMPTY "[PanguLU Error] Task heap is empty on this MPI rank.\n"
 18 | #define PANGULU_E_CPU_MEM "[PanguLU Error] Failed to allocate " FMT_PANGULU_INT64_T " bytes. CPU memory exhausted. (%s:%lld)\n", size, file, line
 19 | #define PANGULU_E_ROW_IS_NULL "[PanguLU Error] Column %d (0-based) contains no non-zero elements. Exiting.\n", i
 20 | #define PANGULU_E_K_ID "[PanguLU Error] Invalid kernel ID " FMT_PANGULU_INT64_T " in numeric factorisation.\n", kernel_id
 21 | #define PANGULU_E_ASYM "[PanguLU Error] MPI_Barrier_asym failed.\n"
 22 | #define PANGULU_E_ADD_DIA "[PanguLU Error] Failed to add diagonal element in pangulu_add_diagonal_element.\n"
 23 | #define PANGULU_E_ROW_IS_ZERO "[PanguLU Error] Input matrix is invalid: order is zero.\n"
 24 | #define PANGULU_E_MAX_NULL "[PanguLU Error] All elements in column %d (0-based) are zero. Exiting.\n", i
 25 | #define PANGULU_E_OPTION_IS_NULLPTR "[PanguLU Error] initialisation options pointer is NULL. (pangulu_init)\n"
 26 | #define PANGULU_E_GSTRF_OPTION_IS_NULLPTR "[PanguLU Error] Options pointer is NULL. (pangulu_gstrf)\n"
 27 | #define PANGULU_E_GSTRS_OPTION_IS_NULLPTR "[PanguLU Error] Options pointer is NULL. (pangulu_gstrs)\n"
 28 | #define PANGULU_E_BIN_FULL "[PanguLU Error] No available slot in bin (slot_capacity = %d). Allocation failed.\n", bin->slot_capacity
 29 | #define PANGULU_E_RECYCLE_QUEUE_FULL "[PanguLU Error] Recycle queue is full. Cannot recycle slot.\n"
 30 | #define PANGULU_E_TASK_QUEUE_FULL "[PanguLU Error] No available task slot. Allocation failed.\n"
 31 | #define PANGULU_E_COMPLEX_MISMATCH "[PanguLU Error] Input matrix type (complex/real) does not match PanguLU configuration. Exiting.\n"
 32 | #define PANGULU_E_ELEM_SIZE_MISMATCH "[PanguLU Error] Input element size is %lld B, but expected %lld B in PanguLU. Exiting.\n", sizeof(sparse_value_t), sizeof(calculate_type)
 33 | #define PANGULU_E_MPI_BUF_WAIT_EXCEED "[PanguLU Error] MPI receive buffer for other ranks is too small, PanguLU can not continue.\n[PanguLU Error] Please increase 'init_options.mpi_recv_buffer_level' if memory allows.\n"
 34 | #else 
 35 | #define PANGULU_E_NB_IS_ZERO ""
 36 | #define PANGULU_E_INVALID_HEAP_SELECT ""
 37 | #define PANGULU_E_HEAP_FULL ""
 38 | #define PANGULU_E_HEAP_EMPTY ""
 39 | #define PANGULU_E_CPU_MEM ""
 40 | #define PANGULU_E_ROW_IS_NULL ""
 41 | #define PANGULU_E_K_ID ""
 42 | #define PANGULU_E_ASYM ""
 43 | #define PANGULU_E_ADD_DIA ""
 44 | #define PANGULU_E_ROW_IS_ZERO ""
 45 | #define PANGULU_E_MAX_NULL ""
 46 | #define PANGULU_E_OPTION_IS_NULLPTR ""
 47 | #define PANGULU_E_GSTRF_OPTION_IS_NULLPTR ""
 48 | #define PANGULU_E_GSTRS_OPTION_IS_NULLPTR ""
 49 | #define PANGULU_E_BIN_FULL ""
 50 | #define PANGULU_E_RECYCLE_QUEUE_FULL ""
 51 | #define PANGULU_E_TASK_QUEUE_FULL ""
 52 | #define PANGULU_E_COMPLEX_MISMATCH ""
 53 | #define PANGULU_E_ELEM_SIZE_MISMATCH ""
 54 | #define PANGULU_E_MPI_BUF_WAIT_EXCEED ""
 55 | #endif
 56 | 
 57 | #ifdef PANGULU_LOG_WARNING
 58 | #define PANGULU_W_INSUFFICIENT_MPI_BUF "[PanguLU Warning] MPI receive buffer is too small. Consider increasing 'init_options.mpi_recv_buffer_level'.\n"
 59 | #define PANGULU_W_MPI_BUF_FULL "[PanguLU Warning] MPI receive buffer for other ranks is full. PanguLU may run slowly.\n[PanguLU Warning] Consider increasing 'init_options.mpi_recv_buffer_level' if memory allows.\n"
 60 | #define PANGULU_W_MC64_FAIL "[PanguLU Warning] MC64 reordering failed. Proceeding with original matrix ordering.\n"
 61 | #define PANGULU_W_BIND_CORE_FAIL "[PanguLU Warning] Failed to bind thread to core %d: %s\n", core, strerror(errno)
 62 | #define PANGULU_W_PERF_MODE_ON "[PanguLU Warning] Macro PANGULU_PERF is defined. Numeric factorisation may run in performance mode and be slower.\n"
 63 | #else
 64 | #define PANGULU_W_INSUFFICIENT_MPI_BUF ""
 65 | #define PANGULU_W_MPI_BUF_FULL ""
 66 | #define PANGULU_W_MC64_FAIL ""
 67 | #define PANGULU_W_BIND_CORE_FAIL ""
 68 | #define PANGULU_W_PERF_MODE_ON ""
 69 | #endif
 70 | 
 71 | #ifdef PANGULU_LOG_INFO
 72 | #define PANGULU_I_NUMERIC_CHECK "[PanguLU Info] Numerical check: || LUx - Ax || / || Ax || = %le\n", residual_norm2 / rhs_norm2
 73 | #define PANGULU_I_TIME_REORDER "[PanguLU Info] Reordering time: %lf s\n", elapsed_time
 74 | #define PANGULU_I_TIME_SYMBOLIC "[PanguLU Info] Symbolic factorisation time: %lf s\n", elapsed_time
 75 | #define PANGULU_I_TIME_PRE "[PanguLU Info] Preprocessing time: %lf s\n", elapsed_time
 76 | #define PANGULU_I_TIME_SPTRSV "[PanguLU Info] Solving time (SpTRSV): %lf s\n", elapsed_time
 77 | #define PANGULU_I_SYMBOLIC_NONZERO "[PanguLU Info] Symbolic nonzero count: " FMT_PANGULU_EXBLOCK_PTR "\n", *symbolic_nnz
 78 | #define PANGULU_I_MEMUSAGE_HOST "[PanguLU Info] Host memory usage: %.0f MiB (bytes = %llu)\n", (double)total / 1024.0, total * 1024
 79 | #define PANGULU_I_MEMUSAGE_DEVICE "[PanguLU Info] GPU memory usage: %.0f MiB (bytes = %llu)\n", (double)total_gpu_mem_byte / (1024.0 * 1024.0), total_gpu_mem_byte
 80 | #define PANGULU_I_PERF_TABLE_HEAD "[PanguLU Info] rank\tkernel_time\ttime_recv\trecv_count\n"
 81 | #define PANGULU_I_PERF_TABLE_ROW "[PanguLU Info] #%d\t%lf\t%lf\t%lld\t\t\n", rank, global_stat.time_outer_kernel, global_stat.time_recv, global_stat.recv_cnt
 82 | 
 83 | #ifdef PANGULU_PERF
 84 | #define PANGULU_I_TIME_NUMERICAL "[PanguLU Info] Numeric factorisation time: %lf s. %lld flop, %lf GFlop/s\n", elapsed_time, global_stat.flop, ((double)global_stat.flop) / elapsed_time / 1e9
 85 | #else
 86 | #define PANGULU_I_TIME_NUMERICAL "[PanguLU Info] Numeric factorisation time: %lf s\n", elapsed_time
 87 | #endif
 88 | 
 89 | #ifdef GPU_OPEN
 90 | #ifdef METIS
 91 | #define PANGULU_I_BASIC_INFO "[PanguLU Info]\n\
 92 | [PanguLU Info] --- PanguLU Configuration & Matrix Info ---\n\
 93 | [PanguLU Info]       Matrix Order:             " FMT_PANGULU_INT64_T "\n\
 94 | [PanguLU Info]       #NNZ:                     " FMT_PANGULU_EXBLOCK_PTR "\n\
 95 | [PanguLU Info]       Matrix Block Order (nb):  " FMT_PANGULU_INT32_T "\n\
 96 | [PanguLU Info]       MPI Processes Count:      " FMT_PANGULU_INT32_T "\n\
 97 | [PanguLU Info]       MPI Recv Buffer Level:    " "%.2f" "\n\
 98 | [PanguLU Info]       METIS Index Type:         %s\n\
 99 | [PanguLU Info]       GPU:                      Enabled\n\
100 | [PanguLU Info]       GPU Kernel Warp/Block:    %d\n\
101 | [PanguLU Info]       GPU Data Move Warp/Block: %d\n\
102 | [PanguLU Info] -------------------------------------------\n[PanguLU Info]\n\
103 | ", n, origin_smatrix->columnpointer[n], nb, size, common->basic_param, (sizeof(reordering_int_t) == 4) ? ("i32") : ((sizeof(reordering_int_t) == 8) ? ("i64") : ("unknown")), \
104 | init_options->gpu_kernel_warp_per_block, init_options->gpu_data_move_warp_per_block
105 | #else
106 | #define PANGULU_I_BASIC_INFO "[PanguLU Info]\n\
107 | [PanguLU Info] --- PanguLU Configuration & Matrix Info ---\n\
108 | [PanguLU Info]       Matrix Order:             " FMT_PANGULU_INT64_T "\n\
109 | [PanguLU Info]       #NNZ:                     " FMT_PANGULU_EXBLOCK_PTR "\n\
110 | [PanguLU Info]       Matrix Block Order (nb):  " FMT_PANGULU_INT32_T "\n\
111 | [PanguLU Info]       MPI Processes Count:      " FMT_PANGULU_INT32_T "\n\
112 | [PanguLU Info]       MPI Recv Buffer Level:    " "%.2f" "\n\
113 | [PanguLU Info]       Reordering Index Type:    %s\n\
114 | [PanguLU Info]       Reordering Thread Count:  %d\n\
115 | [PanguLU Info]       GPU:                      Enabled\n\
116 | [PanguLU Info]       GPU Kernel Warp/Block:    %d\n\
117 | [PanguLU Info]       GPU Data Move Warp/Block: %d\n\
118 | [PanguLU Info] -------------------------------------------\n[PanguLU Info]\n\
119 | ", n, origin_smatrix->columnpointer[n], nb, size, common->basic_param, (sizeof(reordering_int_t) == 4) ? ("i32") : ((sizeof(reordering_int_t) == 8) ? ("i64") : ("unknown")), init_options->reordering_nthread, \
120 | init_options->gpu_kernel_warp_per_block, init_options->gpu_data_move_warp_per_block
121 | #endif
122 | #else
123 | #ifdef METIS
124 | #define PANGULU_I_BASIC_INFO "[PanguLU Info]\n\
125 | [PanguLU Info] --- PanguLU Configuration & Matrix Info ---\n\
126 | [PanguLU Info]       Matrix Order:             " FMT_PANGULU_INT64_T "\n\
127 | [PanguLU Info]       #NNZ:                     " FMT_PANGULU_EXBLOCK_PTR "\n\
128 | [PanguLU Info]       Matrix Block Order (nb):  " FMT_PANGULU_INT32_T "\n\
129 | [PanguLU Info]       MPI Processes Count:      " FMT_PANGULU_INT32_T "\n\
130 | [PanguLU Info]       MPI Recv Buffer Level:    " "%.2f" "\n\
131 | [PanguLU Info]       METIS Index Type:         %s\n\
132 | [PanguLU Info]       GPU:                      Disabled\n\
133 | [PanguLU Info] -------------------------------------------\n[PanguLU Info]\n\
134 | ", n, origin_smatrix->columnpointer[n], nb, size, common->basic_param, (sizeof(reordering_int_t) == 4) ? ("i32") : ((sizeof(reordering_int_t) == 8) ? ("i64") : ("unknown"))
135 | #else
136 | #define PANGULU_I_BASIC_INFO "[PanguLU Info]\n\
137 | [PanguLU Info] --- PanguLU Configuration & Matrix Info ---\n\
138 | [PanguLU Info]       Matrix Order:             " FMT_PANGULU_INT64_T "\n\
139 | [PanguLU Info]       #NNZ:                     " FMT_PANGULU_EXBLOCK_PTR "\n\
140 | [PanguLU Info]       Matrix Block Order (nb):  " FMT_PANGULU_INT32_T "\n\
141 | [PanguLU Info]       MPI Processes Count:      " FMT_PANGULU_INT32_T "\n\
142 | [PanguLU Info]       MPI Recv Buffer Level:    " "%.2f" "\n\
143 | [PanguLU Info]       Reordering Index Type:    %s\n\
144 | [PanguLU Info]       Reordering Thread Count:  %d\n\
145 | [PanguLU Info]       GPU:                      Disabled\n\
146 | [PanguLU Info] -------------------------------------------\n[PanguLU Info]\n\
147 | ", n, origin_smatrix->columnpointer[n], nb, size, common->basic_param, (sizeof(reordering_int_t) == 4) ? ("i32") : ((sizeof(reordering_int_t) == 8) ? ("i64") : ("unknown")), init_options->reordering_nthread
148 | #endif
149 | #endif
150 | 
151 | #else
152 | #define PANGULU_I_NUMERIC_CHECK ""
153 | #define PANGULU_I_TIME_REORDER ""
154 | #define PANGULU_I_TIME_SYMBOLIC ""
155 | #define PANGULU_I_TIME_PRE ""
156 | #define PANGULU_I_TIME_SPTRSV ""
157 | #define PANGULU_I_SYMBOLIC_NONZERO ""
158 | #define PANGULU_I_MEMUSAGE_HOST ""
159 | #define PANGULU_I_MEMUSAGE_DEVICE ""
160 | #define PANGULU_I_PERF_TABLE_HEAD ""
161 | #define PANGULU_I_PERF_TABLE_ROW ""
162 | #define PANGULU_I_TIME_NUMERICAL ""
163 | #define PANGULU_I_BASIC_INFO ""
164 | #endif


--------------------------------------------------------------------------------
/src/pangulu_symbolic.c:
--------------------------------------------------------------------------------
  1 | #include "pangulu_common.h"
  2 | 
  3 | void pangulu_a_plus_at(
  4 |     const pangulu_exblock_idx n,
  5 |     const pangulu_exblock_ptr nz,
  6 |     pangulu_exblock_ptr *colptr,
  7 |     pangulu_exblock_idx *rowind,
  8 |     pangulu_exblock_ptr *bnz,
  9 |     pangulu_exblock_ptr **b_colptr,
 10 |     pangulu_exblock_idx **b_rowind)
 11 | {
 12 |     register pangulu_exblock_idx i, j, k, col;
 13 |     register pangulu_exblock_ptr num_nz;
 14 |     pangulu_exblock_ptr *t_colptr;
 15 |     pangulu_exblock_idx *t_rowind;
 16 |     pangulu_int32_t *marker;
 17 | 
 18 |     marker = (pangulu_int32_t *)pangulu_malloc(__FILE__, __LINE__, n * sizeof(pangulu_int32_t));
 19 |     t_colptr = (pangulu_exblock_ptr *)pangulu_malloc(__FILE__, __LINE__, (n + 1) * sizeof(pangulu_exblock_ptr));
 20 |     t_rowind = (pangulu_exblock_idx *)pangulu_malloc(__FILE__, __LINE__, nz * sizeof(pangulu_exblock_idx));
 21 | 
 22 |     for (i = 0; i < n; ++i)
 23 |         marker[i] = 0;
 24 |     for (j = 0; j < n; ++j)
 25 |     {
 26 |         for (i = colptr[j]; i < colptr[j + 1]; ++i)
 27 |             ++marker[rowind[i]];
 28 |     }
 29 | 
 30 |     t_colptr[0] = 0;
 31 |     for (i = 0; i < n; ++i)
 32 |     {
 33 |         t_colptr[i + 1] = t_colptr[i] + marker[i];
 34 |         marker[i] = t_colptr[i];
 35 |     }
 36 | 
 37 |     for (j = 0; j < n; ++j)
 38 |     {
 39 |         for (i = colptr[j]; i < colptr[j + 1]; ++i)
 40 |         {
 41 |             col = rowind[i];
 42 |             t_rowind[marker[col]] = j;
 43 |             ++marker[col];
 44 |         }
 45 |     }
 46 |     for (i = 0; i < n; ++i)
 47 |         marker[i] = -1;
 48 | 
 49 |     num_nz = 0;
 50 |     for (j = 0; j < n; ++j)
 51 |     {
 52 |         for (i = colptr[j]; i < colptr[j + 1]; ++i)
 53 |         {
 54 |             k = rowind[i];
 55 |             if (marker[k] != j)
 56 |             {
 57 |                 marker[k] = j;
 58 |                 ++num_nz;
 59 |             }
 60 |         }
 61 | 
 62 |         for (i = t_colptr[j]; i < t_colptr[j + 1]; ++i)
 63 |         {
 64 |             k = t_rowind[i];
 65 |             if (marker[k] != j)
 66 |             {
 67 |                 marker[k] = j;
 68 |                 ++num_nz;
 69 |             }
 70 |         }
 71 |     }
 72 |     *bnz = num_nz;
 73 | 
 74 |     *b_colptr = (pangulu_exblock_ptr *)pangulu_malloc(__FILE__, __LINE__, (n + 1) * sizeof(pangulu_exblock_ptr));
 75 |     *b_rowind = (pangulu_exblock_idx *)pangulu_malloc(__FILE__, __LINE__, *bnz * sizeof(pangulu_exblock_idx));
 76 | 
 77 |     for (i = 0; i < n; ++i)
 78 |         marker[i] = -1;
 79 | 
 80 |     num_nz = 0;
 81 |     for (j = 0; j < n; ++j)
 82 |     {
 83 |         (*b_colptr)[j] = num_nz;
 84 |         for (i = colptr[j]; i < colptr[j + 1]; ++i)
 85 |         {
 86 |             k = rowind[i];
 87 |             if (marker[k] != j)
 88 |             {
 89 |                 marker[k] = j;
 90 |                 (*b_rowind)[num_nz++] = k;
 91 |             }
 92 |         }
 93 |         for (i = t_colptr[j]; i < t_colptr[j + 1]; ++i)
 94 |         {
 95 |             k = t_rowind[i];
 96 |             if (marker[k] != j)
 97 |             {
 98 |                 marker[k] = j;
 99 |                 (*b_rowind)[num_nz++] = k;
100 |             }
101 |         }
102 |     }
103 |     (*b_colptr)[n] = num_nz;
104 | 
105 |     pangulu_free(__FILE__, __LINE__, marker);
106 |     pangulu_free(__FILE__, __LINE__, t_colptr);
107 |     pangulu_free(__FILE__, __LINE__, t_rowind);
108 | }
109 | 
110 | void pangulu_symbolic_add_prune(
111 |     pangulu_symbolic_node_t *prune,
112 |     pangulu_symbolic_node_t *prune_next,
113 |     pangulu_int64_t num,
114 |     pangulu_int64_t num_value,
115 |     pangulu_int64_t p)
116 | {
117 |     prune[num].value++;
118 |     prune_next[p].value = num_value;
119 |     prune_next[p].next = NULL;
120 |     pangulu_symbolic_node_t *p2 = &prune[num];
121 |     for (;;)
122 |     {
123 |         if (p2->next == NULL)
124 |         {
125 |             break;
126 |         }
127 |         p2 = p2->next;
128 |     }
129 |     p2->next = &prune_next[p];
130 | }
131 | 
132 | void pangulu_symbolic_symmetric(
133 |     pangulu_exblock_idx n,
134 |     pangulu_exblock_ptr nnz,
135 |     pangulu_exblock_idx *ai,
136 |     pangulu_exblock_ptr *ap,
137 |     pangulu_exblock_ptr **symbolic_rowpointer,
138 |     pangulu_exblock_idx **symbolic_columnindex,
139 |     pangulu_inblock_idx nb,
140 |     pangulu_exblock_idx block_length,
141 |     pangulu_exblock_ptr *symbolic_nnz)
142 | {
143 |     pangulu_exblock_ptr realloc_capacity = nnz;
144 |     pangulu_exblock_idx *L_r_idx = (pangulu_exblock_idx *)pangulu_malloc(__FILE__, __LINE__, realloc_capacity * sizeof(pangulu_exblock_idx)); // include diagonal
145 |     pangulu_exblock_ptr *L_c_ptr = (pangulu_exblock_ptr *)pangulu_malloc(__FILE__, __LINE__, (n + 1) * sizeof(pangulu_exblock_ptr));
146 |     L_c_ptr[0] = 0;
147 | 
148 |     pangulu_symbolic_node_t *prune = (pangulu_symbolic_node_t *)pangulu_malloc(__FILE__, __LINE__, n * sizeof(pangulu_symbolic_node_t));
149 |     pangulu_symbolic_node_t *prune_next = (pangulu_symbolic_node_t *)pangulu_malloc(__FILE__, __LINE__, n * sizeof(pangulu_symbolic_node_t));
150 |     pangulu_symbolic_node_t *p1;
151 | 
152 |     pangulu_int64_t *work_space = (pangulu_int64_t *)pangulu_malloc(__FILE__, __LINE__, n * sizeof(pangulu_int64_t));
153 |     pangulu_int64_t *merge = (pangulu_int64_t *)pangulu_malloc(__FILE__, __LINE__, n * sizeof(pangulu_int64_t));
154 | 
155 |     for (pangulu_exblock_idx i = 0; i < n; i++)
156 |     {
157 |         work_space[i] = -1;
158 |         prune[i].value = 0;
159 |         prune[i].next = NULL;
160 |         prune_next[i].value = -1;
161 |         prune_next[i].next = NULL;
162 |     }
163 |     pangulu_int64_t L_maxsize = realloc_capacity;
164 |     pangulu_int64_t L_size = 0;
165 | 
166 |     pangulu_int64_t row = -1;
167 |     pangulu_int64_t num_merge = 0;
168 | 
169 |     pangulu_int64_t p = 0;
170 |     for (pangulu_exblock_idx i = 0; i < n; i++)
171 |     {
172 | 
173 |         pangulu_exblock_idx n_rows = ap[i + 1] - ap[i];
174 | 
175 |         for (pangulu_exblock_idx k = 0; k < n_rows; k++)
176 |         {
177 | 
178 |             row = (ai + ap[i])[k];
179 |             if (row >= i)
180 |             {
181 |                 work_space[row] = i;
182 |                 L_r_idx[L_size] = row;
183 |                 L_size++;
184 |                 if (L_size + 1 > L_maxsize)
185 |                 {
186 |                     L_r_idx = (pangulu_exblock_idx *)pangulu_realloc(__FILE__, __LINE__, L_r_idx, (L_maxsize + realloc_capacity) * sizeof(pangulu_exblock_idx));
187 |                     L_maxsize = L_maxsize + realloc_capacity;
188 |                 }
189 |             }
190 |         }
191 | 
192 |         num_merge = prune[i].value;
193 |         p1 = &prune[i];
194 |         for (pangulu_int64_t k = 0;; k++)
195 |         {
196 |             if (p1->next == NULL)
197 |                 break;
198 |             p1 = p1->next;
199 |             merge[k] = p1->value;
200 |         }
201 |         for (pangulu_int64_t k = 0; k < num_merge; k++)
202 |         {
203 |             row = merge[k];
204 |             pangulu_int64_t min = L_c_ptr[row];
205 |             pangulu_int64_t max = L_c_ptr[row + 1];
206 |             for (pangulu_int64_t j = min; j < max; j++)
207 |             {
208 |                 pangulu_int64_t crow = L_r_idx[j];
209 | 
210 |                 if (crow > i && work_space[crow] != i)
211 |                 {
212 |                     work_space[crow] = i;
213 |                     L_r_idx[L_size] = crow;
214 |                     L_size++;
215 |                     if (L_size + 1 > L_maxsize)
216 |                     {
217 |                         L_r_idx = (pangulu_exblock_idx *)pangulu_realloc(__FILE__, __LINE__, L_r_idx, (L_maxsize + realloc_capacity) * sizeof(pangulu_exblock_idx));
218 |                         L_maxsize = L_maxsize + realloc_capacity;
219 |                     }
220 |                 }
221 |             }
222 |         }
223 |         L_c_ptr[i + 1] = L_size;
224 | 
225 |         if (L_c_ptr[i + 1] - L_c_ptr[i] > 1)
226 |         {
227 |             pangulu_int64_t todo_prune = n + 1;
228 |             for (pangulu_int64_t k = L_c_ptr[i]; k < L_c_ptr[i + 1]; k++)
229 |             {
230 |                 if (todo_prune > L_r_idx[k] && L_r_idx[k] > i)
231 |                     todo_prune = L_r_idx[k];
232 |             }
233 |             pangulu_symbolic_add_prune(prune, prune_next, todo_prune, i, p);
234 |             p++;
235 |         }
236 |     }
237 |     pangulu_free(__FILE__, __LINE__, work_space);
238 |     pangulu_free(__FILE__, __LINE__, merge);
239 |     pangulu_free(__FILE__, __LINE__, prune);
240 |     pangulu_free(__FILE__, __LINE__, prune_next);
241 | 
242 |     *symbolic_nnz = L_size * 2 - n;
243 |     *symbolic_rowpointer = L_c_ptr;
244 |     *symbolic_columnindex = L_r_idx;
245 | 
246 |     printf(PANGULU_I_SYMBOLIC_NONZERO);
247 | }
248 | 
249 | void pangulu_symbolic(
250 |     pangulu_block_common *block_common,
251 |     pangulu_block_smatrix *block_smatrix,
252 |     pangulu_origin_smatrix *reorder_matrix)
253 | {
254 |     pangulu_exblock_ptr *symmetric_columnpointer = NULL;
255 |     pangulu_exblock_idx *symmetric_rowindex = NULL;
256 |     pangulu_exblock_ptr symmetric_nnz;
257 |     pangulu_a_plus_at(reorder_matrix->row, reorder_matrix->nnz,
258 |                       reorder_matrix->columnpointer, reorder_matrix->rowindex,
259 |                       &symmetric_nnz, &symmetric_columnpointer, &symmetric_rowindex);
260 |     pangulu_exblock_ptr *symbolic_columnpointer = NULL;
261 |     pangulu_exblock_idx *symbolic_rowindex = NULL;
262 |     pangulu_inblock_idx nb = block_common->nb;
263 |     pangulu_exblock_idx block_length = block_common->block_length;
264 |     pangulu_symbolic_symmetric(reorder_matrix->row, symmetric_nnz, symmetric_rowindex, symmetric_columnpointer,
265 |                                &symbolic_columnpointer, &symbolic_rowindex,
266 |                                nb, block_length,
267 |                                &block_smatrix->symbolic_nnz);
268 |     pangulu_free(__FILE__, __LINE__, symmetric_columnpointer);
269 |     pangulu_free(__FILE__, __LINE__, symmetric_rowindex);
270 |     block_smatrix->symbolic_rowpointer = symbolic_columnpointer;
271 |     block_smatrix->symbolic_columnindex = symbolic_rowindex;
272 | }


--------------------------------------------------------------------------------
/src/pangulu_thread.c:
--------------------------------------------------------------------------------
 1 | #include "pangulu_common.h"
 2 | 
 3 | void pangulu_bind_to_core(pangulu_int32_t core)
 4 | {
 5 |     cpu_set_t cpuset;
 6 |     CPU_ZERO(&cpuset);
 7 |     CPU_SET(core, &cpuset);
 8 |     if (pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset) != 0)
 9 |     {
10 |         printf(PANGULU_W_BIND_CORE_FAIL);
11 |     }
12 | }
13 | 
14 | void pangulu_mutex_init(pthread_mutex_t *mutex)
15 | {
16 |     pthread_mutex_init((mutex), NULL);
17 | }
18 | 
19 | void pangulu_bsem_init(pangulu_bsem_t *bsem_p, pangulu_int64_t value)
20 | {
21 |     if (value < 0 || value > 1)
22 |     {
23 |         exit(1);
24 |     }
25 |     bsem_p->mutex = (pthread_mutex_t *)pangulu_malloc(__FILE__, __LINE__, sizeof(pthread_mutex_t));
26 |     bsem_p->cond = (pthread_cond_t *)pangulu_malloc(__FILE__, __LINE__, sizeof(pthread_cond_t));
27 |     pangulu_mutex_init((bsem_p->mutex));
28 |     pthread_cond_init((bsem_p->cond), NULL);
29 |     bsem_p->v = value;
30 | }
31 | 
32 | pangulu_bsem_t *pangulu_bsem_destory(pangulu_bsem_t *bsem_p)
33 | {
34 |     pangulu_free(__FILE__, __LINE__, bsem_p->mutex);
35 |     bsem_p->mutex = NULL;
36 |     pangulu_free(__FILE__, __LINE__, bsem_p->cond);
37 |     bsem_p->cond = NULL;
38 |     bsem_p->v = 0;
39 |     pangulu_free(__FILE__, __LINE__, bsem_p);
40 |     return NULL;
41 | }
42 | 
43 | void pangulu_bsem_post(pangulu_task_queue_t *heap)
44 | {
45 |     pangulu_bsem_t *bsem_p = heap->heap_bsem;
46 |     pthread_mutex_lock(bsem_p->mutex);
47 |     pangulu_int64_t flag = pangulu_task_queue_empty(heap);
48 |     if (((bsem_p->v == 0) && (flag == 0)))
49 |     {
50 |         bsem_p->v = 1;
51 |         pthread_cond_signal(bsem_p->cond);
52 |     }
53 |     pthread_mutex_unlock(bsem_p->mutex);
54 | }
55 | 
56 | void pangulu_bsem_stop(pangulu_task_queue_t *heap)
57 | {
58 |     pangulu_bsem_t *bsem_p = heap->heap_bsem;
59 |     pthread_mutex_lock(bsem_p->mutex);
60 |     bsem_p->v = 0;
61 |     pthread_mutex_unlock(bsem_p->mutex);
62 | }
63 | 
64 | void pangulu_bsem_synchronize(pangulu_bsem_t *bsem_p)
65 | {
66 |     pthread_mutex_lock((bsem_p->mutex));
67 |     pangulu_int64_t v = bsem_p->v;
68 |     if (v == 1)
69 |     {
70 |         bsem_p->v = 0;
71 |         pthread_cond_signal(bsem_p->cond);
72 |         pthread_mutex_unlock(bsem_p->mutex);
73 |     }
74 |     else
75 |     {
76 |         bsem_p->v = 1;
77 |         while (bsem_p->v == 1)
78 |         {
79 |             pthread_cond_wait((bsem_p->cond), (bsem_p->mutex));
80 |             bsem_p->v = 0;
81 |         }
82 |         bsem_p->v = 0;
83 |         pthread_mutex_unlock(bsem_p->mutex);
84 |     }
85 | }


--------------------------------------------------------------------------------
/src/platforms/01_SHAREDMEM/00_CPU/000_CPU/Makefile:
--------------------------------------------------------------------------------
1 | include ../../../../../make.inc
2 | pangulu_platform_0100000.o:pangulu_platform_0100000.c
3 | 	$(CC) $(COMPILE_LEVEL) $(CFLAGS) $(PANGULU_FLAGS) -fPIC -c $< -o $@
4 | 	mv $@ ../../../../../lib


--------------------------------------------------------------------------------
/src/platforms/01_SHAREDMEM/00_CPU/000_CPU/pangulu_platform_0100000.h:
--------------------------------------------------------------------------------
 1 | #ifndef PANGULU_PLATFORM_0100000
 2 | #define PANGULU_PLATFORM_0100000
 3 | #include "../../../../pangulu_common.h"
 4 | 
 5 | void pangulu_platform_0100000_malloc(void** platform_address, size_t size);
 6 | 
 7 | void pangulu_platform_0100000_malloc_pinned(void** platform_address, size_t size);
 8 | 
 9 | void pangulu_platform_0100000_synchronize();
10 | 
11 | void pangulu_platform_0100000_memset(void* s, int c, size_t n);
12 | 
13 | void pangulu_platform_0100000_create_stream(void** stream);
14 | 
15 | void pangulu_platform_0100000_memcpy(void *dst, const void *src, size_t count, unsigned int kind);
16 | 
17 | void pangulu_platform_0100000_memcpy_async(void *dst, const void *src, size_t count, unsigned int kind, void* stream);
18 | 
19 | void pangulu_platform_0100000_free(void* devptr);
20 | 
21 | void pangulu_platform_0100000_get_device_num(int* device_num);
22 | 
23 | void pangulu_platform_0100000_set_default_device(int device_num);
24 | 
25 | void pangulu_platform_0100000_get_device_name(char* name, int device_num);
26 | 
27 | void pangulu_platform_0100000_get_device_memory_usage(size_t *used_byte);
28 | 
29 | 
30 | void pangulu_platform_0100000_getrf(
31 |     pangulu_inblock_idx nb,
32 |     pangulu_storage_slot_t* opdst,
33 |     int tid
34 | );
35 | void pangulu_platform_0100000_tstrf(
36 |     pangulu_inblock_idx nb,
37 |     pangulu_storage_slot_t* opdst,
38 |     pangulu_storage_slot_t* opdiag,
39 |     int tid
40 | );
41 | void pangulu_platform_0100000_gessm(
42 |     pangulu_inblock_idx nb,
43 |     pangulu_storage_slot_t* opdst,
44 |     pangulu_storage_slot_t* opdiag,
45 |     int tid
46 | );
47 | void pangulu_platform_0100000_ssssm(
48 |     pangulu_inblock_idx nb,
49 |     pangulu_storage_slot_t* opdst,
50 |     pangulu_storage_slot_t* op1,
51 |     pangulu_storage_slot_t* op2,
52 |     int tid
53 | );
54 | void pangulu_platform_0100000_ssssm_batched(
55 |     pangulu_inblock_idx nb,
56 |     pangulu_uint64_t ntask,
57 |     pangulu_task_t* tasks
58 | );
59 | void pangulu_platform_0100000_hybrid_batched(
60 |     pangulu_inblock_idx nb,
61 |     pangulu_uint64_t ntask,
62 |     pangulu_task_t* tasks
63 | );
64 | 
65 | 
66 | void pangulu_platform_0100000_spmv(
67 |     pangulu_inblock_idx nb,
68 |     pangulu_storage_slot_t* a,
69 |     calculate_type* x,
70 |     calculate_type* y
71 | );
72 | void pangulu_platform_0100000_vecadd(
73 |     pangulu_int64_t length,
74 |     calculate_type *bval, 
75 |     calculate_type *xval
76 | );
77 | void pangulu_platform_0100000_sptrsv(
78 |     pangulu_inblock_idx nb,
79 |     pangulu_storage_slot_t *s,
80 |     calculate_type* xval,
81 |     pangulu_int64_t uplo
82 | );
83 | 
84 | #endif


--------------------------------------------------------------------------------
/src/platforms/02_NONSHAREDMEM/01_GPU/000_CUDA/Makefile:
--------------------------------------------------------------------------------
1 | include ../../../../../make.inc
2 | pangulu_platform_0201000.o:pangulu_platform_0201000.cu
3 | 	$(NVCC) $(COMPILE_LEVEL) $(NVCCFLAGS) $(PANGULU_FLAGS) -Xcompiler -fPIC  -c $< -o $@
4 | 	mv $@ ../../../../../lib


--------------------------------------------------------------------------------
/src/platforms/02_NONSHAREDMEM/01_GPU/000_CUDA/pangulu_platform_0201000.h:
--------------------------------------------------------------------------------
 1 | #include "../../../../pangulu_common.h"
 2 | 
 3 | #ifdef __cplusplus
 4 | extern "C"{
 5 | #endif
 6 | 
 7 | void pangulu_platform_0201000_malloc(void** platform_address, size_t size);
 8 | 
 9 | void pangulu_platform_0201000_malloc_pinned(void** platform_address, size_t size);
10 | 
11 | void pangulu_platform_0201000_synchronize();
12 | 
13 | void pangulu_platform_0201000_memset(void* s, int c, size_t n);
14 | 
15 | void pangulu_platform_0201000_create_stream(void** stream);
16 | 
17 | void pangulu_platform_0201000_memcpy(void *dst, const void *src, size_t count, unsigned int kind);
18 | 
19 | void pangulu_platform_0201000_memcpy_async(void *dst, const void *src, size_t count, unsigned int kind, void* stream);
20 | 
21 | void pangulu_platform_0201000_free(void* devptr);
22 | 
23 | void pangulu_platform_0201000_get_device_num(int* device_num);
24 | 
25 | void pangulu_platform_0201000_set_default_device(int device_num);
26 | 
27 | void pangulu_platform_0201000_get_device_name(char* name, int device_num);
28 | 
29 | void pangulu_platform_0201000_get_device_memory_usage(size_t *used_byte);
30 | 
31 | 
32 | void pangulu_platform_0201000_getrf(
33 |     pangulu_inblock_idx nb,
34 |     pangulu_storage_slot_t* opdst,
35 |     int tid
36 | );
37 | void pangulu_platform_0201000_tstrf(
38 |     pangulu_inblock_idx nb,
39 |     pangulu_storage_slot_t* opdst,
40 |     pangulu_storage_slot_t* opdiag,
41 |     int tid
42 | );
43 | void pangulu_platform_0201000_gessm(
44 |     pangulu_inblock_idx nb,
45 |     pangulu_storage_slot_t* opdst,
46 |     pangulu_storage_slot_t* opdiag,
47 |     int tid
48 | );
49 | void pangulu_platform_0201000_ssssm(
50 |     pangulu_inblock_idx nb,
51 |     pangulu_storage_slot_t* opdst,
52 |     pangulu_storage_slot_t* op1,
53 |     pangulu_storage_slot_t* op2,
54 |     int tid
55 | );
56 | void pangulu_platform_0201000_ssssm_batched(
57 |     pangulu_inblock_idx nb,
58 |     pangulu_uint64_t ntask,
59 |     pangulu_task_t* tasks
60 | );
61 | void pangulu_platform_0201000_hybrid_batched(
62 |     pangulu_inblock_idx nb,
63 |     pangulu_uint64_t ntask,
64 |     pangulu_task_t* tasks
65 | );
66 | 
67 | void pangulu_platform_0201000_spmv(
68 |     pangulu_inblock_idx nb,
69 |     pangulu_storage_slot_t* a,
70 |     calculate_type* x,
71 |     calculate_type* y
72 | );
73 | void pangulu_platform_0201000_vecadd(
74 |     pangulu_int64_t length,
75 |     calculate_type *bval, 
76 |     calculate_type *xval
77 | );
78 | void pangulu_platform_0201000_sptrsv(
79 |     pangulu_inblock_idx nb,
80 |     pangulu_storage_slot_t *s,
81 |     calculate_type* xval,
82 |     pangulu_int64_t uplo
83 | );
84 | 
85 | #ifdef __cplusplus
86 | }
87 | #endif


--------------------------------------------------------------------------------
/src/platforms/pangulu_platform_common.h:
--------------------------------------------------------------------------------
 1 | // Warning : Don't modify this file directly.
 2 | // This file is automatically generated by build_helper.py.
 3 | // This file will be regenerated after the next compilation.
 4 | // All changes will be lost.
 5 | #ifndef PANGULU_PLATFORM_HELPER
 6 | #define PANGULU_PLATFORM_HELPER
 7 | #include "01_SHAREDMEM/00_CPU/000_CPU/pangulu_platform_0100000.h"
 8 | #include "02_NONSHAREDMEM/01_GPU/000_CUDA/pangulu_platform_0201000.h"
 9 | typedef unsigned long long pangulu_platform_t;
10 | #define PANGULU_PLATFORM_CPU_NAIVE 0x0100000
11 | #define PANGULU_PLATFORM_GPU_CUDA 0x0201000
12 | void pangulu_platform_malloc(void** platform_address, size_t size, pangulu_platform_t platform);
13 | void pangulu_platform_malloc_pinned(void** platform_address, size_t size, pangulu_platform_t platform);
14 | void pangulu_platform_synchronize(pangulu_platform_t platform);
15 | void pangulu_platform_memset(void* s, int c, size_t n, pangulu_platform_t platform);
16 | void pangulu_platform_create_stream(void** stream, pangulu_platform_t platform);
17 | void pangulu_platform_memcpy(void *dst, const void *src, size_t count, unsigned int kind, pangulu_platform_t platform);
18 | void pangulu_platform_memcpy_async(void *dst, const void *src, size_t count, unsigned int kind, void* stream, pangulu_platform_t platform);
19 | void pangulu_platform_free(void* devptr, pangulu_platform_t platform);
20 | void pangulu_platform_get_device_num(int* device_num, pangulu_platform_t platform);
21 | void pangulu_platform_set_default_device(int device_num, pangulu_platform_t platform);
22 | void pangulu_platform_get_device_name(char* name, int device_num, pangulu_platform_t platform);
23 | void pangulu_platform_get_device_memory_usage(size_t* used_byte, pangulu_platform_t platform);
24 | void pangulu_platform_getrf(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, int tid, pangulu_platform_t platform);
25 | void pangulu_platform_tstrf(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* opdiag, int tid, pangulu_platform_t platform);
26 | void pangulu_platform_gessm(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* opdiag, int tid, pangulu_platform_t platform);
27 | void pangulu_platform_ssssm(pangulu_inblock_idx nb, pangulu_storage_slot_t* opdst, pangulu_storage_slot_t* op1, pangulu_storage_slot_t* op2, int tid, pangulu_platform_t platform);
28 | void pangulu_platform_ssssm_batched(pangulu_inblock_idx nb, pangulu_uint64_t ntask, pangulu_task_t* tasks, pangulu_platform_t platform);
29 | void pangulu_platform_hybrid_batched(pangulu_inblock_idx nb, pangulu_uint64_t ntask, pangulu_task_t* tasks, pangulu_platform_t platform);
30 | void pangulu_platform_spmv(pangulu_inblock_idx nb, pangulu_storage_slot_t* a, calculate_type* x, calculate_type* y, pangulu_platform_t platform);
31 | void pangulu_platform_vecadd(pangulu_int64_t length, calculate_type *bval, calculate_type *xval, pangulu_platform_t platform);
32 | void pangulu_platform_sptrsv(pangulu_inblock_idx nb, pangulu_storage_slot_t *s, calculate_type* xval, pangulu_int64_t uplo, pangulu_platform_t platform);
33 | #endif
34 | 


--------------------------------------------------------------------------------
/src/platforms/platform_list.csv:
--------------------------------------------------------------------------------
1 | 0100000,CPU_NAIVE
2 | 0201000,GPU_CUDA


--------------------------------------------------------------------------------