├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── examples ├── AFN_precond │ ├── GCC-OpenBLAS.make │ ├── ICC-MKL.make │ ├── IE_diag_quad.h │ ├── Nys_precond.c │ ├── Nys_precond.h │ ├── common.make │ ├── precond_test_utils.h │ ├── test_AFN.c │ ├── test_AFN_IE.c │ └── test_Nys.c ├── GCC-OpenBLAS.make ├── ICC-MKL.make ├── PCG │ ├── pcg.c │ └── pcg.h ├── SPDHSS-H2 │ ├── CSRPlus.c │ ├── CSRPlus.h │ ├── FSAI_precond.c │ ├── FSAI_precond.h │ ├── GCC-OpenBLAS.make │ ├── ICC-MKL.make │ ├── LRD_precond.c │ ├── LRD_precond.h │ ├── block_jacobi_precond.c │ ├── block_jacobi_precond.h │ ├── common.make │ ├── example_SPDHSSH2.c │ ├── example_SPDHSSH2_tol.c │ ├── example_regularHSS.c │ ├── parse_scalar_params.h │ ├── pcg_tests.c │ ├── pcg_tests.h │ ├── point_set │ │ ├── 3Dball_80000.csv │ │ └── 3Dsphere_80000.csv │ ├── test_FSAI.c │ └── test_FSAI_IE.c ├── common.make ├── direct_nbody.h ├── example_H2.c ├── example_H2_RPY.c ├── example_H2_tensor.c ├── example_HSS.c ├── example_read_H2_file.c ├── meta_json_to_txt.py └── meta_txt_to_json.py ├── extra ├── GCC-OpenBLAS.make ├── ICC-MKL.make ├── common.make ├── debug.h ├── direct_nbody.h ├── parse_scalar_params.h ├── parse_tensor_params.h ├── rand_3D_sphere_points.m ├── src-obsolete │ ├── H2P_build_H2_UJ_proxy_levelup.c │ └── H2P_generate_proxy_point_ID.c ├── test_H2_accuracy.c ├── test_H2_matmul.h ├── test_H2_scalar.c ├── test_H2_scalar_samplept.c ├── test_HSS_scalar.c ├── test_ID_compress.c ├── test_ID_compress_dim.c ├── test_kernel_SIMD.c └── test_scalar_matmul.c ├── pyh2pack ├── example.py ├── example_hss.py ├── example_samplept.py ├── pyh2pack.c ├── pyh2pack.h ├── pyh2pack_kernel.h ├── readme.md ├── setup.py └── setup_icc.py └── src ├── AFN_precond.c ├── AFN_precond.h ├── DAG_task_queue.c ├── DAG_task_queue.h ├── GCC-OpenBLAS.make ├── H2Pack.h ├── H2Pack_2D_kernels.h ├── H2Pack_3D_kernels.h ├── H2Pack_HSS_ULV.c ├── H2Pack_HSS_ULV.h ├── H2Pack_ID_compress.c ├── H2Pack_ID_compress.h ├── H2Pack_SPDHSS_H2.c ├── 
H2Pack_SPDHSS_H2.h ├── H2Pack_aux_structs.c ├── H2Pack_aux_structs.h ├── H2Pack_build.c ├── H2Pack_build.h ├── H2Pack_build_periodic.c ├── H2Pack_build_periodic.h ├── H2Pack_build_with_sample_point.c ├── H2Pack_build_with_sample_point.h ├── H2Pack_config.h ├── H2Pack_file_IO.c ├── H2Pack_file_IO.h ├── H2Pack_gen_proxy_point.c ├── H2Pack_gen_proxy_point.h ├── H2Pack_kernels.h ├── H2Pack_matmul.c ├── H2Pack_matmul.h ├── H2Pack_matmul_periodic.c ├── H2Pack_matmul_periodic.h ├── H2Pack_matvec.c ├── H2Pack_matvec.h ├── H2Pack_matvec_periodic.c ├── H2Pack_matvec_periodic.h ├── H2Pack_partition.c ├── H2Pack_partition.h ├── H2Pack_partition_periodic.c ├── H2Pack_partition_periodic.h ├── H2Pack_typedef.c ├── H2Pack_typedef.h ├── H2Pack_utils.c ├── H2Pack_utils.h ├── ICC-MKL.make ├── common.make ├── linalg_lib_wrapper.h ├── utils.c └── utils.h /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. 
Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | 54 | .vscode/ 55 | lib/ 56 | include/ 57 | install/ 58 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "src/ASTER"] 2 | path = src/ASTER 3 | url = https://github.com/huanghua1994/ASTER 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hua Huang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /examples/AFN_precond/GCC-OpenBLAS.make: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | USE_MKL = 0 3 | USE_OPENBLAS = 1 4 | 5 | include common.make -------------------------------------------------------------------------------- /examples/AFN_precond/ICC-MKL.make: -------------------------------------------------------------------------------- 1 | CC = icc 2 | USE_MKL = 1 3 | USE_OPENBLAS = 0 4 | 5 | include common.make -------------------------------------------------------------------------------- /examples/AFN_precond/Nys_precond.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "Nys_precond.h" 8 | #include "H2Pack_utils.h" 9 | 10 | // In AFN_precond.c 11 | void Nys_precond_build_( 12 | const DTYPE mu, const int n1, const int n2, DTYPE *K11, 13 | DTYPE *K12, DTYPE **nys_M_, DTYPE **nys_U_ 14 | ); 15 | void Nys_precond_apply_( 16 | const int n1, const int n, const DTYPE *nys_M, const DTYPE *nys_U, 17 | const DTYPE *x, DTYPE *y, DTYPE *t 18 | ); 19 | 20 | // Build a randomize Nystrom preconditioner for a kernel matrix 21 | void Nys_precond_build( 22 | kernel_eval_fptr krnl_eval, void *krnl_param, const int npt, const int pt_dim, 23 | const DTYPE *coord, const DTYPE mu, const int nys_k, Nys_precond_p *Nys_precond_ 24 | ) 25 | { 26 | Nys_precond_p Nys_precond = (Nys_precond_p) malloc(sizeof(Nys_precond_s)); 27 | memset(Nys_precond, 0, sizeof(Nys_precond_s)); 28 | 29 | // 1. 
Randomly select nys_k points from npt points 30 | int n = npt, n1 = nys_k, n2 = npt - nys_k; 31 | int *perm = (int *) malloc(sizeof(int) * n); 32 | uint8_t *flag = (uint8_t *) malloc(sizeof(uint8_t) * n); 33 | DTYPE *coord_perm = (DTYPE *) malloc(sizeof(DTYPE) * npt * pt_dim); 34 | H2P_rand_sample(npt, nys_k, perm, flag); 35 | memset(flag, 0, sizeof(uint8_t) * n); 36 | for (int i = 0; i < n1; i++) flag[perm[i]] = 1; 37 | int idx = n1; 38 | for (int i = 0; i < n; i++) 39 | if (flag[i] == 0) perm[idx++] = i; 40 | H2P_gather_matrix_columns(coord, npt, coord_perm, npt, pt_dim, perm, npt); 41 | 42 | // 2. Build K11 and K12 blocks 43 | DTYPE *coord_n1 = coord_perm; 44 | DTYPE *coord_n2 = coord_perm + n1; 45 | DTYPE *K11 = (DTYPE *) malloc(sizeof(DTYPE) * n1 * n1); 46 | DTYPE *K12 = (DTYPE *) malloc(sizeof(DTYPE) * n1 * n2); 47 | int n_thread = omp_get_max_threads(); 48 | ASSERT_PRINTF( 49 | K11 != NULL && K12 != NULL, 50 | "Failed to allocate Nystrom preconditioner K11/K12 buffers\n" 51 | ); 52 | H2P_eval_kernel_matrix_OMP( 53 | krnl_eval, krnl_param, 54 | coord_n1, n, n1, coord_n1, n, n1, 55 | K11, n1, n_thread 56 | ); 57 | H2P_eval_kernel_matrix_OMP( 58 | krnl_eval, krnl_param, 59 | coord_n1, n, n1, coord_n2, n, n2, 60 | K12, n2, n_thread 61 | ); 62 | free(coord_perm); 63 | free(flag); 64 | 65 | // 3. 
Build U and M matrices 66 | Nys_precond->n = n; 67 | Nys_precond->n1 = n1; 68 | Nys_precond->perm = perm; 69 | Nys_precond->t = (DTYPE*) malloc(sizeof(DTYPE) * n); 70 | Nys_precond->px = (DTYPE*) malloc(sizeof(DTYPE) * n); 71 | Nys_precond->py = (DTYPE*) malloc(sizeof(DTYPE) * n); 72 | Nys_precond_build_(mu, n1, n2, K11, K12, &Nys_precond->M, &Nys_precond->U); 73 | *Nys_precond_ = Nys_precond; 74 | } 75 | 76 | // Apply a Nystrom preconditioner to a vector 77 | void Nys_precond_apply(Nys_precond_p Nys_precond, const DTYPE *x, DTYPE *y) 78 | { 79 | int n = Nys_precond->n, n1 = Nys_precond->n1; 80 | int *perm = Nys_precond->perm; 81 | DTYPE *px = Nys_precond->px, *py = Nys_precond->py, *t1 = Nys_precond->t; 82 | DTYPE *M = Nys_precond->M, *U = Nys_precond->U; 83 | for (int i = 0; i < n; i++) px[i] = x[perm[i]]; 84 | Nys_precond_apply_(n1, n, M, U, px, py, t1); 85 | for (int i = 0; i < n; i++) y[perm[i]] = py[i]; 86 | } 87 | 88 | // Destroy an initialized Nys_precond struct 89 | void Nys_precond_destroy(Nys_precond_p *Nys_precond_) 90 | { 91 | Nys_precond_p p = *Nys_precond_; 92 | if (p == NULL) return; 93 | free(p->perm); 94 | free(p->M); 95 | free(p->U); 96 | free(p->t); 97 | free(p); 98 | } 99 | -------------------------------------------------------------------------------- /examples/AFN_precond/Nys_precond.h: -------------------------------------------------------------------------------- 1 | #ifndef __NYS_PRECOND_H__ 2 | #define __NYS_PRECOND_H__ 3 | 4 | #include "H2Pack.h" 5 | 6 | struct Nys_precond 7 | { 8 | int n; // Size of the kernel matrix, == number of points 9 | int n1; // Size of K11 block (== Nystrom approximation rank) 10 | int *perm; // Permutation array, size n 11 | DTYPE *t; // Size n, intermediate vectors in Nystrom_precond_apply 12 | DTYPE *px, *py; // Size n, permuted x and y in Nystrom_precond_apply 13 | DTYPE *U; // Size n * n1, row major, Nystrom basis 14 | DTYPE *M; // Size n1, Nystrom eigenvalues + diagonal shift, then scaled 15 | }; 16 | 
# Shared build rules for the AFN / Nystrom preconditioner examples.
# Included by GCC-OpenBLAS.make and ICC-MKL.make, which set CC, USE_MKL and
# USE_OPENBLAS before including this file.

H2PACK_DIR = ../..

# DEFS / INCS / CFLAGS intentionally stay recursively expanded ("="): DEFS and
# INCS are appended to further down and CFLAGS must pick up those additions.
DEFS    =
INCS    = -I$(H2PACK_DIR)/include
CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
LDFLAGS = -g -O3 -fopenmp
LIBS    =

# Compiler-specific flags, selected by looking at the "--version" banner
ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1)
CFLAGS  += -fopenmp -xHost
endif

ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1)
CFLAGS  += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
LIBS    += -lgfortran -lm
endif

# BLAS / LAPACK backend
ifeq ($(strip $(USE_MKL)), 1)
DEFS    += -DUSE_MKL
CFLAGS  += -mkl=parallel
LIBS    += -mkl
endif

ifeq ($(strip $(USE_OPENBLAS)), 1)
OPENBLAS_INSTALL_DIR = ../../../OpenBLAS-git/install
DEFS    += -DUSE_OPENBLAS
INCS    += -I$(OPENBLAS_INSTALL_DIR)/include
LIBS    += -L$(OPENBLAS_INSTALL_DIR)/lib -lopenblas
endif

# Expand the source list once instead of re-globbing on every reference
C_SRCS      := $(wildcard *.c)
C_OBJS      := $(C_SRCS:.c=.c.o)
EXES        := test_AFN.exe test_Nys.exe test_AFN_IE.exe
SHARED_OBJS := Nys_precond.c.o ../PCG/pcg.c.o

# Delete the default old-fashion double-suffix rules
.SUFFIXES:

# Do not leave a half-written object or executable behind when a recipe fails
.DELETE_ON_ERROR:

# Keep the object files, make would otherwise treat them as intermediates
.SECONDARY: $(C_OBJS) $(SHARED_OBJS)

# all and clean are commands, not files
.PHONY: all clean

all: $(EXES)

%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@

%.exe: %.c.o $(SHARED_OBJS) $(H2PACK_DIR)/lib/libH2Pack.a
	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)

clean:
	rm -f $(EXES) $(C_OBJS) $(SHARED_OBJS)
| #pragma omp simd 18 | for (int i = 0; i < h2pack->krnl_mat_size; i++) x[i] += shift_ * b[i]; 19 | } 20 | 21 | static void select_kernel( 22 | const int kid, const int pt_dim, const DTYPE kp, const DTYPE mu, const int npt, 23 | kernel_eval_fptr *krnl_eval_, kernel_bimv_fptr *krnl_bimv_, int *krnl_bimv_flops_ 24 | ) 25 | { 26 | shift_ = mu; 27 | n_ = npt; 28 | kernel_eval_fptr krnl_eval = NULL; 29 | kernel_bimv_fptr krnl_bimv = NULL; 30 | int krnl_bimv_flops = 0; 31 | switch (kid) 32 | { 33 | case 1: 34 | { 35 | if (pt_dim == 3) 36 | { 37 | krnl_eval = Gaussian_3D_eval_intrin_t; 38 | krnl_bimv = Gaussian_3D_krnl_bimv_intrin_t; 39 | krnl_bimv_flops = Gaussian_3D_krnl_bimv_flop; 40 | } else { 41 | krnl_eval = Gaussian_2D_eval_intrin_t; 42 | krnl_bimv = Gaussian_2D_krnl_bimv_intrin_t; 43 | krnl_bimv_flops = Gaussian_2D_krnl_bimv_flop; 44 | } 45 | printf("Test kernel: Gaussian k(x, y) = exp(-l * |x-y|^2), l = %.4f\n", kp); 46 | break; 47 | } 48 | case 2: 49 | { 50 | if (pt_dim == 3) 51 | { 52 | krnl_eval = Expon_3D_eval_intrin_t; 53 | krnl_bimv = Expon_3D_krnl_bimv_intrin_t; 54 | krnl_bimv_flops = Expon_3D_krnl_bimv_flop; 55 | } else { 56 | krnl_eval = Expon_2D_eval_intrin_t; 57 | krnl_bimv = Expon_2D_krnl_bimv_intrin_t; 58 | krnl_bimv_flops = Expon_2D_krnl_bimv_flop; 59 | } 60 | printf("Test kernel: Exponential k(x, y) = exp(-l * |x-y|), l = %.4f\n", kp); 61 | break; 62 | } 63 | case 3: 64 | { 65 | if (pt_dim == 3) 66 | { 67 | krnl_eval = Matern32_3D_eval_intrin_t; 68 | krnl_bimv = Matern32_3D_krnl_bimv_intrin_t; 69 | krnl_bimv_flops = Matern32_3D_krnl_bimv_flop; 70 | } else { 71 | krnl_eval = Matern32_2D_eval_intrin_t; 72 | krnl_bimv = Matern32_2D_krnl_bimv_intrin_t; 73 | krnl_bimv_flops = Matern32_2D_krnl_bimv_flop; 74 | } 75 | printf("Test kernel: 3/2 Matern k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|, l = %.4f\n", kp); 76 | break; 77 | } 78 | case 4: 79 | { 80 | if (pt_dim == 3) 81 | { 82 | krnl_eval = Matern52_3D_eval_intrin_t; 83 | krnl_bimv = 
Matern52_3D_krnl_bimv_intrin_t; 84 | krnl_bimv_flops = Matern32_3D_krnl_bimv_flop; 85 | } else { 86 | krnl_eval = Matern52_2D_eval_intrin_t; 87 | krnl_bimv = Matern52_2D_krnl_bimv_intrin_t; 88 | krnl_bimv_flops = Matern32_2D_krnl_bimv_flop; 89 | } 90 | printf("Test kernel: 5/2 Matern k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), l = %.4f\n", kp); 91 | break; 92 | } 93 | } 94 | *krnl_eval_ = krnl_eval; 95 | *krnl_bimv_ = krnl_bimv; 96 | *krnl_bimv_flops_ = krnl_bimv_flops; 97 | } 98 | 99 | static void H2mat_build( 100 | const int npt, const int pt_dim, DTYPE *coord, DTYPE reltol, kernel_eval_fptr krnl_eval, 101 | kernel_bimv_fptr krnl_bimv, int krnl_bimv_flops, void *krnl_param, H2Pack_p *h2mat_ 102 | ) 103 | { 104 | double st, et; 105 | H2Pack_p h2mat = NULL; 106 | int krnl_dim = 1, BD_JIT = 1; 107 | H2P_dense_mat_p *pp = NULL; 108 | printf("Building H2 representation with reltol = %.4e for kernel matrix...\n", reltol); 109 | H2P_init(&h2mat, pt_dim, krnl_dim, QR_REL_NRM, &reltol); 110 | H2P_calc_enclosing_box(pt_dim, npt, coord, NULL, &h2mat->root_enbox); 111 | H2P_partition_points(h2mat, npt, coord, 0, 0); 112 | st = get_wtime_sec(); 113 | H2P_generate_proxy_point_ID_file(h2mat, krnl_param, krnl_eval, NULL, &pp); 114 | et = get_wtime_sec(); 115 | printf("H2Pack proxy point selection time = %.3f s\n", et - st); 116 | st = get_wtime_sec(); 117 | H2P_build(h2mat, pp, BD_JIT, krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops); 118 | et = get_wtime_sec(); 119 | printf("H2Pack build time = %.3f s\n", et - st); 120 | H2P_print_statistic(h2mat); 121 | H2P_dense_mat_destroy(pp); 122 | *h2mat_ = h2mat; 123 | printf("\n"); 124 | } 125 | 126 | static void test_PCG( 127 | matvec_fptr Ax, void *Ax_param, matvec_fptr invMx, void *invMx_param, 128 | const int n, const int max_iter, const DTYPE CG_reltol 129 | ) 130 | { 131 | DTYPE relres; 132 | int flag, iter, pcg_print_level = 1; 133 | DTYPE *x = malloc(sizeof(DTYPE) * n); 134 | DTYPE *b = malloc(sizeof(DTYPE) * n); 135 | 
srand(126); // Match with Tianshi's code 136 | for (int i = 0; i < n; i++) 137 | { 138 | b[i] = (rand() / (DTYPE) RAND_MAX) - 0.5; 139 | x[i] = 0.0; 140 | } 141 | pcg( 142 | n, CG_reltol, max_iter, 143 | Ax, Ax_param, b, invMx, invMx_param, x, 144 | &flag, &relres, &iter, NULL, pcg_print_level 145 | ); 146 | free(x); 147 | free(b); 148 | } 149 | 150 | #endif 151 | -------------------------------------------------------------------------------- /examples/AFN_precond/test_AFN.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "precond_test_utils.h" 11 | #include "AFN_precond.h" 12 | #include "../PCG/pcg.h" 13 | 14 | int main(int argc, char **argv) 15 | { 16 | // Parse command line arguments 17 | int kid, npt, pt_dim, max_k, ss_npt, fsai_npt, fast_knn; 18 | DTYPE mu, kp, *coord = NULL; 19 | void *krnl_param = &kp; 20 | kernel_eval_fptr krnl_eval = NULL; 21 | kernel_bimv_fptr krnl_bimv = NULL; 22 | int krnl_bimv_flops = 0; 23 | if (argc < 10) 24 | { 25 | printf("Usage: %s kid kp mu npt pt_dim max_k ss_npt fsai_npt fast_knn coord_bin\n", argv[0]); 26 | printf(" - kid [int] : Kernel function ID\n"); 27 | printf(" 1 - Gaussian k(x, y) = exp(-l * |x-y|^2)\n"); 28 | printf(" 2 - Exponential k(x, y) = exp(-l * |x-y|)\n"); 29 | printf(" 3 - 3/2 Matern k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|\n"); 30 | printf(" 4 - 5/2 Matern k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|\n"); 31 | printf(" - kp [double] : Kernel function parameter (l)\n"); 32 | printf(" - mu [double] : Kernel matrix diagonal shift\n"); 33 | printf(" - npt [int] : Number of points\n"); 34 | printf(" - pt_dim [int] : Point dimension\n"); 35 | printf(" - max_k [int] : Maximum global low-rank approximation rank\n"); 36 | printf(" - ss_npt [int] : Number of points in the sample set\n"); 37 | printf(" - fsai_npt [int] : 
Maximum number of nonzeros in each row of the AFN FSAI matrix\n"); 38 | printf(" - fast_knn [0 or 1] : If AFN FSAI should use fast approximated KNN instead of exact KNN\n"); 39 | printf(" - coord_bin [str] : (Optional) Binary file containing the coordinates, size pt_dim * npt,\n"); 40 | printf(" row major, each column is a point coordinate\n"); 41 | return 255; 42 | } 43 | kid = atoi(argv[1]); 44 | kp = atof(argv[2]); 45 | mu = atof(argv[3]); 46 | npt = atoi(argv[4]); 47 | pt_dim = atoi(argv[5]); 48 | max_k = atoi(argv[6]); 49 | ss_npt = atoi(argv[7]); 50 | fsai_npt = atoi(argv[8]); 51 | fast_knn = atoi(argv[9]); 52 | coord = (DTYPE*) malloc(sizeof(DTYPE) * npt * pt_dim); 53 | if (kid < 1 || kid > 4) kid = 1; 54 | if (pt_dim < 2 || pt_dim > 3) pt_dim = 3; 55 | if (argc >= 11) 56 | { 57 | FILE *inf = fopen(argv[10], "rb"); 58 | fread(coord, sizeof(DTYPE), npt * pt_dim, inf); 59 | fclose(inf); 60 | } else { 61 | srand(814); // Match with Tianshi's code 62 | DTYPE scale = DPOW((DTYPE) npt, 1.0 / (DTYPE) pt_dim); 63 | for (int i = 0; i < npt * pt_dim; i++) coord[i] = scale * (rand() / (DTYPE)(RAND_MAX)); 64 | } 65 | select_kernel(kid, pt_dim, kp, mu, npt, &krnl_eval, &krnl_bimv, &krnl_bimv_flops); 66 | printf("Point set: %d points in %d-D\n", npt, pt_dim); 67 | printf("Linear system to solve: (K(X, X) + %.4f * I) * x = b\n", mu); 68 | printf("\nAFN preconditioner parameters:\n"); 69 | printf("- Maximum Nystrom approximation rank = %d\n", max_k); 70 | printf("- Maximum Rank estimation sampled points = %d\n", ss_npt); 71 | printf("- Maximum FSAI matrix nonzeros per row = %d\n", fsai_npt); 72 | printf("- Fast KNN for FSAI sparsity pattern = %s\n", fast_knn ? 
"Yes" : "No"); 73 | printf("\n"); 74 | 75 | // Build H2 matrix 76 | double st, et; 77 | H2Pack_p h2mat = NULL; 78 | DTYPE h2_reltol = 1e-8; 79 | H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat); 80 | 81 | // Build AFN preconditioner 82 | printf("Building AFN preconditioner...\n"); 83 | st = get_wtime_sec(); 84 | AFN_precond_p AFN_precond = NULL; 85 | void *h2mat_ = (fast_knn) ? (void *) h2mat : NULL; 86 | AFN_precond_build(krnl_eval, krnl_param, npt, pt_dim, coord, mu, max_k, ss_npt, fsai_npt, h2mat_, &AFN_precond); 87 | et = get_wtime_sec(); 88 | printf("AFN_precond build time = %.3lf s\n", et - st); 89 | printf("AFN estimated kernel matrix rank = %d, ", AFN_precond->est_rank); 90 | printf("will use %s\n", (AFN_precond->est_rank >= max_k) ? "AFN" : "Nystrom"); 91 | 92 | // PCG test 93 | DTYPE CG_reltol = 1e-4; 94 | int max_iter = 400; 95 | test_PCG( 96 | H2Pack_matvec_diagshift, (void *) h2mat, 97 | (matvec_fptr) AFN_precond_apply, (void *) AFN_precond, 98 | npt, max_iter, CG_reltol 99 | ); 100 | 101 | // Print AFN preconditioner statistics and clean up 102 | AFN_precond_print_stat(AFN_precond); 103 | printf("\n"); 104 | free(coord); 105 | H2P_destroy(&h2mat); 106 | AFN_precond_destroy(&AFN_precond); 107 | return 0; 108 | } 109 | -------------------------------------------------------------------------------- /examples/AFN_precond/test_Nys.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "Nys_precond.h" 10 | #include "precond_test_utils.h" 11 | #include "../PCG/pcg.h" 12 | 13 | int main(int argc, char **argv) 14 | { 15 | // Parse command line arguments 16 | int kid, npt, pt_dim, nys_k; 17 | DTYPE mu, kp, *coord = NULL; 18 | void *krnl_param = &kp; 19 | kernel_eval_fptr krnl_eval = NULL; 20 | kernel_bimv_fptr krnl_bimv = NULL; 21 | int krnl_bimv_flops = 0; 22 | if (argc < 
7) 23 | { 24 | printf("Usage: %s kid kp mu npt pt_dim nys_k coord_bin\n", argv[0]); 25 | printf(" - kid [int] : Kernel function ID\n"); 26 | printf(" 1 - Gaussian k(x, y) = exp(-l * |x-y|^2)\n"); 27 | printf(" 2 - Exponential k(x, y) = exp(-l * |x-y|)\n"); 28 | printf(" 3 - 3/2 Matern k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|\n"); 29 | printf(" 4 - 5/2 Matern k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|\n"); 30 | printf(" - kp [double] : Kernel function parameter (l)\n"); 31 | printf(" - mu [double] : Kernel matrix diagonal shift\n"); 32 | printf(" - npt [int] : Number of points\n"); 33 | printf(" - pt_dim [int] : Point dimension\n"); 34 | printf(" - nys_k [int] : Randomized Nystrom approximation rank\n"); 35 | printf(" - coord_bin [str] : (Optional) Binary file containing the coordinates, size pt_dim * npt,\n"); 36 | printf(" row major, each column is a point coordinate\n"); 37 | return 255; 38 | } 39 | kid = atoi(argv[1]); 40 | kp = atof(argv[2]); 41 | mu = atof(argv[3]); 42 | npt = atoi(argv[4]); 43 | pt_dim = atoi(argv[5]); 44 | nys_k = atoi(argv[6]); 45 | coord = (DTYPE*) malloc(sizeof(DTYPE) * npt * pt_dim); 46 | if (kid < 1 || kid > 4) kid = 1; 47 | if (pt_dim < 2 || pt_dim > 3) pt_dim = 3; 48 | if (argc >= 8) 49 | { 50 | FILE *inf = fopen(argv[7], "rb"); 51 | fread(coord, sizeof(DTYPE), npt * pt_dim, inf); 52 | fclose(inf); 53 | } else { 54 | srand(814); // Match with Tianshi's code 55 | DTYPE scale = DPOW((DTYPE) npt, 1.0 / (DTYPE) pt_dim); 56 | for (int i = 0; i < npt * pt_dim; i++) coord[i] = scale * (rand() / (DTYPE)(RAND_MAX)); 57 | } 58 | select_kernel(kid, pt_dim, kp, mu, npt, &krnl_eval, &krnl_bimv, &krnl_bimv_flops); 59 | printf("Point set: %d points in %d-D\n", npt, pt_dim); 60 | printf("Linear system to solve: (K(X, X) + %.4f * I) * x = b\n", mu); 61 | printf("Nystrom approximation rank: %d\n", nys_k); 62 | 63 | // Build H2 matrix 64 | double st, et; 65 | H2Pack_p h2mat = NULL; 66 | DTYPE h2_reltol = 1e-8; 67 | 
H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat); 68 | 69 | // Build Randomized Nystrom preconditioner 70 | printf("Building randomize Nystrom preconditioner...\n"); 71 | st = get_wtime_sec(); 72 | Nys_precond_p Nys_precond = NULL; 73 | Nys_precond_build(krnl_eval, krnl_param, npt, pt_dim, coord, mu, nys_k, &Nys_precond); 74 | et = get_wtime_sec(); 75 | printf("Nys_precond build time = %.3lf s\n", et - st); 76 | 77 | // PCG test 78 | DTYPE CG_reltol = 1e-4; 79 | int max_iter = 400; 80 | test_PCG( 81 | H2Pack_matvec_diagshift, (void *) h2mat, 82 | (matvec_fptr) Nys_precond_apply, (void *) Nys_precond, 83 | npt, max_iter, CG_reltol 84 | ); 85 | 86 | // Clean up 87 | printf("\n"); 88 | free(coord); 89 | H2P_destroy(&h2mat); 90 | Nys_precond_destroy(&Nys_precond); 91 | return 0; 92 | } -------------------------------------------------------------------------------- /examples/GCC-OpenBLAS.make: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | USE_MKL = 0 3 | USE_OPENBLAS = 1 4 | 5 | include common.make 6 | 7 | # GCC 10 need to manually specify using SVE, -march=native is not enough 8 | # On A64FX SVE vector bits = 512, on other SVE supported processors this value might be different 9 | USE_AARCH64_SVE = 0 10 | SVE_VECTOR_BITS = 512 11 | ifeq ($(strip $(USE_AARCH64_SVE)), 1) 12 | CFLAGS := $(subst -march=native, -march=armv8.2-a+sve -msve-vector-bits=$(SVE_VECTOR_BITS), $(CFLAGS)) 13 | endif -------------------------------------------------------------------------------- /examples/ICC-MKL.make: -------------------------------------------------------------------------------- 1 | CC = icc 2 | USE_MKL = 1 3 | USE_OPENBLAS = 0 4 | 5 | include common.make -------------------------------------------------------------------------------- /examples/PCG/pcg.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 
| #include 5 | #include 6 | #include 7 | 8 | #include "pcg.h" 9 | 10 | // Left preconditioned Conjugate Gradient for solving A * x = b 11 | void pcg( 12 | const int n, const DTYPE tol, const int max_iter, 13 | const matvec_fptr Ax, const void *Ax_param, const DTYPE *b, 14 | const matvec_fptr invMx, const void *invMx_param, DTYPE *x, 15 | int *flag_, DTYPE *relres_, int *iter_, DTYPE *res_vec, int print_level 16 | ) 17 | { 18 | size_t vec_msize = sizeof(DTYPE) * n; 19 | DTYPE *r = (DTYPE*) malloc(vec_msize); 20 | DTYPE *z = (DTYPE*) malloc(vec_msize); 21 | DTYPE *p = (DTYPE*) malloc(vec_msize); 22 | DTYPE *s = (DTYPE*) malloc(vec_msize); 23 | assert(r != NULL && z != NULL && p != NULL && s != NULL); 24 | 25 | double st, st0, et, et0; 26 | double t_Ax = 0, t_invMx = 0, t_vec = 0; 27 | st0 = omp_get_wtime(); 28 | 29 | // r = b - A * x; 30 | st = omp_get_wtime(); 31 | Ax(Ax_param, x, r); 32 | et = omp_get_wtime(); 33 | t_Ax += et - st; 34 | 35 | st = omp_get_wtime(); 36 | #pragma omp simd 37 | for (int i = 0; i < n; i++) r[i] = b[i] - r[i]; 38 | 39 | // b_2norm = norm(b, 2); 40 | // r_2norm = norm(r, 2); 41 | // rn_stop = b_2norm * tol; 42 | DTYPE b_2norm = 0.0, r_2norm = 0.0, rn_stop; 43 | #pragma omp simd 44 | for (int i = 0; i < n; i++) 45 | { 46 | b_2norm += b[i] * b[i]; 47 | r_2norm += r[i] * r[i]; 48 | } 49 | b_2norm = DSQRT(b_2norm); 50 | r_2norm = DSQRT(r_2norm); 51 | rn_stop = b_2norm * tol; 52 | et = omp_get_wtime(); 53 | t_vec += et - st; 54 | 55 | if (print_level > 0) 56 | { 57 | printf("\nPCG: ||b||_2 = %e, initial ||r||_2 = %e, stopping ||r||_2 = %e\n", b_2norm, r_2norm, rn_stop); 58 | printf("PCG: Max number of iterations: %d\n", max_iter); 59 | printf("Iter Residual norm Relative res. 
\n"); 60 | } 61 | 62 | int iter = 0; 63 | DTYPE alpha, beta, rho0, tmp, rho = 1.0; 64 | while (iter < max_iter && r_2norm > rn_stop) 65 | { 66 | // z = M \ r; 67 | st = omp_get_wtime(); 68 | if (invMx != NULL) invMx(invMx_param, r, z); 69 | else memcpy(z, r, vec_msize); 70 | et = omp_get_wtime(); 71 | t_invMx += et - st; 72 | 73 | // rho0 = rho; 74 | // rho = r' * z; 75 | // beta = rho / rho0; 76 | st = omp_get_wtime(); 77 | rho0 = rho; 78 | rho = 0.0; 79 | #pragma omp simd 80 | for (int i = 0; i < n; i++) rho += r[i] * z[i]; 81 | beta = rho / rho0; 82 | 83 | // p = z + beta * p; or p = z; 84 | if (iter == 0) memcpy(p, z, vec_msize); 85 | else 86 | { 87 | #pragma omp simd 88 | for (int i = 0; i < n; i++) p[i] = z[i] + beta * p[i]; 89 | } 90 | et = omp_get_wtime(); 91 | t_vec += et - st; 92 | 93 | // s = A * p; 94 | // alpha = rho / (p' * s); 95 | st = omp_get_wtime(); 96 | Ax(Ax_param, p, s); 97 | et = omp_get_wtime(); 98 | t_Ax += et - st; 99 | 100 | st = omp_get_wtime(); 101 | tmp = 0.0; 102 | #pragma omp simd 103 | for (int i = 0; i < n; i++) tmp += p[i] * s[i]; 104 | alpha = rho / tmp; 105 | 106 | // x = x + alpha * p; 107 | // r = r - alpha * s; 108 | r_2norm = 0.0; 109 | #pragma omp simd 110 | for (int i = 0; i < n; i++) 111 | { 112 | x[i] += alpha * p[i]; 113 | r[i] -= alpha * s[i]; 114 | r_2norm += r[i] * r[i]; 115 | } 116 | r_2norm = DSQRT(r_2norm); 117 | if (res_vec != NULL) res_vec[iter] = r_2norm; 118 | iter++; 119 | et = omp_get_wtime(); 120 | t_vec += et - st; 121 | 122 | if (print_level > 0) printf("%4d %5.4e %5.4e\n", iter, r_2norm, r_2norm / b_2norm); 123 | } // End of while 124 | *flag_ = (r_2norm <= rn_stop) ? 
0 : 1; 125 | *relres_ = r_2norm / b_2norm; 126 | *iter_ = iter; 127 | et0 = omp_get_wtime(); 128 | 129 | // Sanity check 130 | Ax(Ax_param, x, r); 131 | r_2norm = 0.0; 132 | #pragma omp simd 133 | for (int i = 0; i < n; i++) 134 | { 135 | r[i] = b[i] - r[i]; 136 | r_2norm += r[i] * r[i]; 137 | } 138 | r_2norm = DSQRT(r_2norm); 139 | 140 | if (print_level > 0) 141 | { 142 | printf("PCG: Final relres = %e\n", r_2norm / b_2norm); 143 | if (*flag_ == 0) printf("PCG: Converged in %d iterations, %.2f seconds\n", iter, et0 - st0); 144 | else printf("PCG: Reached maximum number of iterations, %.2f seconds\n", et0 - st0); 145 | printf("PCG: time for Ax, invMx, vector operations: %.2f, %.2f, %.2f seconds\n\n", t_Ax, t_invMx, t_vec); 146 | } 147 | 148 | free(r); 149 | free(z); 150 | free(p); 151 | free(s); 152 | } 153 | -------------------------------------------------------------------------------- /examples/PCG/pcg.h: -------------------------------------------------------------------------------- 1 | #ifndef __PCG_H__ 2 | #define __PCG_H__ 3 | 4 | #ifndef DTYPE 5 | #define DTYPE double 6 | #define DSQRT sqrt 7 | #endif 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | // b := A * x 14 | typedef void (*matvec_fptr) (const void *param, const DTYPE *x, DTYPE *b); 15 | 16 | // Left preconditioned Conjugate Gradient for solving A * x = b 17 | // Reference: Iterative Methods for Sparse Linear System (2nd Edition), algorithm 9.1 18 | // Input parameters: 19 | // n : Size of the matrix 20 | // tol : Residual vector norm tolerance 21 | // max_iter : Maximum number of iterations 22 | // Ax : Function pointer for calculating A * x 23 | // Ax_param : Pointer to Ax function parameters 24 | // b : Size n, right-hand size vector 25 | // invMx : Function pointer for applying preconditioner M^{-1} * r, 26 | // NULL pointer means no preconditioning 27 | // invMx_param : Pointer to invMx function parameters 28 | // x : Size n, initial guess vector 29 | // print_level : 
Positive integer, higher value means more output 30 | // Output parameters: 31 | // x : Size n, solution vector 32 | // *flag_ : 0 == converged, 1 == not converged 33 | // *relres_ : Residual vector relative 2-norm at last step 34 | // *iter_ : Number of iterations performed 35 | // res_vec : Size >= max_iter, Residual vector 2-norms (absolute, not divided by the 2-norm of b) at each iteration, 36 | // NULL pointer means these values will not be recorded 37 | void pcg( 38 | const int n, const DTYPE tol, const int max_iter, 39 | const matvec_fptr Ax, const void *Ax_param, const DTYPE *b, 40 | const matvec_fptr invMx, const void *invMx_param, DTYPE *x, 41 | int *flag_, DTYPE *relres_, int *iter_, DTYPE *res_vec, int print_level 42 | ); 43 | 44 | #ifdef __cplusplus 45 | } 46 | #endif 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/CSRPlus.h: -------------------------------------------------------------------------------- 1 | // --------------------------------------------------------------- 2 | // @brief : CSRPlus matrix header file 3 | // @author : Hua Huang 4 | // Edmond Chow 5 | // 6 | // Copyright (c) 2017-2020 Georgia Institute of Technology 7 | // ---------------------------------------------------------------- 8 | 9 | #ifndef __CSRPLUS_H__ 10 | #define __CSRPLUS_H__ 11 | 12 | struct CSRP_mat 13 | { 14 | // Standard CSR arrays and parameters 15 | int nrow, ncol, nnz; 16 | int *row_ptr; 17 | int *col; 18 | double *val; 19 | 20 | // CSRPlus task partitioning information 21 | int nblk; // Number of non-zero blocks 22 | int nthread; // Number of threads 23 | int *nnz_spos; // First nnz of a block 24 | int *nnz_epos; // Last nnz (included) of a block 25 | int *first_row; // First row of a block 26 | int *last_row; // Last row of a block 27 | int *fr_intact; // If the first row of a block is intact 28 | int *lr_intact; // If the last row of a block is intact 29 | double *fr_res; // Partial result of the first row 30 | double 
*lr_res; // Partial result of the last row 31 | }; 32 | 33 | typedef struct CSRP_mat CSRP_mat_s; 34 | typedef struct CSRP_mat* CSRP_mat_p; 35 | 36 | #ifdef __cplusplus 37 | extern "C" { 38 | #endif 39 | 40 | // Initialize a CSRP_mat structure using a COO matrix 41 | // Note: This function assumes that the input COO matrix is not sorted 42 | // Input parameters: 43 | // nrow, ncol, nnz : Number of rows, columns and non-zeros 44 | // row, col, val : Row indices, column indices and values of non-zeros 45 | // Output parameter: 46 | // *csrp_mat_ : Pointer to a initialized CSRP_mat structure 47 | void CSRP_init_with_COO_mat( 48 | const int nrow, const int ncol, const int nnz, const int *row, 49 | const int *col, const double *val, CSRP_mat_p *csrp_mat_ 50 | ); 51 | 52 | // Free a CSRP_mat structure 53 | // Input parameter: 54 | // *csrp_mat_ : Pointer to a CSRP_mat structure 55 | void CSRP_free(CSRP_mat_p *csrp_mat_); 56 | 57 | // Partition a CSR matrix into multiple blocks with the same nnz 58 | // for multiple threads execution of SpMV 59 | // Input parameters: 60 | // csrp_mat : Pointer to a CSRP_mat structure 61 | // nblk : Number of non-zero blocks 62 | // nthread : Number of threads to be used in SpMV later 63 | // Output parameter: 64 | // csrp_mat : Pointer to a CSRP_mat structure with partitioning information 65 | void CSRP_partition_multithread(CSRP_mat_p csrp_mat, const int nblk, const int nthread); 66 | 67 | // Use first-touch policy to optimize the storage of CSR arrays in a CSRP_mat structure 68 | // Input: 69 | // csrp_mat : Pointer to a CSRP_mat structure 70 | // Output: 71 | // csrp_mat : Pointer to a CSRP_mat structure with NUMA optimized storage 72 | void CSRP_optimize_NUMA(CSRP_mat_p csrp_mat); 73 | 74 | // Perform OpenMP parallelized CSR SpMV with a CSRP_mat structure 75 | // Input parameters: 76 | // csrp_mat : Pointer to an initialized and partitioned CSRP_mat structure 77 | // x : Input vector 78 | // Output parameter: 79 | // y : Output vector, 
will be overwritten by csrp_mat * x 80 | void CSRP_SpMV(CSRP_mat_p csrp_mat, const double *x, double *y); 81 | 82 | #ifdef __cplusplus 83 | } 84 | #endif 85 | 86 | #endif 87 | 88 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/FSAI_precond.h: -------------------------------------------------------------------------------- 1 | #ifndef __FSAI_PRECOND_H__ 2 | #define __FSAI_PRECOND_H__ 3 | 4 | #include "H2Pack.h" 5 | #include "CSRPlus.h" 6 | 7 | struct FSAI_precond 8 | { 9 | int mat_size; // Size of the matrix to be preconditioned 10 | int *fwd_pmt; // Forward permutation index array for input vector 11 | int *bwd_pmt; // Backward permutation index array for output vector 12 | DTYPE *x0; // Size mat_size, storing G * b in apply_FSAI_precond() 13 | DTYPE *pmt_b; // Size mat_size, storing the input vector after permutation 14 | DTYPE *pmt_x; // Size mat_size, storing the output vector before permutation 15 | CSRP_mat_p G, Gt; // FSAI constructed matrix and its transpose 16 | 17 | // Statistic info 18 | int n_apply; 19 | double t_apply, t_build; 20 | double mem_MB; 21 | }; 22 | typedef struct FSAI_precond FSAI_precond_s; 23 | typedef struct FSAI_precond* FSAI_precond_p; 24 | 25 | #ifdef __cplusplus 26 | extern "C" { 27 | #endif 28 | 29 | // Construct a FSAI_precond from a H2Pack structure 30 | // Input parameters: 31 | // h2pack : Constructed H2Pack structure 32 | // rank : Number of nearest neighbors 33 | // shift : Diagonal shifting of the target matrix 34 | // Output parameter: 35 | // *precond_ : Constructed FSAI_precond structure 36 | void H2P_build_FSAI_precond(H2Pack_p h2pack, const int rank, const DTYPE shift, FSAI_precond_p *precond_); 37 | 38 | // Apply FSAI preconditioner, x := M_{FSAI}^{-1} * b 39 | // Input parameters: 40 | // precond : Constructed FSAI_precond structure 41 | // b : Size precond->mat_size, input vector 42 | // Output parameter: 43 | // x : Size precond->mat_size, output vector 44 | void 
FSAI_precond_apply(FSAI_precond_p precond, const DTYPE *b, DTYPE *x); 45 | 46 | // Destroy a FSAI_precond structure 47 | // Input parameter: 48 | // *precond_ : Pointer to a FSAI_precond structure to be destroyed 49 | void FSAI_precond_destroy(FSAI_precond_p *precond_); 50 | 51 | // Print statistic info of a FSAI_precond structure 52 | // Input parameter: 53 | // precond : FSAI_precond structure whose statistic info to be printed 54 | void FSAI_precond_print_stat(FSAI_precond_p precond); 55 | 56 | #ifdef __cplusplus 57 | } 58 | #endif 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/GCC-OpenBLAS.make: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | USE_MKL = 0 3 | USE_OPENBLAS = 1 4 | 5 | include common.make -------------------------------------------------------------------------------- /examples/SPDHSS-H2/ICC-MKL.make: -------------------------------------------------------------------------------- 1 | CC = icc 2 | USE_MKL = 1 3 | USE_OPENBLAS = 0 4 | 5 | include common.make -------------------------------------------------------------------------------- /examples/SPDHSS-H2/LRD_precond.h: -------------------------------------------------------------------------------- 1 | #ifndef __LRD_PRECOND_H__ 2 | #define __LRD_PRECOND_H__ 3 | 4 | #include "H2Pack.h" 5 | 6 | struct LRD_precond 7 | { 8 | int mat_size; // Size of the matrix to be preconditioned 9 | int rank; // Rank of the low-rank decomposition 10 | int *fwd_pmt; // Forward permutation index array for input vector 11 | int *bwd_pmt; // Backward permutation index array for output vector 12 | DTYPE shift; // Diagonal shift 13 | DTYPE *Ut; // Size rank * mat_size, LRD matrix 14 | DTYPE *pmt_b; // Size mat_size, storing the input vector after permutation 15 | DTYPE *pmt_x; // Size mat_size, storing the output vector before permutation 16 | DTYPE *workbuf; // Size rank, working buffer in 
apply_LRD_precond 17 | 18 | // Statistic info 19 | int n_apply; 20 | double t_apply, t_build; 21 | double mem_MB; 22 | }; 23 | typedef struct LRD_precond LRD_precond_s; 24 | typedef struct LRD_precond* LRD_precond_p; 25 | 26 | #ifdef __cplusplus 27 | extern "C" { 28 | #endif 29 | 30 | // Construct a LRD_precond from a H2Pack structure using Nystrom method with random sampling 31 | // Input parameters: 32 | // h2pack : Constructed H2Pack structure 33 | // rank : Rank of the low-rank decomposition 34 | // shift : Diagonal shifting of the target matrix 35 | // Output parameter: 36 | // *precond_ : Constructed LRD_precond structure 37 | void H2P_build_LRD_precond(H2Pack_p h2pack, const int rank, const DTYPE shift, LRD_precond_p *precond_); 38 | 39 | // Apply LRD preconditioner, x := M_{LRD}^{-1} * b 40 | // Input parameters: 41 | // precond : Constructed LRD_precond structure 42 | // b : Size precond->mat_size, input vector 43 | // Output parameter: 44 | // x : Size precond->mat_size, output vector 45 | void LRD_precond_apply(LRD_precond_p precond, const DTYPE *b, DTYPE *x); 46 | 47 | // Destroy a LRD_precond structure 48 | // Input parameter: 49 | // *precond_ : Pointer to a LRD_precond structure to be destroyed 50 | void LRD_precond_destroy(LRD_precond_p *precond_); 51 | 52 | // Print statistic info of a LRD_precond structure 53 | // Input parameter: 54 | // precond : LRD_precond structure whose statistic info to be printed 55 | void LRD_precond_print_stat(LRD_precond_p precond); 56 | 57 | #ifdef __cplusplus 58 | } 59 | #endif 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/block_jacobi_precond.h: -------------------------------------------------------------------------------- 1 | #ifndef __BLOCK_JACOBI_PRECOND_H__ 2 | #define __BLOCK_JACOBI_PRECOND_H__ 3 | 4 | #include "H2Pack.h" 5 | 6 | struct block_jacobi_precond 7 | { 8 | int mat_size; // Size of the matrix to be preconditioned 9 | int 
n_block; // Number of blocks to use 10 | int *blk_sizes; // Size n_block, size of each block 11 | int *blk_displs; // Size n_block+1, start row & column of each block 12 | int *fwd_pmt; // Forward permutation index array for input vector 13 | int *bwd_pmt; // Backward permutation index array for output vector 14 | size_t *blk_inv_ptr; // Size n_block, offset of the inverse of each block 15 | DTYPE *pmt_b; // Size mat_size, storing the input vector after permutation 16 | DTYPE *pmt_x; // Size mat_size, storing the output vector before permutation 17 | DTYPE *blk_inv; // Size unknown, inverse of each block 18 | 19 | // Statistic info 20 | int n_apply; 21 | double t_apply, t_build; 22 | double mem_MB; 23 | }; 24 | typedef struct block_jacobi_precond block_jacobi_precond_s; 25 | typedef struct block_jacobi_precond* block_jacobi_precond_p; 26 | 27 | #ifdef __cplusplus 28 | extern "C" { 29 | #endif 30 | 31 | // Construct a block_jacobi_precond from a H2Pack structure 32 | // Input parameters: 33 | // h2pack : Constructed H2Pack structure 34 | // shift : Diagonal shifting of the target matrix 35 | // Output parameter: 36 | // *precond_ : Constructed block_jacobi_precond structure 37 | void H2P_build_block_jacobi_precond(H2Pack_p h2pack, const DTYPE shift, block_jacobi_precond_p *precond_); 38 | 39 | // Apply block Jacobi preconditioner, x := M_{BJP}^{-1} * b 40 | // Input parameters: 41 | // precond : Constructed block_jacobi_precond structure 42 | // b : Size precond->mat_size, input vector 43 | // Output parameter: 44 | // x : Size precond->mat_size, output vector 45 | void block_jacobi_precond_apply(block_jacobi_precond_p precond, const DTYPE *b, DTYPE *x); 46 | 47 | // Destroy a block_jacobi_precond structure 48 | // Input parameter: 49 | // *precond_ : Pointer to a block_jacobi_precond structure to be destroyed 50 | void block_jacobi_precond_destroy(block_jacobi_precond_p *precond_); 51 | 52 | // Print statistic info of a block_jacobi_precond structure 53 | // Input 
parameter: 54 | // precond : block_jacobi_precond structure whose statistic info to be printed 55 | void block_jacobi_precond_print_stat(block_jacobi_precond_p precond); 56 | 57 | #ifdef __cplusplus 58 | } 59 | #endif 60 | 61 | #endif 62 | 63 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/common.make: --------------------------------------------------------------------------------
# Shared build rules for the SPDHSS-H2 examples.
# This file is meant to be included by a compiler-specific wrapper
# (GCC-OpenBLAS.make or ICC-MKL.make) that sets CC, USE_MKL and USE_OPENBLAS.

H2PACK_DIR = ../..

# DEFS, INCS and CFLAGS intentionally use recursive `=`: CFLAGS references
# $(INCS) and $(DEFS), which are extended by the conditionals further down,
# so CFLAGS must be expanded late (at recipe time), not here.
DEFS    =
INCS    = -I$(H2PACK_DIR)/include
CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
LDFLAGS = -g -O3 -fopenmp
LIBS    =

# Compiler detection: look for the compiler name in the `--version` banner.
ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1)
  CFLAGS += -fopenmp -xHost
endif

ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1)
  CFLAGS += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
  LIBS   += -lgfortran -lm
endif

# BLAS/LAPACK backend selection.
ifeq ($(strip $(USE_MKL)), 1)
  DEFS   += -DUSE_MKL
  CFLAGS += -mkl=parallel
  LIBS   += -mkl
endif

ifeq ($(strip $(USE_OPENBLAS)), 1)
  OPENBLAS_INSTALL_DIR = ../../../OpenBLAS-git/install
  DEFS += -DUSE_OPENBLAS
  INCS += -I$(OPENBLAS_INSTALL_DIR)/include
  LIBS += -L$(OPENBLAS_INSTALL_DIR)/lib -lopenblas
endif

C_SRCS = $(wildcard *.c)
C_OBJS = $(C_SRCS:.c=.c.o)
EXES   = example_regularHSS.exe example_SPDHSSH2.exe example_SPDHSSH2_tol.exe test_FSAI.exe test_FSAI_IE.exe
# Objects linked into every executable. ../PCG/pcg.c.o lives outside this
# directory, so it is NOT part of C_OBJS and must be cleaned explicitly.
SHARED_OBJS = ../PCG/pcg.c.o block_jacobi_precond.c.o LRD_precond.c.o FSAI_precond.c.o CSRPlus.c.o pcg_tests.c.o

# Delete the default old-fashion double-suffix rules
.SUFFIXES:

# Remove a half-written target if its recipe fails, so it never looks up to date.
.DELETE_ON_ERROR:

# Keep object files that are only built as intermediate steps of the %.exe chain.
.SECONDARY: $(C_OBJS) $(SHARED_OBJS)

# `all` and `clean` are commands, not files; a stray file with either name
# must not shadow them.
.PHONY: all clean

all: $(EXES)

# Compile one translation unit; $< is the single .c prerequisite.
%.c.o: %.c
	$(CC) $(CFLAGS) -c $< -o $@

# Link one example against the shared objects and the static H2Pack library.
%.exe: %.c.o $(SHARED_OBJS) $(H2PACK_DIR)/lib/libH2Pack.a
	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)

# Remove everything this makefile builds, including the shared objects that
# live outside the current directory.
clean:
	$(RM) $(EXES) $(C_OBJS) $(SHARED_OBJS)
-------------------------------------------------------------------------------- /examples/SPDHSS-H2/pcg_tests.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "H2Pack.h" 7 | 8 | #include "../PCG/pcg.h" 9 | #include "block_jacobi_precond.h" 10 | #include "LRD_precond.h" 11 | #include "FSAI_precond.h" 12 | 13 | static DTYPE shift_; 14 | 15 | void H2Pack_matvec(const void *h2pack_, const DTYPE *b, DTYPE *x) 16 | { 17 | H2Pack_p h2pack = (H2Pack_p) h2pack_; 18 | H2P_matvec(h2pack, b, x); 19 | #pragma omp simd 20 | for (int i = 0; i < h2pack->krnl_mat_size; i++) x[i] += shift_ * b[i]; 21 | } 22 | 23 | void block_jacobi_precond(const void *precond_, const DTYPE *b, DTYPE *x) 24 | { 25 | block_jacobi_precond_p precond = (block_jacobi_precond_p) precond_; 26 | block_jacobi_precond_apply(precond, b, x); 27 | } 28 | 29 | void LRD_precond(const void *precond_, const DTYPE *b, DTYPE *x) 30 | { 31 | LRD_precond_p precond = (LRD_precond_p) precond_; 32 | LRD_precond_apply(precond, b, x); 33 | } 34 | 35 | void FSAI_precond(const void *precond_, const DTYPE *b, DTYPE *x) 36 | { 37 | FSAI_precond_p precond = (FSAI_precond_p) precond_; 38 | FSAI_precond_apply(precond, b, x); 39 | } 40 | 41 | void HSS_ULV_Chol_precond(const void *hssmat_, const DTYPE *b, DTYPE *x) 42 | { 43 | H2Pack_p hssmat = (H2Pack_p) hssmat_; 44 | H2P_HSS_ULV_Cholesky_solve(hssmat, 3, b, x); 45 | } 46 | 47 | // Test preconditioned conjugate gradient solver with different preconditioner 48 | void pcg_tests( 49 | const int krnl_mat_size, H2Pack_p h2mat, H2Pack_p hssmat, const DTYPE shift, 50 | const int max_rank, const int max_iter, const DTYPE CG_tol, const int method 51 | ) 52 | { 53 | DTYPE *x = malloc(sizeof(DTYPE) * krnl_mat_size); 54 | DTYPE *y = malloc(sizeof(DTYPE) * krnl_mat_size); 55 | assert(x != NULL && y != NULL); 56 | 57 | 58 | // Random right hand side vector 59 | srand48(2); 60 | for (int i = 0; 
i < krnl_mat_size; i++) y[i] = 0.5 - drand48(); 61 | 62 | int flag, iter, pcg_print_level = 1; 63 | DTYPE relres; 64 | double st, et; 65 | 66 | shift_ = shift; 67 | 68 | if (method == 0 || method == 1) 69 | { 70 | printf("\nStarting PCG solve without preconditioner...\n"); 71 | memset(x, 0, sizeof(DTYPE) * krnl_mat_size); 72 | st = get_wtime_sec(); 73 | pcg( 74 | krnl_mat_size, CG_tol, max_iter, 75 | H2Pack_matvec, h2mat, y, NULL, NULL, x, 76 | &flag, &relres, &iter, NULL, pcg_print_level 77 | ); 78 | et = get_wtime_sec(); 79 | printf("PCG stopped after %d iterations, relres = %e, used time = %.2lf sec\n", iter, relres, et - st); 80 | } 81 | 82 | if (method == 0 || method == 2) 83 | { 84 | printf("\nConstructing block Jacobi preconditioner...\n"); 85 | block_jacobi_precond_p bj_precond; 86 | H2P_build_block_jacobi_precond(h2mat, shift, &bj_precond); 87 | printf("Starting PCG solve with block Jacobi preconditioner...\n"); 88 | memset(x, 0, sizeof(DTYPE) * krnl_mat_size); 89 | st = get_wtime_sec(); 90 | pcg( 91 | krnl_mat_size, CG_tol, max_iter, 92 | H2Pack_matvec, h2mat, y, block_jacobi_precond, bj_precond, x, 93 | &flag, &relres, &iter, NULL, pcg_print_level 94 | ); 95 | et = get_wtime_sec(); 96 | printf("PCG stopped after %d iterations, relres = %e, used time = %.2lf sec\n", iter, relres, et - st); 97 | block_jacobi_precond_print_stat(bj_precond); 98 | block_jacobi_precond_destroy(&bj_precond); 99 | } 100 | 101 | if (method == 0 || method == 3) 102 | { 103 | printf("\nConstructing LRD preconditioner...\n"); 104 | LRD_precond_p lrd_precond; 105 | H2P_build_LRD_precond(h2mat, max_rank, shift, &lrd_precond); 106 | printf("Starting PCG solve with LRD preconditioner...\n"); 107 | memset(x, 0, sizeof(DTYPE) * krnl_mat_size); 108 | st = get_wtime_sec(); 109 | pcg( 110 | krnl_mat_size, CG_tol, max_iter, 111 | H2Pack_matvec, h2mat, y, LRD_precond, lrd_precond, x, 112 | &flag, &relres, &iter, NULL, pcg_print_level 113 | ); 114 | et = get_wtime_sec(); 115 | printf("PCG 
stopped after %d iterations, relres = %e, used time = %.2lf sec\n", iter, relres, et - st); 116 | LRD_precond_print_stat(lrd_precond); 117 | LRD_precond_destroy(&lrd_precond); 118 | } 119 | 120 | if (method == 0 || method == 4) 121 | { 122 | printf("\nConstructing FSAI preconditioner...\n"); 123 | FSAI_precond_p fsai_precond; 124 | H2P_build_FSAI_precond(h2mat, max_rank, shift, &fsai_precond); 125 | printf("Starting PCG solve with FSAI preconditioner...\n"); 126 | memset(x, 0, sizeof(DTYPE) * krnl_mat_size); 127 | st = get_wtime_sec(); 128 | pcg( 129 | krnl_mat_size, CG_tol, max_iter, 130 | H2Pack_matvec, h2mat, y, FSAI_precond, fsai_precond, x, 131 | &flag, &relres, &iter, NULL, pcg_print_level 132 | ); 133 | et = get_wtime_sec(); 134 | printf("PCG stopped after %d iterations, relres = %e, used time = %.2lf sec\n", iter, relres, et - st); 135 | FSAI_precond_print_stat(fsai_precond); 136 | FSAI_precond_destroy(&fsai_precond); 137 | } 138 | 139 | if (method == 0 || method == 5) 140 | { 141 | printf("\nStarting PCG solve with SPDHSS preconditioner...\n"); 142 | memset(x, 0, sizeof(DTYPE) * krnl_mat_size); 143 | st = get_wtime_sec(); 144 | pcg( 145 | krnl_mat_size, CG_tol, max_iter, 146 | H2Pack_matvec, h2mat, y, HSS_ULV_Chol_precond, hssmat, x, 147 | &flag, &relres, &iter, NULL, pcg_print_level 148 | ); 149 | et = get_wtime_sec(); 150 | printf("PCG stopped after %d iterations, relres = %e, used time = %.2lf sec\n", iter, relres, et - st); 151 | } 152 | 153 | free(x); 154 | free(y); 155 | } 156 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/pcg_tests.h: -------------------------------------------------------------------------------- 1 | #ifndef __PCG_TESTS_H__ 2 | #define __PCG_TESTS_H__ 3 | 4 | #include "H2Pack.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | // Test preconditioned conjugate gradient solver with different preconditioner 11 | // Input parameters: 12 | // krnl_mat_size : Size of 
the kernel matrix 13 | // h2mat : Constructed H2 matrix 14 | // hssmat : Constructed SPDHSS matrix 15 | // shift : Diagonal shift 16 | // max_rank : Maximum approximation rank for LRD and FSAI 17 | // max_iter : Maximum number of PCG iterations 18 | // CG_tol : Residual vector norm tolerance 19 | // method : Method(s) to be tested: 1-5: no precond, block Jaboci, LRD, 20 | // FSAI, HSS. 0: test all. 21 | void pcg_tests( 22 | const int krnl_mat_size, H2Pack_p h2mat, H2Pack_p hssmat, const DTYPE shift, 23 | const int max_rank, const int max_iter, const DTYPE CG_tol, const int method 24 | ); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/test_FSAI.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "FSAI_precond.h" 10 | #include "../AFN_precond/precond_test_utils.h" 11 | #include "../PCG/pcg.h" 12 | 13 | int main(int argc, char **argv) 14 | { 15 | // Parse command line arguments 16 | int kid, npt, pt_dim, fsai_npt, fast_knn; 17 | DTYPE mu, kp, *coord = NULL; 18 | void *krnl_param = &kp; 19 | kernel_eval_fptr krnl_eval = NULL; 20 | kernel_bimv_fptr krnl_bimv = NULL; 21 | int krnl_bimv_flops = 0; 22 | if (argc < 8) 23 | { 24 | printf("Usage: %s kid kp mu npt pt_dim fsai_npt fast_knn coord_bin\n", argv[0]); 25 | printf(" - kid [int] : Kernel function ID\n"); 26 | printf(" 1 - Gaussian k(x, y) = exp(-l * |x-y|^2)\n"); 27 | printf(" 2 - Exponential k(x, y) = exp(-l * |x-y|)\n"); 28 | printf(" 3 - 3/2 Matern k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|\n"); 29 | printf(" 4 - 5/2 Matern k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|\n"); 30 | printf(" - kp [double] : Kernel function parameter (l)\n"); 31 | printf(" - mu [double] : Kernel matrix diagonal shift\n"); 32 | printf(" - npt [int] : 
Number of points\n"); 33 | printf(" - pt_dim [int] : Point dimension\n"); 34 | printf(" - fsai_npt [int] : Maximum number of nonzeros in each row of the AFN FSAI matrix\n"); 35 | printf(" - fast_knn [0 or 1] : If AFN FSAI should use fast approximated KNN instead of exact KNN\n"); 36 | printf(" - coord_bin [str] : (Optional) Binary file containing the coordinates, size pt_dim * npt,\n"); 37 | printf(" row major, each column is a point coordinate\n"); 38 | return 255; 39 | } 40 | kid = atoi(argv[1]); 41 | kp = atof(argv[2]); 42 | mu = atof(argv[3]); 43 | npt = atoi(argv[4]); 44 | pt_dim = atoi(argv[5]); 45 | fsai_npt = atoi(argv[6]); 46 | fast_knn = atoi(argv[7]); 47 | coord = (DTYPE*) malloc(sizeof(DTYPE) * npt * pt_dim); 48 | if (kid < 1 || kid > 4) kid = 1; 49 | if (pt_dim < 2 || pt_dim > 3) pt_dim = 3; 50 | if (argc >= 9) 51 | { 52 | FILE *inf = fopen(argv[8], "rb"); 53 | fread(coord, sizeof(DTYPE), npt * pt_dim, inf); 54 | fclose(inf); 55 | } else { 56 | srand(814); // Match with Tianshi's code 57 | DTYPE scale = DPOW((DTYPE) npt, 1.0 / (DTYPE) pt_dim); 58 | for (int i = 0; i < npt * pt_dim; i++) coord[i] = scale * (rand() / (DTYPE)(RAND_MAX)); 59 | } 60 | select_kernel(kid, pt_dim, kp, mu, npt, &krnl_eval, &krnl_bimv, &krnl_bimv_flops); 61 | printf("Point set: %d points in %d-D\n", npt, pt_dim); 62 | printf("Linear system to solve: (K(X, X) + %.4f * I) * x = b\n", mu); 63 | printf("\nFSAI preconditioner parameters:\n"); 64 | printf("- Maximum FSAI matrix nonzeros per row = %d\n", fsai_npt); 65 | printf("- Fast KNN for FSAI sparsity pattern = %s\n", fast_knn ? 
"Yes" : "No"); 66 | printf("\n"); 67 | 68 | // Build H2 matrix 69 | double st, et; 70 | H2Pack_p h2mat = NULL; 71 | DTYPE h2_reltol = 1e-8; 72 | H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat); 73 | 74 | // Build FSAI preconditioner 75 | printf("Building FSAI preconditioner...\n"); 76 | FSAI_precond_p FSAI_precond = NULL; 77 | H2P_build_FSAI_precond(h2mat, fsai_npt, mu, &FSAI_precond); 78 | int nnz_upper = fsai_npt * (fsai_npt + 1) / 2 + fsai_npt * (npt - fsai_npt); 79 | DEBUG_PRINTF("FSAI G matrix nnz = %d, nnz upper bound = %d\n", FSAI_precond->G->nnz, nnz_upper); 80 | 81 | // PCG test 82 | DTYPE CG_reltol = 1e-4; 83 | int max_iter = 400; 84 | test_PCG( 85 | H2Pack_matvec_diagshift, (void *) h2mat, 86 | (matvec_fptr) FSAI_precond_apply, (void *) FSAI_precond, 87 | npt, max_iter, CG_reltol 88 | ); 89 | FSAI_precond_print_stat(FSAI_precond); 90 | 91 | // Clean up 92 | printf("\n"); 93 | free(coord); 94 | H2P_destroy(&h2mat); 95 | FSAI_precond_destroy(&FSAI_precond); 96 | return 0; 97 | } 98 | -------------------------------------------------------------------------------- /examples/SPDHSS-H2/test_FSAI_IE.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "H2Pack.h" 9 | #include "FSAI_precond.h" 10 | #include "../AFN_precond/IE_diag_quad.h" 11 | #include "../AFN_precond/precond_test_utils.h" 12 | 13 | // This file follows the test settings in the FLAM library rskelf/test/{ie_cube1.m, ie_square1.m} 14 | // Solve an intergral equation (IE): a_i * u_i + b_i * \sum_{j=1}^n K(x_i, x_j) * c_j * u_j = f_i, 15 | // with setting a(x_i) == 0, b(x_i) == c(x_i) == 1, K(x, y) is the Laplace kernel. 
16 | // This test setting is the same as Example 5 in paper DOI:10.1002/cpa.21577 17 | 18 | static DTYPE k_scale; // Scaling factor for the kernel function 19 | static void H2Pack_matvec_scale(const void *h2pack_, const DTYPE *b, DTYPE *x) 20 | { 21 | H2Pack_p h2pack = (H2Pack_p) h2pack_; 22 | H2P_matvec(h2pack, b, x); 23 | #pragma omp simd 24 | for (int i = 0; i < h2pack->krnl_mat_size; i++) x[i] *= k_scale; 25 | } 26 | 27 | int main(int argc, char **argv) 28 | { 29 | // Parse command line arguments 30 | int npt, pt_dim, dim_n; 31 | int fsai_npt, fast_knn, max_iter; 32 | DTYPE mu = 0.0, dv, solve_tol, h, *coord = NULL, *krnl_param = NULL; 33 | kernel_eval_fptr krnl_eval = NULL; 34 | kernel_bimv_fptr krnl_bimv = NULL; 35 | int krnl_bimv_flops = 0; 36 | if (argc < 7) 37 | { 38 | printf("Usage: %s pt_dim dim_n fsai_npt fast_knn solve_tol max_iter\n", argv[0]); 39 | printf(" - pt_dim [int] : Point dimension, 2 or 3\n"); 40 | printf(" - dim_n [int] : Number of discretization points in each dimension\n"); 41 | printf(" - fsai_npt [int] : FSAI nonzeros per row\n"); 42 | printf(" - fast_knn [0 or 1] : If FSAI should use fast approximated KNN instead of exact KNN\n"); 43 | printf(" - solve_tol [double] : PCG relative residual tolerance\n"); 44 | printf(" - max_iter [int] : PCG maximum iteration\n"); 45 | return 255; 46 | } 47 | pt_dim = atoi(argv[1]); 48 | dim_n = atoi(argv[2]); 49 | fsai_npt = atoi(argv[3]); 50 | fast_knn = atoi(argv[4]); 51 | solve_tol = atof(argv[5]); 52 | max_iter = atoi(argv[6]); 53 | printf("Point set: %d^%d equal-space points in [0, 1]^%d\n", dim_n, pt_dim, pt_dim); 54 | printf("Laplace kernel, K(x, y) = "); 55 | if (pt_dim == 2) 56 | { 57 | krnl_eval = Laplace_2D_eval_intrin_t; 58 | krnl_bimv = Laplace_2D_krnl_bimv_intrin_t; 59 | krnl_bimv_flops = Laplace_2D_krnl_bimv_flop; 60 | // Laplace_2D computes -log(|x - y|), so the scaling factor is 1 / (2 * pi) 61 | k_scale = 1.0 / (2.0 * M_PI); 62 | dv = diag_quad_2d[dim_n - 1]; 63 | printf("-1 / (2 * pi) 
* log(|x - y|), K(x, x) = %e\n", dv); 64 | } else { 65 | krnl_eval = Coulomb_3D_eval_intrin_t; 66 | krnl_bimv = Coulomb_3D_krnl_bimv_intrin_t; 67 | krnl_bimv_flops = Coulomb_3D_krnl_bimv_flop; 68 | // Coulomb_3D computes 1 / |x - y|, so the scaling factor is 1 / (4 * pi) 69 | k_scale = 1.0 / (4.0 * M_PI); 70 | dv = diag_quad_3d[dim_n - 1]; 71 | printf("1 / (4 * pi * |x - y|), K(x, x) = %e\n", dv); 72 | } 73 | printf("Linear system to solve: K(X, X) * x = b\n"); 74 | printf("PCG relative residual tolerance = %.2e, max iterations = %d\n", solve_tol, max_iter); 75 | printf("\nFSAI nonzeros per row = %d\n", fsai_npt); 76 | printf("\nFast KNN for FSAI sparsity pattern = %s\n", fast_knn ? "Yes" : "No"); 77 | 78 | // Generate equal-space grid 79 | h = 1.0 / (DTYPE) dim_n; 80 | npt = (pt_dim == 2) ? dim_n * dim_n : dim_n * dim_n * dim_n; 81 | n_ = npt; 82 | coord = (DTYPE*) malloc(sizeof(DTYPE) * pt_dim * npt); 83 | if (pt_dim == 2) 84 | { 85 | for (int i = 0; i < dim_n; i++) 86 | { 87 | for (int j = 0; j < dim_n; j++) 88 | { 89 | int idx = i * dim_n + j; 90 | coord[0 * npt + idx] = h * (i + 1); 91 | coord[1 * npt + idx] = h * (j + 1); 92 | } 93 | } 94 | } else { 95 | for (int i = 0; i < dim_n; i++) 96 | { 97 | for (int j = 0; j < dim_n; j++) 98 | { 99 | for (int k = 0; k < dim_n; k++) 100 | { 101 | int idx = i * dim_n * dim_n + j * dim_n + k; 102 | coord[0 * npt + idx] = h * (i + 1); 103 | coord[1 * npt + idx] = h * (j + 1); 104 | coord[2 * npt + idx] = h * (k + 1); 105 | } 106 | } 107 | } 108 | } 109 | 110 | // Scale the kernel matrix for area-weighted point interaction (what's this?) 111 | k_scale = k_scale / (DTYPE) npt; 112 | // Since the diagonal value will also be scaled by k_scale, we need to scale it back 113 | dv = dv / k_scale; 114 | krnl_param = &dv; 115 | 116 | // Build H2 matrix 117 | double st, et; 118 | H2Pack_p h2mat = NULL; 119 | DTYPE h2_reltol = (solve_tol < 1e-8) ? 
solve_tol : 1e-8; 120 | H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat); 121 | 122 | // Build FSAI preconditioner 123 | printf("Building FSAI preconditioner...\n"); 124 | FSAI_precond_p FSAI_precond = NULL; 125 | H2P_build_FSAI_precond(h2mat, fsai_npt, mu, &FSAI_precond); 126 | int nnz_upper = fsai_npt * (fsai_npt + 1) / 2 + fsai_npt * (npt - fsai_npt); 127 | DEBUG_PRINTF("FSAI G matrix nnz = %d, nnz upper bound = %d\n", FSAI_precond->G->nnz, nnz_upper); 128 | printf("\n\n"); 129 | 130 | // PCG test 131 | printf("\nTesting FSAI preconditioner...\n"); 132 | test_PCG( 133 | H2Pack_matvec_scale, (void *) h2mat, 134 | (matvec_fptr) FSAI_precond_apply, (void *) FSAI_precond, 135 | npt, max_iter, solve_tol 136 | ); 137 | 138 | // Clean up 139 | free(coord); 140 | H2P_destroy(&h2mat); 141 | FSAI_precond_destroy(&FSAI_precond); 142 | return 0; 143 | } 144 | -------------------------------------------------------------------------------- /examples/common.make: -------------------------------------------------------------------------------- 1 | H2PACK_INSTALL_DIR = .. 
2 | 3 | DEFS = 4 | INCS = -I$(H2PACK_INSTALL_DIR)/include 5 | CFLAGS = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS) 6 | LDFLAGS = -g -O3 -fopenmp 7 | LIBS = $(H2PACK_INSTALL_DIR)/lib/libH2Pack.a 8 | 9 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1) 10 | CFLAGS += -fopenmp -xHost 11 | endif 12 | 13 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1) 14 | CFLAGS += -fopenmp -march=native -Wno-unused-result -Wno-unused-function 15 | LIBS += -lgfortran -lm 16 | endif 17 | 18 | ifeq ($(strip $(USE_MKL)), 1) 19 | DEFS += -DUSE_MKL 20 | CFLAGS += -mkl 21 | LDFLAGS += -mkl 22 | endif 23 | 24 | ifeq ($(strip $(USE_OPENBLAS)), 1) 25 | OPENBLAS_INSTALL_DIR = ../../OpenBLAS-git/install 26 | DEFS += -DUSE_OPENBLAS 27 | INCS += -I$(OPENBLAS_INSTALL_DIR)/include 28 | LDFLAGS += -L$(OPENBLAS_INSTALL_DIR)/lib 29 | LIBS += -lopenblas 30 | endif 31 | 32 | C_SRCS = $(wildcard *.c) 33 | C_OBJS = $(C_SRCS:.c=.c.o) 34 | EXES = $(C_SRCS:.c=.exe) 35 | 36 | # Delete the default old-fashion double-suffix rules 37 | .SUFFIXES: 38 | 39 | .SECONDARY: $(C_OBJS) 40 | 41 | all: $(EXES) 42 | 43 | %.c.o: %.c 44 | $(CC) $(CFLAGS) -c $^ -o $@ 45 | 46 | %.exe: %.c.o 47 | $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) 48 | 49 | clean: 50 | rm -f $(EXES) $(C_OBJS) -------------------------------------------------------------------------------- /examples/direct_nbody.h: -------------------------------------------------------------------------------- 1 | 2 | void direct_nbody( 3 | const void *krnl_param, kernel_eval_fptr krnl_eval, const int pt_dim, const int krnl_dim, 4 | const DTYPE *src_coord, const int src_coord_ld, const int n_src_pt, const DTYPE *src_val, 5 | const DTYPE *dst_coord, const int dst_coord_ld, const int n_dst_pt, DTYPE *dst_val 6 | ) 7 | { 8 | const int npt_blk = 256; 9 | const int blk_size = npt_blk * krnl_dim; 10 | const int n_thread = omp_get_max_threads(); 11 | 12 | memset(dst_val, 0, sizeof(DTYPE) * n_dst_pt * krnl_dim); 13 | 14 | DTYPE *krnl_mat_buffs = (DTYPE*) 
malloc(sizeof(DTYPE) * n_thread * blk_size * blk_size); 15 | assert(krnl_mat_buffs != NULL); 16 | 17 | #pragma omp parallel 18 | { 19 | int tid = omp_get_thread_num(); 20 | DTYPE *krnl_mat_buff = krnl_mat_buffs + tid * blk_size * blk_size; 21 | // Split the destination points over the threads actually running here: the team may be smaller than omp_get_max_threads(), and splitting by the maximum would leave some points uncomputed 22 | int tid_dst_pt_s, tid_dst_pt_n, tid_dst_pt_e; 23 | calc_block_spos_len(n_dst_pt, omp_get_num_threads(), tid, &tid_dst_pt_s, &tid_dst_pt_n); 24 | tid_dst_pt_e = tid_dst_pt_s + tid_dst_pt_n; 25 | 26 | for (int dst_pt_idx = tid_dst_pt_s; dst_pt_idx < tid_dst_pt_e; dst_pt_idx += npt_blk) 27 | { 28 | int dst_pt_blk = (dst_pt_idx + npt_blk > tid_dst_pt_e) ? (tid_dst_pt_e - dst_pt_idx) : npt_blk; 29 | int krnl_mat_nrow = dst_pt_blk * krnl_dim; 30 | const DTYPE *dst_coord_ptr = dst_coord + dst_pt_idx; 31 | DTYPE *dst_val_ptr = dst_val + dst_pt_idx * krnl_dim; 32 | for (int src_pt_idx = 0; src_pt_idx < n_src_pt; src_pt_idx += npt_blk) 33 | { 34 | int src_pt_blk = (src_pt_idx + npt_blk > n_src_pt) ? (n_src_pt - src_pt_idx) : npt_blk; 35 | int krnl_mat_ncol = src_pt_blk * krnl_dim; 36 | const DTYPE *src_coord_ptr = src_coord + src_pt_idx; 37 | const DTYPE *src_val_ptr = src_val + src_pt_idx * krnl_dim; 38 | 39 | krnl_eval( 40 | dst_coord_ptr, dst_coord_ld, dst_pt_blk, 41 | src_coord_ptr, src_coord_ld, src_pt_blk, 42 | krnl_param, krnl_mat_buff, krnl_mat_ncol 43 | ); 44 | 45 | CBLAS_GEMV( 46 | CblasRowMajor, CblasNoTrans, krnl_mat_nrow, krnl_mat_ncol, 47 | 1.0, krnl_mat_buff, krnl_mat_ncol, src_val_ptr, 1, 1.0, dst_val_ptr, 1 48 | ); 49 | } 50 | } 51 | } 52 | //printf("Calculate direct n-body reference results for %d points done\n", n_dst_pt); 53 | free(krnl_mat_buffs); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /examples/example_H2.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "H2Pack_kernels.h" 11 | #include
"direct_nbody.h" 12 | 13 | int main(int argc, char **argv) 14 | { 15 | srand48(time(NULL)); 16 | double st, et; 17 | 18 | // Point configuration, random generation 19 | int pt_dim = 3; 20 | int n_point = 40000; 21 | DTYPE* coord = (DTYPE*) malloc_aligned(sizeof(DTYPE) * n_point * pt_dim, 64); 22 | assert(coord != NULL); 23 | 24 | DTYPE prefac = DPOW((DTYPE) n_point, 1.0 / (DTYPE) pt_dim); 25 | printf("Generating random coordinates in a scaled cubic box..."); 26 | for (int i = 0; i < n_point * pt_dim; i++) 27 | { 28 | coord[i] = (DTYPE) drand48(); 29 | coord[i] *= prefac; 30 | } 31 | printf(" done.\n"); 32 | 33 | // Kernel configuration 34 | int krnl_dim = 1; 35 | DTYPE *krnl_param = NULL; // Coulomb kernel has no parameter 36 | kernel_eval_fptr krnl_eval = Coulomb_3D_eval_intrin_t; 37 | kernel_bimv_fptr krnl_bimv = Coulomb_3D_krnl_bimv_intrin_t; 38 | int krnl_bimv_flops = Coulomb_3D_krnl_bimv_flop; 39 | 40 | // H2 construction configuration 41 | int krnl_mat_size = krnl_dim * n_point; 42 | DTYPE rel_tol = 1e-6; 43 | const int BD_JIT = 1; 44 | 45 | // Initialization of H2Pack 46 | H2Pack_p h2pack; 47 | H2P_init(&h2pack, pt_dim, krnl_dim, QR_REL_NRM, &rel_tol); 48 | 49 | // Hierarchical partitioning 50 | int max_leaf_points = 0; // use the default in h2pack for maximum number of points in the leaf node 51 | DTYPE max_leaf_size = 0.0; // use the default in h2pack for maximum edge length of leaf box 52 | char *pp_fname = "./PP_Coulomb3D_1e-6.dat"; // file name for storage and reuse of proxy points, can be set as NULL. 53 | H2P_calc_enclosing_box(pt_dim, n_point, coord, pp_fname, &h2pack->root_enbox); 54 | H2P_partition_points(h2pack, n_point, coord, max_leaf_points, max_leaf_size); 55 | 56 | // Select proxy points 57 | H2P_dense_mat_p *pp; 58 | // method 1: numerical proxy point selection, works for any kernel but require relatively expensive precomputation 59 | // the computed proxy points will be stored in `pp_fname' (if not NULL) for reuse if needed. 
60 | if (1) 61 | { 62 | st = get_wtime_sec(); 63 | H2P_generate_proxy_point_ID_file(h2pack, krnl_param, krnl_eval, pp_fname, &pp); 64 | et = get_wtime_sec(); 65 | printf("H2Pack generate numerical proxy points used %.3lf (s)...\n", et - st); 66 | } 67 | else 68 | { 69 | // method 2: proxy surface points, works for kernel from potential theory, has negligible cost. 70 | // The edge length of the root box enclosing all the points 71 | DTYPE max_L = h2pack->root_enbox[pt_dim]; 72 | // A heuristic but effective selection of the number of proxy surface points given the expected relative tolerance 73 | int num_pp, num_pp_dim = ceil(-log10(rel_tol)); 74 | if (num_pp_dim < 4 ) num_pp_dim = 4; 75 | if (num_pp_dim > 10) num_pp_dim = 10; 76 | if (pt_dim == 2) num_pp = 2 * pt_dim * num_pp_dim; 77 | if (pt_dim == 3) num_pp = 2 * pt_dim * num_pp_dim * num_pp_dim; 78 | st = get_wtime_sec(); 79 | H2P_generate_proxy_point_surface( 80 | pt_dim, pt_dim, num_pp, h2pack->max_level, 81 | h2pack->min_adm_level, max_L, &pp 82 | ); 83 | et = get_wtime_sec(); 84 | printf("H2Pack generate proxy surface points used %.3lf (s)...\n", et - st); 85 | } 86 | 87 | // Construct H2 matrix representation 88 | H2P_build(h2pack, pp, BD_JIT, krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops); 89 | 90 | // Check multiplication error at 20000 entries 91 | int n_check_pt = 20000, check_pt_s; 92 | if (n_check_pt >= n_point) 93 | { 94 | n_check_pt = n_point; 95 | check_pt_s = 0; 96 | } else { 97 | srand(time(NULL)); 98 | check_pt_s = rand() % (n_point - n_check_pt); 99 | } 100 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 101 | 102 | DTYPE *x, *y0, *y1; 103 | x = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size); 104 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_dim * n_check_pt); 105 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size); 106 | assert(x != NULL && y0 != NULL && y1 != NULL); 107 | for (int i = 0; i < krnl_mat_size; i++) 108 | 
x[i] = (DTYPE) drand48() - 0.5; 109 | 110 | 111 | // Get reference results 112 | st = get_wtime_sec(); 113 | direct_nbody( 114 | krnl_param, krnl_eval, pt_dim, krnl_dim, 115 | coord, n_point, n_point, x, 116 | coord + check_pt_s, n_point, n_check_pt, y0 117 | ); 118 | et = get_wtime_sec(); 119 | printf("Direct n-body for %d points takes %.3lf (s)\n", n_check_pt, et - st); 120 | 121 | // H2 matrix-vector multiplication 122 | st = get_wtime_sec(); 123 | H2P_matvec(h2pack, x, y1); 124 | et = get_wtime_sec(); 125 | printf("Full H2 matvec takes %.3lf (s)\n", et - st); 126 | 127 | // Print out details of the H2 matrix 128 | H2P_print_statistic(h2pack); 129 | 130 | // Verify H2 matvec results 131 | DTYPE y0_norm = 0.0, err_norm = 0.0; 132 | for (int i = 0; i < krnl_dim * n_check_pt; i++) 133 | { 134 | DTYPE diff = y1[krnl_dim * check_pt_s + i] - y0[i]; 135 | y0_norm += y0[i] * y0[i]; 136 | err_norm += diff * diff; 137 | } 138 | y0_norm = DSQRT(y0_norm); 139 | err_norm = DSQRT(err_norm); 140 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 141 | printf("The specified relative error threshold is %e\n", rel_tol); 142 | 143 | // Store H2 matrix data to file 144 | int store_to_file = 0; 145 | printf("Store H2 matrix data to file? 
1-yes, 0-no : "); 146 | scanf("%d", &store_to_file); 147 | if (store_to_file) 148 | { 149 | const char *meta_json_fname = "Coulomb_3D_1e-6_meta.json"; 150 | const char *aux_json_fname = "Coulomb_3D_1e-6_aux.json"; 151 | const char *binary_fname = "Coulomb_3D_1e-6.bin"; 152 | printf("Storing H2 matrix data to files %s, %s, and %s...", meta_json_fname, aux_json_fname, binary_fname); 153 | fflush(stdout); 154 | H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname); 155 | printf("done\n"); 156 | } 157 | 158 | free(x); 159 | free(y0); 160 | free(y1); 161 | free_aligned(coord); 162 | H2P_destroy(&h2pack); 163 | } 164 | -------------------------------------------------------------------------------- /examples/example_H2_tensor.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "H2Pack_kernels.h" 11 | #include "direct_nbody.h" 12 | 13 | int main(int argc, char **argv) 14 | { 15 | srand48(time(NULL)); 16 | double st, et; 17 | 18 | // Point configuration, random generation 19 | int pt_dim = 3; 20 | int n_point = 20000; 21 | DTYPE* coord = (DTYPE*) malloc_aligned(sizeof(DTYPE) * n_point * pt_dim, 64); 22 | assert(coord != NULL); 23 | 24 | DTYPE prefac = DPOW((DTYPE) n_point, 1.0 / (DTYPE) pt_dim); 25 | printf("Generating random coordinates in a scaled cubic box..."); 26 | for (int i = 0; i < n_point * pt_dim; i++) 27 | { 28 | coord[i] = (DTYPE) drand48(); 29 | coord[i] *= prefac; 30 | } 31 | printf(" done.\n"); 32 | 33 | // Kernel configuration 34 | int krnl_dim = 3; 35 | DTYPE krnl_param[2] = {1.0, 0.1}; // Stokes kernel with parameter, eta, a 36 | kernel_eval_fptr krnl_eval = Stokes_eval_std; 37 | kernel_bimv_fptr krnl_bimv = Stokes_krnl_bimv_intrin_t; 38 | int krnl_bimv_flops = Stokes_krnl_bimv_flop; 39 | 40 | // H2 construction configuration 41 | int krnl_mat_size = krnl_dim * n_point; 42 | 
DTYPE rel_tol = 1e-6; 43 | const int BD_JIT = 1; 44 | 45 | // Initialization of H2Pack 46 | H2Pack_p h2pack; 47 | H2P_init(&h2pack, pt_dim, krnl_dim, QR_REL_NRM, &rel_tol); 48 | 49 | // Hierarchical partitioning 50 | int max_leaf_points = 0; // use the default in h2pack for maximum number of points in the leaf node 51 | DTYPE max_leaf_size = 0.0; // use the default in h2pack for maximum edge length of leaf box 52 | char *pp_fname = "./PP_Stokes3D_1e-6.dat"; // file name for storage and reuse of proxy points, can be set as NULL. 53 | H2P_calc_enclosing_box(pt_dim, n_point, coord, pp_fname, &h2pack->root_enbox); 54 | H2P_partition_points(h2pack, n_point, coord, max_leaf_points, max_leaf_size); 55 | 56 | // Select proxy points 57 | H2P_dense_mat_p *pp; 58 | // method 1: numerical proxy point selection, works for any kernel but require relatively expensive precomputation 59 | // the computed proxy points will be stored in `pp_fname' (if not NULL) for reuse if needed. 60 | if (0) 61 | { 62 | st = get_wtime_sec(); 63 | H2P_generate_proxy_point_ID_file(h2pack, krnl_param, krnl_eval, pp_fname, &pp); 64 | et = get_wtime_sec(); 65 | printf("H2Pack generate numerical proxy points used %.3lf (s)...\n", et - st); 66 | } 67 | else 68 | { 69 | // method 2: proxy surface points, works for kernel from potential theory, has negligible cost. 
70 | // The edge length of the root box enclosing all the points 71 | DTYPE max_L = h2pack->root_enbox[pt_dim]; 72 | // A heuristic but effective selection of the number of proxy surface points given the expected relative tolerance 73 | int num_pp, num_pp_dim = ceil(-log10(rel_tol)); 74 | if (num_pp_dim < 4 ) num_pp_dim = 4; 75 | if (num_pp_dim > 10) num_pp_dim = 10; 76 | if (pt_dim == 2) num_pp = 2 * pt_dim * num_pp_dim; 77 | if (pt_dim == 3) num_pp = 2 * pt_dim * num_pp_dim * num_pp_dim; 78 | st = get_wtime_sec(); 79 | H2P_generate_proxy_point_surface( 80 | pt_dim, pt_dim, num_pp, h2pack->max_level, 81 | h2pack->min_adm_level, max_L, &pp 82 | ); 83 | et = get_wtime_sec(); 84 | printf("H2Pack generate proxy surface points used %.3lf (s)...\n", et - st); 85 | } 86 | 87 | // Construct H2 matrix representation 88 | H2P_build(h2pack, pp, BD_JIT, krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops); 89 | 90 | // Check multiplication error at 20000 entries 91 | int n_check_pt = 20000, check_pt_s; 92 | if (n_check_pt >= n_point) 93 | { 94 | n_check_pt = n_point; 95 | check_pt_s = 0; 96 | } else { 97 | srand(time(NULL)); 98 | check_pt_s = rand() % (n_point - n_check_pt); 99 | } 100 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 101 | 102 | DTYPE *x, *y0, *y1; 103 | x = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size); 104 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_dim * n_check_pt); 105 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size); 106 | assert(x != NULL && y0 != NULL && y1 != NULL); 107 | for (int i = 0; i < krnl_mat_size; i++) 108 | x[i] = (DTYPE) drand48() - 0.5; 109 | 110 | 111 | // Get reference results 112 | st = get_wtime_sec(); 113 | direct_nbody( 114 | krnl_param, krnl_eval, pt_dim, krnl_dim, 115 | coord, n_point, n_point, x, 116 | coord + check_pt_s, n_point, n_check_pt, y0 117 | ); 118 | et = get_wtime_sec(); 119 | printf("Direct n-body for %d points takes %.3lf (s)\n", n_check_pt, et 
- st); 120 | 121 | // H2 matrix-vector multiplication 122 | st = get_wtime_sec(); 123 | H2P_matvec(h2pack, x, y1); 124 | et = get_wtime_sec(); 125 | printf("Full H2 matvec takes %.3lf (s)\n", et - st); 126 | 127 | // Print out details of the H2 matrix 128 | H2P_print_statistic(h2pack); 129 | 130 | // Verify H2 matvec results 131 | DTYPE y0_norm = 0.0, err_norm = 0.0; 132 | for (int i = 0; i < krnl_dim * n_check_pt; i++) 133 | { 134 | DTYPE diff = y1[krnl_dim * check_pt_s + i] - y0[i]; 135 | y0_norm += y0[i] * y0[i]; 136 | err_norm += diff * diff; 137 | } 138 | y0_norm = DSQRT(y0_norm); 139 | err_norm = DSQRT(err_norm); 140 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 141 | printf("The specified relative error threshold is %e\n", rel_tol); 142 | 143 | // Store H2 matrix data to file 144 | int store_to_file = 0; 145 | printf("Store H2 matrix data to file? 1-yes, 0-no : "); 146 | scanf("%d", &store_to_file); 147 | if (store_to_file) 148 | { 149 | const char *meta_json_fname = "Stokes_3D_1e-6_meta.json"; 150 | const char *aux_json_fname = "Stokes_3D_1e-6_aux.json"; 151 | const char *binary_fname = "Stokes_3D_1e-6.bin"; 152 | printf("Storing H2 matrix data to files %s, %s, and %s...", meta_json_fname, aux_json_fname, binary_fname); 153 | fflush(stdout); 154 | H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname); 155 | printf("done\n"); 156 | } 157 | 158 | free(x); 159 | free(y0); 160 | free(y1); 161 | free_aligned(coord); 162 | H2P_destroy(&h2pack); 163 | } 164 | -------------------------------------------------------------------------------- /examples/example_read_H2_file.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "H2Pack_kernels.h" 11 | #include "direct_nbody.h" 12 | 13 | int main(int argc, char **argv) 14 | { 15 | 
srand48(time(NULL)); 16 | double st, et; 17 | 18 | // Kernel configuration 19 | int krnl_dim = 1; 20 | DTYPE *krnl_param = NULL; // Coulomb kernel has no parameter 21 | kernel_eval_fptr krnl_eval = Coulomb_3D_eval_intrin_t; 22 | kernel_bimv_fptr krnl_bimv = Coulomb_3D_krnl_bimv_intrin_t; 23 | int krnl_bimv_flops = Coulomb_3D_krnl_bimv_flop; 24 | /* 25 | int krnl_dim = 3; 26 | DTYPE krnl_param[2] = {1.0, 0.1}; // Stokes kernel with parameter, eta, a 27 | kernel_eval_fptr krnl_eval = Stokes_eval_std; 28 | kernel_bimv_fptr krnl_bimv = Stokes_krnl_bimv_intrin_t; 29 | int krnl_bimv_flops = Stokes_krnl_bimv_flop; 30 | */ 31 | 32 | // Read H2 matrix data from file and construct H2Pack 33 | const int BD_JIT = 1; 34 | H2Pack_p h2pack; 35 | const char *meta_json_fname = "Coulomb_3D_1e-6_meta.json"; 36 | const char *aux_json_fname = "Coulomb_3D_1e-6_aux.json"; 37 | const char *binary_fname = "Coulomb_3D_1e-6.bin"; 38 | //const char *meta_json_fname = "Stokes_3D_1e-6_meta.json"; 39 | //const char *aux_json_fname = "Stokes_3D_1e-6_aux.json"; 40 | //const char *binary_fname = "Stokes_3D_1e-6.bin"; 41 | printf("Reading H2 matrix data from files %s, %s, and %s\n", meta_json_fname, aux_json_fname, binary_fname); 42 | H2P_read_from_file( 43 | &h2pack, meta_json_fname, aux_json_fname, binary_fname, BD_JIT, 44 | krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops 45 | ); 46 | int pt_dim = h2pack->pt_dim; 47 | int n_point = h2pack->n_point; 48 | int krnl_mat_size = h2pack->krnl_mat_size; 49 | DTYPE rel_tol = h2pack->QR_stop_tol; 50 | DTYPE *coord = h2pack->coord0; // Input (not sorted) point coordinates 51 | 52 | // Check multiplication error at 20000 entries 53 | int n_check_pt = 20000, check_pt_s; 54 | if (n_check_pt >= n_point) 55 | { 56 | n_check_pt = n_point; 57 | check_pt_s = 0; 58 | } else { 59 | srand(time(NULL)); 60 | check_pt_s = rand() % (n_point - n_check_pt); 61 | } 62 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + 
n_check_pt - 1); 63 | 64 | DTYPE *x, *y0, *y1; 65 | x = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size); 66 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_dim * n_check_pt); 67 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size); 68 | assert(x != NULL && y0 != NULL && y1 != NULL); 69 | for (int i = 0; i < krnl_mat_size; i++) 70 | x[i] = (DTYPE) drand48() - 0.5; 71 | 72 | 73 | // Get reference results 74 | st = get_wtime_sec(); 75 | direct_nbody( 76 | krnl_param, krnl_eval, pt_dim, krnl_dim, 77 | coord, n_point, n_point, x, 78 | coord + check_pt_s, n_point, n_check_pt, y0 79 | ); 80 | et = get_wtime_sec(); 81 | printf("Direct n-body for %d points takes %.3lf (s)\n", n_check_pt, et - st); 82 | 83 | // H2 matrix-vector multiplication 84 | st = get_wtime_sec(); 85 | H2P_matvec(h2pack, x, y1); 86 | et = get_wtime_sec(); 87 | printf("Full H2 matvec takes %.3lf (s)\n", et - st); 88 | 89 | // Print out details of the H2 matrix 90 | H2P_print_statistic(h2pack); 91 | 92 | // Verify H2 matvec results 93 | DTYPE y0_norm = 0.0, err_norm = 0.0; 94 | for (int i = 0; i < krnl_dim * n_check_pt; i++) 95 | { 96 | DTYPE diff = y1[krnl_dim * check_pt_s + i] - y0[i]; 97 | y0_norm += y0[i] * y0[i]; 98 | err_norm += diff * diff; 99 | } 100 | y0_norm = DSQRT(y0_norm); 101 | err_norm = DSQRT(err_norm); 102 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 103 | printf("The specified relative error threshold is %e\n", rel_tol); 104 | 105 | free(x); 106 | free(y0); 107 | free(y1); 108 | H2P_destroy(&h2pack); 109 | } -------------------------------------------------------------------------------- /examples/meta_txt_to_json.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import json 4 | import sys 5 | import struct 6 | 7 | class EmptyClass(object): 8 | def toJSON(self): 9 | return json.dumps(self, default=lambda o: o.__dict__, indent=2) 10 | 11 | def hex_to_double(f): 12 
| return struct.unpack('!d', bytes.fromhex(f))[0] 13 | 14 | def metadata_txt_to_json(meta_txt_fname, meta_json_fname, aux_json_fname): 15 | txt_file = open(meta_txt_fname, 'r') 16 | lines = txt_file.readlines() 17 | txt_file.close() 18 | 19 | meta_json = EmptyClass() # Metadata JSON 20 | aux_json = EmptyClass() # Auxiliary JSON 21 | 22 | # 1. Metadata: H2 / HSS common part 23 | aux_json.dim_point = int(lines[0]) # C.1 dim_point 24 | aux_json.dim_kernel = int(lines[1]) # C.2 dim_kernel 25 | aux_json.num_point = int(lines[2]) # C.3 num_point 26 | meta_json.nrow_matrix = int(lines[3]) # A.1 nrow_matrix 27 | meta_json.ncol_matrix = int(lines[4]) # A.2 ncol_matrix 28 | meta_json.is_symmetric = int(lines[5]) # A.3 is_symmetric 29 | meta_json.num_node_row = int(lines[6]) # A.4 num_node_row 30 | meta_json.num_node_col = int(lines[7]) # A.5 num_node_col 31 | meta_json.root_node_row = int(lines[8]) # A.6 root_node_row 32 | meta_json.root_node_col = int(lines[9]) # A.7 root_node_col 33 | meta_json.num_level_row = int(lines[10]) # A.8 num_level_row 34 | meta_json.num_level_col = int(lines[11]) # A.9 num_level_col 35 | aux_json.is_HSS = int(lines[12]) # C.4 is_HSS 36 | aux_json.min_adm_level = int(lines[13]) # C.5 min_adm_level 37 | meta_json.num_inadmissible_blocks = int(lines[14]) # A.14 num_inadmissible_blocks - n_leaf_node 38 | meta_json.num_admissible_blocks = int(lines[15]) # A.15 num_admissible_blocks 39 | meta_json.has_partial_adm_blocks = int(lines[16]) # A.16 has_partial_adm_blocks 40 | curr_row = 17 41 | 42 | # 2. 
Metadata: partitioning tree 43 | # A.10 nodes_row; A.11 nodes_col == NULL since H2 matrix is symmetric 44 | nodes_row = [] 45 | num_leaf_node = 0 46 | for i in range(meta_json.num_node_row): 47 | raw_data = [x for x in lines[curr_row + i].split(' ') if x] 48 | node_i = EmptyClass() 49 | node_i.index = int(raw_data[0]) # A.10.1 index 50 | node_i.level = int(raw_data[1]) # A.10.2 level 51 | node_i.cluster_head = int(raw_data[2]) # A.10.3 cluster_head 52 | node_i.cluster_tail = int(raw_data[3]) # A.10.4 cluster_tail 53 | node_i.num_children = int(raw_data[4]) # A.10.5 num_children 54 | if 0 == node_i.num_children: 55 | num_leaf_node += 1 56 | # A.10.6 children 57 | node_i.children = [] 58 | for j in range(node_i.num_children): 59 | node_i.children.append(int(raw_data[5 + j])) 60 | nodes_row.append(node_i) 61 | meta_json.nodes_row = nodes_row 62 | curr_row += meta_json.num_node_row 63 | meta_json.num_inadmissible_blocks += num_leaf_node 64 | 65 | # 3. Metadata data: U matrices 66 | # A.12 basis_matrices_row (A.13 ignored since H2 matrix is symmetric) 67 | U_mat = [] 68 | for i in range(meta_json.num_node_row): 69 | raw_data = [x for x in lines[curr_row + i].split(' ') if x] 70 | U_i = EmptyClass() 71 | U_i.node = int(raw_data[0]) # A.12.1 node 72 | U_i.num_row = int(raw_data[1]) # A.12.2 num_row 73 | U_i.num_col = int(raw_data[2]) # A.12.3 num_col 74 | U_mat.append(U_i) 75 | meta_json.basis_matrices_row = U_mat 76 | curr_row += meta_json.num_node_row 77 | 78 | # 4. 
Metadata data: B matrices 79 | B_mat = [] 80 | for i in range(meta_json.num_admissible_blocks): 81 | raw_data = [x for x in lines[curr_row + i].split(' ') if x] 82 | B_i = EmptyClass() 83 | B_i.node_row = int(raw_data[0]) # A.17.1 node_row 84 | B_i.node_col = int(raw_data[1]) # A.17.2 node_col 85 | B_i.num_row = int(raw_data[2]) # A.17.3 num_row 86 | B_i.num_col = int(raw_data[3]) # A.17.4 num_col 87 | B_i.is_part_adm = int(raw_data[4]) # A.17.5 is_part_adm 88 | B_mat.append(B_i) 89 | meta_json.B_matrices = B_mat 90 | curr_row += meta_json.num_admissible_blocks 91 | 92 | # 5. Metadata data: D matrices 93 | D_mat = [] 94 | for i in range(meta_json.num_inadmissible_blocks): 95 | raw_data = [x for x in lines[curr_row + i].split(' ') if x] 96 | D_i = EmptyClass() 97 | D_i.node_row = int(raw_data[0]) # A.18.1 node_row 98 | D_i.node_col = int(raw_data[1]) # A.18.2 node_col 99 | D_i.num_row = int(raw_data[2]) # A.18.3 num_row 100 | D_i.num_col = int(raw_data[3]) # A.18.4 num_col 101 | D_mat.append(D_i) 102 | meta_json.D_matrices = D_mat 103 | curr_row += meta_json.num_inadmissible_blocks 104 | 105 | # 6. 
Other necessary information for H2Pack 106 | aux_json.max_leaf_points = int(lines[curr_row]) # C.6 max_leaf_points 107 | aux_json.QR_stop_tol = float(lines[curr_row + 1]) # C.7 QR_stop_tol 108 | aux_json.has_skeleton_points = int(lines[curr_row + 2]) # C.8 has_skeleton_points 109 | curr_row += 3 110 | # C.9 point_coordinate 111 | # Cast it from uint64_t back to double 112 | coord = [] 113 | for i in range(aux_json.num_point): 114 | raw_data = [x for x in lines[curr_row + i].split(' ') if x] 115 | for j in range(aux_json.dim_point): 116 | coord.append(hex_to_double(raw_data[j])) 117 | aux_json.point_coordinate = coord 118 | curr_row += aux_json.num_point 119 | # C.10 permutation_array 120 | perm = [] 121 | for i in range(aux_json.num_point): 122 | perm.append(int(lines[curr_row + i])) 123 | aux_json.permutation_array = perm 124 | curr_row += aux_json.num_point 125 | # C.11 skeleton_point 126 | node_skel = [] 127 | for i in range(meta_json.num_node_row): 128 | raw_data = [x for x in lines[curr_row + i].split(' ') if x] 129 | skel_i = EmptyClass() 130 | skel_i.node = int(raw_data[0]) 131 | skel_i.num_skeleton_point = int(raw_data[1]) 132 | pt_idx = [] 133 | for j in range(skel_i.num_skeleton_point): 134 | pt_idx.append(int(raw_data[2 + j])) 135 | skel_i.skeleton_point_indices = pt_idx 136 | node_skel.append(skel_i) 137 | aux_json.skeleton_points = node_skel 138 | 139 | json_file0 = open(meta_json_fname, 'w') 140 | json_file0.write(meta_json.toJSON()) 141 | json_file0.close() 142 | 143 | json_file1 = open(aux_json_fname, 'w') 144 | json_file1.write(aux_json.toJSON()) 145 | json_file1.close() 146 | 147 | if __name__=='__main__': 148 | if len(sys.argv) < 4: 149 | print('Usage: %s '%sys.argv[0]) 150 | exit(1) 151 | meta_txt_fname = sys.argv[1] 152 | meta_json_fname = sys.argv[2] 153 | aux_json_fname = sys.argv[3] 154 | metadata_txt_to_json(meta_txt_fname, meta_json_fname, aux_json_fname) -------------------------------------------------------------------------------- 
/extra/GCC-OpenBLAS.make: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | USE_MKL = 0 3 | USE_OPENBLAS = 1 4 | 5 | include common.make 6 | 7 | # GCC 10 needs SVE to be enabled explicitly; -march=native alone is not enough 8 | # On A64FX the SVE vector length is 512 bits; on other SVE-capable processors this value might be different 9 | USE_AARCH64_SVE = 0 10 | SVE_VECTOR_BITS = 512 11 | ifeq ($(strip $(USE_AARCH64_SVE)), 1) 12 | CFLAGS := $(subst -march=native, -march=armv8.2-a+sve -msve-vector-bits=$(SVE_VECTOR_BITS), $(CFLAGS)) 13 | endif -------------------------------------------------------------------------------- /extra/ICC-MKL.make: -------------------------------------------------------------------------------- 1 | CC = icc 2 | USE_MKL = 1 3 | USE_OPENBLAS = 0 4 | 5 | include common.make -------------------------------------------------------------------------------- /extra/common.make: -------------------------------------------------------------------------------- 1 | H2PACK_INSTALL_DIR = ..
2 | 3 | DEFS = 4 | INCS = -I$(H2PACK_INSTALL_DIR)/include 5 | CFLAGS = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS) 6 | LDFLAGS = -g -O3 -fopenmp 7 | LIBS = $(H2PACK_INSTALL_DIR)/lib/libH2Pack.a 8 | 9 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1) 10 | CFLAGS += -fopenmp -xHost 11 | endif 12 | 13 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1) 14 | CFLAGS += -fopenmp -march=native -Wno-unused-result -Wno-unused-function 15 | LIBS += -lgfortran -lm 16 | endif 17 | 18 | ifeq ($(strip $(USE_MKL)), 1) 19 | DEFS += -DUSE_MKL 20 | CFLAGS += -mkl 21 | LDFLAGS += -mkl 22 | endif 23 | 24 | ifeq ($(strip $(USE_OPENBLAS)), 1) 25 | OPENBLAS_INSTALL_DIR = ../../OpenBLAS-git/install 26 | DEFS += -DUSE_OPENBLAS 27 | INCS += -I$(OPENBLAS_INSTALL_DIR)/include 28 | LDFLAGS += -L$(OPENBLAS_INSTALL_DIR)/lib 29 | LIBS += -lopenblas 30 | endif 31 | 32 | C_SRCS = $(wildcard *.c) 33 | C_OBJS = $(C_SRCS:.c=.c.o) 34 | EXES = $(C_SRCS:.c=.exe) 35 | 36 | # Delete the default old-fashioned double-suffix rules 37 | .SUFFIXES: 38 | # Keep the per-test object files; make would otherwise delete them as intermediates 39 | .SECONDARY: $(C_OBJS) 40 | .PHONY: all clean 41 | all: $(EXES) 42 | # Compile every test source into its own object file 43 | %.c.o: %.c 44 | $(CC) $(CFLAGS) -c $< -o $@ 45 | # Relink when the H2Pack static library is rebuilt; $< keeps the archive from being listed twice 46 | %.exe: %.c.o $(H2PACK_INSTALL_DIR)/lib/libH2Pack.a 47 | $(CC) $(LDFLAGS) -o $@ $< $(LIBS) 48 | 49 | clean: 50 | rm -f $(EXES) $(C_OBJS) 51 | -------------------------------------------------------------------------------- /extra/debug.h: -------------------------------------------------------------------------------- 1 | 2 | #include "H2Pack.h" 3 | #include "H2Pack_utils.h" 4 | #include "utils.h" 5 | 6 | void dump_HSS(H2Pack_p h2pack) 7 | { 8 | // Assumption: MATLAB code is using the same point set 9 | // and has the same r_adm_pairs 10 | FILE *ouf0 = fopen("add_C_HSS_mat_metadata.m", "w"); 11 | FILE *ouf1 = fopen("C_HSS_mat.bin", "wb"); 12 | 13 | H2P_dense_mat_p tmpM; 14 | H2P_dense_mat_init(&tmpM, 1024, 1024); 15 | 16 | fprintf(ouf0, "C_U_sizes = [\n"); 17 | for (int i = 0; i < h2pack->n_node; i++) 18 | { 19 | fprintf(ouf0, "%d
%d;\n", h2pack->U[i]->nrow, h2pack->U[i]->ncol); 20 | fwrite(h2pack->U[i]->data, sizeof(DTYPE), h2pack->U[i]->nrow * h2pack->U[i]->ncol, ouf1); 21 | } 22 | fprintf(ouf0, "];\n"); 23 | 24 | fprintf(ouf0, "C_B_sizes = [\n"); 25 | for (int i = 0; i < h2pack->HSS_n_r_adm_pair; i++) 26 | { 27 | int node0 = h2pack->HSS_r_adm_pairs[2 * i]; 28 | int node1 = h2pack->HSS_r_adm_pairs[2 * i + 1]; 29 | H2P_get_Bij_block(h2pack, node0, node1, tmpM); 30 | fprintf(ouf0, "%d %d;\n", h2pack->B_nrow[i], h2pack->B_ncol[i]); 31 | fwrite(tmpM->data, sizeof(DTYPE), tmpM->nrow * tmpM->ncol, ouf1); 32 | } 33 | fprintf(ouf0, "];\n"); 34 | 35 | fprintf(ouf0, "C_D_sizes = [\n"); 36 | for (int i = 0; i < h2pack->n_leaf_node; i++) 37 | { 38 | int node = h2pack->height_nodes[i]; // i-th leaf node 39 | fprintf(ouf0, "%d %d;\n", h2pack->D_nrow[i], h2pack->D_ncol[i]); 40 | H2P_get_Dij_block(h2pack, node, node, tmpM); 41 | fwrite(tmpM->data, sizeof(DTYPE), tmpM->nrow * tmpM->ncol, ouf1); 42 | } 43 | fprintf(ouf0, "];\n"); 44 | 45 | H2P_dense_mat_destroy(&tmpM); 46 | 47 | fclose(ouf0); 48 | fclose(ouf1); 49 | } 50 | -------------------------------------------------------------------------------- /extra/direct_nbody.h: -------------------------------------------------------------------------------- 1 | 2 | void direct_nbody( 3 | const void *krnl_param, kernel_eval_fptr krnl_eval, const int pt_dim, const int krnl_dim, 4 | const DTYPE *src_coord, const int src_coord_ld, const int n_src_pt, const DTYPE *src_val, 5 | const DTYPE *dst_coord, const int dst_coord_ld, const int n_dst_pt, DTYPE *dst_val 6 | ) 7 | { 8 | const int npt_blk = 256; 9 | const int blk_size = npt_blk * krnl_dim; 10 | const int n_thread = omp_get_max_threads(); 11 | 12 | memset(dst_val, 0, sizeof(DTYPE) * n_dst_pt * krnl_dim); 13 | 14 | DTYPE *krnl_mat_buffs = (DTYPE*) malloc(sizeof(DTYPE) * n_thread * blk_size * blk_size); 15 | assert(krnl_mat_buffs != NULL); 16 | 17 | #pragma omp parallel 18 | { 19 | int tid = omp_get_thread_num(); 
20 | DTYPE *krnl_mat_buff = krnl_mat_buffs + tid * blk_size * blk_size; 21 | // Split the destination points over the threads actually running here: the team may be smaller than omp_get_max_threads(), and splitting by the maximum would leave some points uncomputed 22 | int tid_dst_pt_s, tid_dst_pt_n, tid_dst_pt_e; 23 | calc_block_spos_len(n_dst_pt, omp_get_num_threads(), tid, &tid_dst_pt_s, &tid_dst_pt_n); 24 | tid_dst_pt_e = tid_dst_pt_s + tid_dst_pt_n; 25 | 26 | for (int dst_pt_idx = tid_dst_pt_s; dst_pt_idx < tid_dst_pt_e; dst_pt_idx += npt_blk) 27 | { 28 | int dst_pt_blk = (dst_pt_idx + npt_blk > tid_dst_pt_e) ? (tid_dst_pt_e - dst_pt_idx) : npt_blk; 29 | int krnl_mat_nrow = dst_pt_blk * krnl_dim; 30 | const DTYPE *dst_coord_ptr = dst_coord + dst_pt_idx; 31 | DTYPE *dst_val_ptr = dst_val + dst_pt_idx * krnl_dim; 32 | for (int src_pt_idx = 0; src_pt_idx < n_src_pt; src_pt_idx += npt_blk) 33 | { 34 | int src_pt_blk = (src_pt_idx + npt_blk > n_src_pt) ? (n_src_pt - src_pt_idx) : npt_blk; 35 | int krnl_mat_ncol = src_pt_blk * krnl_dim; 36 | const DTYPE *src_coord_ptr = src_coord + src_pt_idx; 37 | const DTYPE *src_val_ptr = src_val + src_pt_idx * krnl_dim; 38 | 39 | krnl_eval( 40 | dst_coord_ptr, dst_coord_ld, dst_pt_blk, 41 | src_coord_ptr, src_coord_ld, src_pt_blk, 42 | krnl_param, krnl_mat_buff, krnl_mat_ncol 43 | ); 44 | 45 | CBLAS_GEMV( 46 | CblasRowMajor, CblasNoTrans, krnl_mat_nrow, krnl_mat_ncol, 47 | 1.0, krnl_mat_buff, krnl_mat_ncol, src_val_ptr, 1, 1.0, dst_val_ptr, 1 48 | ); 49 | } 50 | } 51 | } 52 | //printf("Calculate direct n-body reference results for %d points done\n", n_dst_pt); 53 | free(krnl_mat_buffs); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /extra/parse_tensor_params.h: -------------------------------------------------------------------------------- 1 | struct H2P_test_params 2 | { 3 | int pt_dim; 4 | int xpt_dim; 5 | int krnl_dim; 6 | int n_point; 7 | int krnl_mat_size; 8 | int BD_JIT; 9 | int kernel_id; 10 | int krnl_bimv_flops; 11 | void *krnl_param; 12 | DTYPE rel_tol; 13 | DTYPE *coord; 14 | kernel_eval_fptr krnl_eval; 15 | kernel_bimv_fptr krnl_bimv; 16 | }; 17 | struct
H2P_test_params test_params; 18 | 19 | DTYPE Stokes_krnl_param[2] = {1.0, 0.1}; 20 | DTYPE RPY_krnl_param[1] = {1.0}; 21 | 22 | static double pseudo_randn() 23 | { 24 | double res = 0.0; 25 | for (int i = 0; i < 12; i++) res += drand48(); 26 | return (res - 6.0) / 12.0; 27 | } 28 | 29 | void parse_tensor_params(int argc, char **argv) 30 | { 31 | test_params.pt_dim = 3; 32 | test_params.xpt_dim = 3; 33 | test_params.krnl_dim = 3; 34 | 35 | if (argc < 2) 36 | { 37 | printf("Number of points = "); 38 | scanf("%d", &test_params.n_point); 39 | } else { 40 | test_params.n_point = atoi(argv[1]); 41 | printf("Number of points = %d\n", test_params.n_point); 42 | } 43 | test_params.krnl_mat_size = test_params.n_point * test_params.krnl_dim; 44 | 45 | if (argc < 3) 46 | { 47 | printf("QR relative tol = "); 48 | scanf("%lf", &test_params.rel_tol); 49 | } else { 50 | test_params.rel_tol = atof(argv[2]); 51 | printf("QR relative tol = %e\n", test_params.rel_tol); 52 | } 53 | 54 | if (argc < 4) 55 | { 56 | printf("Just-In-Time B & D = "); 57 | scanf("%d", &test_params.BD_JIT); 58 | } else { 59 | test_params.BD_JIT = atoi(argv[3]); 60 | printf("Just-In-Time B & D = %d\n", test_params.BD_JIT); 61 | } 62 | 63 | if (argc < 5) 64 | { 65 | printf("Kernel function ID = "); 66 | scanf("%d", &test_params.kernel_id); 67 | } else { 68 | test_params.kernel_id = atoi(argv[4]); 69 | printf("Kernel function ID = %d\n", test_params.kernel_id); 70 | } 71 | switch (test_params.kernel_id) 72 | { 73 | case 0: 74 | { 75 | printf("Using 3D Stokes kernel, eta = %.2lf, a = %.2lf\n", Stokes_krnl_param[0], Stokes_krnl_param[1]); 76 | break; 77 | } 78 | case 1: 79 | { 80 | printf("Using 3D RPY kernel, eta = %.2lf\n", RPY_krnl_param[0]); 81 | break; 82 | } 83 | } 84 | 85 | if (test_params.kernel_id == 1) test_params.xpt_dim = 4; 86 | test_params.coord = (DTYPE*) malloc_aligned(sizeof(DTYPE) * test_params.n_point * test_params.xpt_dim, 64); 87 | assert(test_params.coord != NULL); 88 | 89 | // Note: 
coordinates need to be stored in column-major style, i.e. test_params.coord 90 | // is row-major and each column stores the coordinate of a point. 91 | int need_gen = 1; 92 | if (argc >= 6) 93 | { 94 | DTYPE *tmp = (DTYPE*) malloc(sizeof(DTYPE) * test_params.n_point * test_params.xpt_dim); 95 | if (strstr(argv[5], ".csv") != NULL) 96 | { 97 | printf("Reading coordinates from CSV file..."); 98 | FILE *inf = fopen(argv[5], "r"); 99 | for (int i = 0; i < test_params.n_point; i++) 100 | { 101 | for (int j = 0; j < test_params.xpt_dim-2; j++) 102 | fscanf(inf, "%lf,", &tmp[i * test_params.xpt_dim + j]); 103 | fscanf(inf, "%lf\n", &tmp[i * test_params.xpt_dim + test_params.xpt_dim-2]); 104 | } 105 | fclose(inf); 106 | printf(" done.\n"); 107 | need_gen = 0; 108 | } 109 | if (strstr(argv[5], ".bin") != NULL) 110 | { 111 | printf("Reading coordinates from binary file..."); 112 | FILE *inf = fopen(argv[5], "rb"); 113 | fread(tmp, sizeof(DTYPE), test_params.n_point * test_params.xpt_dim, inf); 114 | fclose(inf); 115 | printf(" done.\n"); 116 | need_gen = 0; 117 | } 118 | if (need_gen == 0) 119 | { 120 | for (int i = 0; i < test_params.xpt_dim; i++) 121 | for (int j = 0; j < test_params.n_point; j++) 122 | test_params.coord[i * test_params.n_point + j] = tmp[j * test_params.xpt_dim + i]; 123 | } 124 | free(tmp); 125 | } 126 | if (need_gen == 1) 127 | { 128 | DTYPE vol_frac = 0.1; 129 | DTYPE base = 4.0 / 3.0 * M_PI / vol_frac * (DTYPE) test_params.n_point; 130 | DTYPE expn = 1.0 / (DTYPE) test_params.pt_dim; 131 | DTYPE prefac = DPOW(base, expn); 132 | printf("Binary/CSV coordinate file not provided. 
function X = rand_3D_sphere_points(n, density)
% Generate n random points on a sphere centered at the origin.
% Input parameters:
%   n       : Number of points
%   density : Point density per unit surface area, default is 100;
%             any value <= 0 leaves the points on the unit sphere
% Output parameter:
%   X : Size n * 3, each row is a point coordinate
    if (nargin < 2)
        density = 100;
    end
    % Random directions: shift uniform samples to [-0.5, 0.5) and normalize each row
    X = normr(rand(n, 3) - 0.5);
    if (density > 0)
        % Sphere radius for which n points give the requested surface density
        X = X .* sqrt(n / (4 * pi * density));
    end
end
default is 100, 5 | % < 0 will generate points on a unit sphere 6 | % Output parameter: 7 | % X : Size n * 3, each row is a point coordinate 8 | if (nargin < 2), density = 100; end 9 | X = rand(n, 3) - 0.5; 10 | X = normr(X); 11 | if (density > 0) 12 | r = sqrt(n / (4 * pi * density)); 13 | X = X .* r; 14 | end 15 | end -------------------------------------------------------------------------------- /extra/src-obsolete/H2P_generate_proxy_point_ID.c: -------------------------------------------------------------------------------- 1 | 2 | // Generate proxy points for constructing H2 projection and skeleton matrices 3 | // using ID compress for any kernel function. 4 | // This function is isolated because if the enclosing box for all points are fixed, 5 | // we only need to generate proxy points once and use them repeatedly. 6 | // Input parameters: 7 | // pt_dim : Dimension of point coordinate 8 | // krnl_dim : Dimension of kernel's return 9 | // reltol : Proxy point selection relative error tolerance 10 | // max_level : Maximum level (included) of a H2 tree, (root level == 0) 11 | // min_level : Minimum level that needs proxy points 12 | // max_L : The size of the root node's enclosing box 13 | // krnl_eval : Pointer to kernel matrix evaluation function 14 | // krnl_param : Pointer to kernel function parameter array 15 | // Output parameter: 16 | // pp_ : Array of proxy points for each level 17 | void H2P_generate_proxy_point_ID( 18 | const int pt_dim, const int krnl_dim, const DTYPE reltol, const int max_level, const int min_level, 19 | DTYPE max_L, const void *krnl_param, kernel_eval_fptr krnl_eval, H2P_dense_mat_p **pp_ 20 | ) 21 | { 22 | // 1. 
Initialize proxy point arrays and parameters 23 | int n_level = max_level + 1; 24 | H2P_dense_mat_p *pp = (H2P_dense_mat_p*) malloc(sizeof(H2P_dense_mat_p) * n_level); 25 | ASSERT_PRINTF(pp != NULL, "Failed to allocate %d arrays for storing proxy points", n_level); 26 | for (int i = 0; i <= max_level; i++) 27 | { 28 | H2P_dense_mat_init(&pp[i], pt_dim, 0); 29 | pp[i]->ncol = 0; 30 | } 31 | 32 | GET_ENV_INT_VAR(gen_pp_param.alg, "H2P_GEN_PP_ALG", "alg", 2, 0, 2); 33 | GET_ENV_INT_VAR(gen_pp_param.X0_size, "H2P_GEN_PP_X0_SIZE", "X0_size", 2000, 500, 5000); 34 | GET_ENV_INT_VAR(gen_pp_param.Y0_lsize, "H2P_GEN_PP_Y0_LSIZE", "Y0_lsize", 4000, 1000, 20000); 35 | GET_ENV_INT_VAR(gen_pp_param.L3_nlayer, "H2P_GEN_PP_L3_NLAYER", "L3_nlayer", 8, 8, 32); 36 | GET_ENV_INT_VAR(gen_pp_param.max_layer, "H2P_GEN_PP_MAX_LAYER", "max_layer", 8, 4, 32); 37 | GET_ENV_INT_VAR(gen_pp_param.print_timers, "H2P_PRINT_TIMERS", "print_timers", 0, 0, 1); 38 | 39 | double timers[4]; 40 | DTYPE L3_nlayer_ = (DTYPE) gen_pp_param.L3_nlayer; 41 | 42 | // 2. 
Construct proxy points on each level 43 | DTYPE pow_2_level = 0.5; 44 | for (int level = 0; level < min_level; level++) pow_2_level *= 2.0; 45 | for (int level = min_level; level <= max_level; level++) 46 | { 47 | // Level 0 and level 1 nodes are not admissible, do not need proxy points 48 | if (level < 2) 49 | { 50 | pow_2_level *= 2.0; 51 | WARNING_PRINTF("Level %d: no proxy points are generated\n", level); 52 | continue; 53 | } 54 | 55 | // Decide box sizes for domains X and Y 56 | pow_2_level *= 2.0; 57 | DTYPE L1 = max_L / pow_2_level; 58 | DTYPE L2 = (1.0 + 2.0 * ALPHA_H2) * L1; 59 | DTYPE L3_0 = (1.0 + L3_nlayer_ * ALPHA_H2) * L1; 60 | DTYPE L3_1 = 2.0 * max_L - L1; 61 | DTYPE L3 = MIN(L3_0, L3_1); 62 | 63 | int Y0_lsize_ = gen_pp_param.Y0_lsize; 64 | if (gen_pp_param.alg == 0) // Only one ring, multiple Y0_lsize_ by the number of rings 65 | { 66 | int n_layer = DROUND((L3 - L2) / L1); 67 | if (n_layer > gen_pp_param.max_layer) n_layer = gen_pp_param.max_layer; 68 | Y0_lsize_ *= n_layer; 69 | } 70 | 71 | // Reset timers 72 | timers[GEN_PP_KRNL_TIMER_IDX] = 0.0; 73 | timers[GEN_PP_KRNL_TIMER_IDX] = 0.0; 74 | timers[GEN_PP_ID_TIMER_IDX] = 0.0; 75 | timers[GEN_PP_MISC_TIMER_IDX] = 0.0; 76 | 77 | // Generate proxy points 78 | H2P_generate_proxy_point_nlayer( 79 | pt_dim, krnl_dim, reltol, 80 | krnl_param, krnl_eval, 81 | L1, L2, L3, 82 | gen_pp_param.alg, gen_pp_param.X0_size, Y0_lsize_, gen_pp_param.max_layer, 83 | pp[level], &timers[0] 84 | ); 85 | 86 | if (gen_pp_param.print_timers == 1) 87 | { 88 | INFO_PRINTF("Level %d: %d proxy points generated\n", level, pp[level]->ncol); 89 | INFO_PRINTF( 90 | " kernel, SpMM, ID, other time = %.3lf, %.3lf, %.3lf, %.3lf sec\n", 91 | timers[GEN_PP_KRNL_TIMER_IDX], timers[GEN_PP_KRNL_TIMER_IDX], 92 | timers[GEN_PP_ID_TIMER_IDX], timers[GEN_PP_MISC_TIMER_IDX] 93 | ); 94 | } 95 | } // End of level loop 96 | 97 | *pp_ = pp; 98 | } 99 | -------------------------------------------------------------------------------- 
/extra/test_H2_accuracy.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "H2Pack_kernels.h" 11 | 12 | #include "parse_scalar_params.h" 13 | #include "direct_nbody.h" 14 | 15 | int main(int argc, char **argv) 16 | { 17 | srand48(time(NULL)); 18 | 19 | parse_scalar_params(argc, argv); 20 | 21 | double st, et; 22 | 23 | H2Pack_p h2pack; 24 | 25 | // Test parameters 26 | #define n_rel_tol 3 27 | #define krnl_param_len 1 28 | #define n_krnl_param 5 29 | DTYPE rel_tols[n_rel_tol] = {1e-3, 1e-6, 1e-9}; 30 | DTYPE krnl_params[n_krnl_param * krnl_param_len] = {1e-2, 1e-1, 1e0, 1e1, 1e2}; 31 | 32 | // Loop over rel_tol and krnl_param combinations 33 | for (int i_rel_tol = 0; i_rel_tol < n_rel_tol; i_rel_tol++) 34 | { 35 | test_params.rel_tol = rel_tols[i_rel_tol]; 36 | for (int i_krnl_param = 0; i_krnl_param < n_krnl_param; i_krnl_param++) 37 | { 38 | const DTYPE *krnl_param_ = krnl_params + i_krnl_param * krnl_param_len; 39 | test_params.krnl_param = (void*) krnl_param_; 40 | 41 | printf("Current parameters: rel_tol = %.1e, krnl_param[] = ", test_params.rel_tol); 42 | for (int i = 0; i < krnl_param_len; i++) printf("%.1e ", krnl_param_[i]); 43 | printf("\n"); 44 | 45 | H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol); 46 | 47 | H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox); 48 | 49 | int max_leaf_points = 0; 50 | DTYPE max_leaf_size = 0.0; 51 | H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size); 52 | 53 | // Generate proxy points 54 | H2P_dense_mat_p *pp = NULL; 55 | st = get_wtime_sec(); 56 | H2P_generate_proxy_point_ID_file( 57 | h2pack, test_params.krnl_param, test_params.krnl_eval, 58 | test_params.pp_fname, &pp 59 | ); 60 | et = 
get_wtime_sec(); 61 | printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st); 62 | 63 | // Build H2 representation 64 | st = get_wtime_sec(); 65 | H2P_build( 66 | h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 67 | test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops 68 | ); 69 | et = get_wtime_sec(); 70 | printf("H2Pack H2 construction used %.3lf (s)\n", et - st); 71 | 72 | // Allocate input & output vectors 73 | int n_check_pt = 50000, check_pt_s; 74 | if (n_check_pt >= test_params.n_point) 75 | { 76 | n_check_pt = test_params.n_point; 77 | check_pt_s = 0; 78 | } else { 79 | srand(time(NULL)); 80 | check_pt_s = rand() % (test_params.n_point - n_check_pt); 81 | } 82 | DTYPE *x, *y0, *y1; 83 | x = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 84 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt); 85 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 86 | assert(x != NULL && y0 != NULL && y1 != NULL); 87 | for (int i = 0; i < test_params.krnl_mat_size; i++) 88 | { 89 | //x[i] = (DTYPE) pseudo_randn(); 90 | x[i] = (DTYPE) drand48() - 0.5; 91 | } 92 | 93 | // Get reference results 94 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 95 | direct_nbody( 96 | test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 97 | test_params.coord, test_params.n_point, test_params.n_point, x, 98 | test_params.coord + check_pt_s, test_params.n_point, n_check_pt, y0 99 | ); 100 | 101 | // Check H2 matvec accuracy 102 | H2P_matvec(h2pack, x, y1); 103 | H2P_print_statistic(h2pack); 104 | DTYPE y0_norm = 0.0, err_norm = 0.0; 105 | for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++) 106 | { 107 | DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i]; 108 | y0_norm += y0[i] * y0[i]; 109 | err_norm += diff * diff; 110 | } 111 | y0_norm = DSQRT(y0_norm); 112 | err_norm = 
DSQRT(err_norm); 113 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 114 | 115 | // Destroy H2Pack structure and I/O vectors 116 | H2P_destroy(&h2pack); 117 | free(h2pack); 118 | free(x); 119 | free(y0); 120 | free(y1); 121 | printf("\n\n\n"); 122 | } // End of i_krnl_param loop 123 | } // End of i_rel_tol loop 124 | 125 | free_aligned(test_params.coord); 126 | 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /extra/test_H2_matmul.h: -------------------------------------------------------------------------------- 1 | 2 | void calc_err_2norm_dtype( 3 | const int len, const DTYPE *x0, const DTYPE *x1, 4 | DTYPE *x0_2norm_, DTYPE *err_2norm_ 5 | ) 6 | { 7 | DTYPE x0_2norm = 0.0, err_2norm = 0.0, diff; 8 | for (int i = 0; i < len; i++) 9 | { 10 | diff = x0[i] - x1[i]; 11 | x0_2norm += x0[i] * x0[i]; 12 | err_2norm += diff * diff; 13 | } 14 | *x0_2norm_ = DSQRT(x0_2norm); 15 | *err_2norm_ = DSQRT(err_2norm); 16 | } 17 | 18 | void test_H2_matmul(H2Pack_p h2pack, const int n_vec) 19 | { 20 | double st, et; 21 | int n_thread = omp_get_num_threads(); 22 | int krnl_mat_size = h2pack->krnl_mat_size; 23 | int mat_size = krnl_mat_size * n_vec; 24 | DTYPE *x0, *x1, *y0, *y1, *y2; 25 | x0 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size); 26 | x1 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size); 27 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size); 28 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size); 29 | y2 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size); 30 | ASSERT_PRINTF( 31 | x0 != NULL && x1 != NULL && y0 != NULL && y1 != NULL && y2 != NULL, 32 | "Failed to allocate 5 arrays of size %d for H2 matmul tests\n", mat_size 33 | ); 34 | for (int i = 0; i < mat_size; i++) 35 | { 36 | //x0[i] = (DTYPE) pseudo_randn(); 37 | x0[i] = (DTYPE) drand48() - 0.5; 38 | y0[i] = 0.0; 39 | y1[i] = 0.0; 40 | } 41 | 42 | // Test multiple matvec 43 | st = get_wtime_sec(); 44 | for (int i = 0; i 
< n_vec; i++) 45 | { 46 | DTYPE *x_ivec = x0 + i * krnl_mat_size; 47 | DTYPE *y_ivec = y0 + i * krnl_mat_size; 48 | H2P_matvec(h2pack, x_ivec, y_ivec); 49 | } 50 | et = get_wtime_sec(); 51 | printf("%3d matvec used %.3lf sec\n", n_vec, et - st); 52 | 53 | DTYPE y0_2norm, err_2norm, relerr; 54 | 55 | // Test column-major matmul performance 56 | st = get_wtime_sec(); 57 | H2P_matmul(h2pack, CblasColMajor, n_vec, x0, krnl_mat_size, y1, krnl_mat_size); 58 | et = get_wtime_sec(); 59 | printf("One col-major matmul used %.3lf sec\n", et - st); 60 | 61 | // Check H2 column-major matmul results 62 | DTYPE cm_max_relerr = 0.0; 63 | DTYPE cm_avg_relerr = 0.0; 64 | for (int i = 0; i < n_vec; i++) 65 | { 66 | DTYPE *y0_ivec = y0 + i * krnl_mat_size; 67 | DTYPE *y1_ivec = y1 + i * krnl_mat_size; 68 | calc_err_2norm_dtype(krnl_mat_size, y0_ivec, y1_ivec, &y0_2norm, &err_2norm); 69 | relerr = err_2norm / y0_2norm; 70 | if (relerr > cm_max_relerr) cm_max_relerr = relerr; 71 | cm_avg_relerr += relerr; 72 | } 73 | cm_avg_relerr /= (DTYPE) n_vec; 74 | 75 | // Test row-major matmul performance 76 | //double trans_t = 0.0, matmul_t = 0.0, total_t = 0.0; 77 | //st = get_wtime_sec(); 78 | H2P_transpose_dmat(n_thread, n_vec, krnl_mat_size, x0, krnl_mat_size, x1, n_vec); 79 | //et = get_wtime_sec(); 80 | //trans_t += et - st; 81 | 82 | st = get_wtime_sec(); 83 | H2P_matmul(h2pack, CblasRowMajor, n_vec, x1, n_vec, y1, n_vec); 84 | et = get_wtime_sec(); 85 | //matmul_t = et - st; 86 | 87 | //st = get_wtime_sec(); 88 | H2P_transpose_dmat(n_thread, krnl_mat_size, n_vec, y1, n_vec, y2, krnl_mat_size); 89 | //et = get_wtime_sec(); 90 | //trans_t += et - st; 91 | //total_t = matmul_t + trans_t; 92 | printf("One row-major matmul used %.3lf sec\n", et - st); 93 | 94 | // Check H2 row-major matmul results 95 | DTYPE rm_max_relerr = 0.0; 96 | DTYPE rm_avg_relerr = 0.0; 97 | for (int i = 0; i < n_vec; i++) 98 | { 99 | DTYPE *y0_ivec = y0 + i * krnl_mat_size; 100 | DTYPE *y2_ivec = y2 + i * 
krnl_mat_size; 101 | calc_err_2norm_dtype(krnl_mat_size, y0_ivec, y2_ivec, &y0_2norm, &err_2norm); 102 | relerr = err_2norm / y0_2norm; 103 | if (relerr > rm_max_relerr) rm_max_relerr = relerr; 104 | rm_avg_relerr += relerr; 105 | } 106 | rm_avg_relerr /= (DTYPE) n_vec; 107 | 108 | printf("%d vectors col-major matmul max/avg relerr = %e, %e\n", n_vec, cm_max_relerr, cm_avg_relerr); 109 | printf("%d vectors row-major matmul max/avg relerr = %e, %e\n", n_vec, rm_max_relerr, rm_avg_relerr); 110 | 111 | free(x0); 112 | free(x1); 113 | free(y0); 114 | free(y1); 115 | free(y2); 116 | } 117 | 118 | -------------------------------------------------------------------------------- /extra/test_H2_scalar.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | //#include 10 | 11 | #include "H2Pack.h" 12 | #include "H2Pack_kernels.h" 13 | 14 | #include "parse_scalar_params.h" 15 | #include "direct_nbody.h" 16 | 17 | int main(int argc, char **argv) 18 | { 19 | //__itt_pause(); 20 | srand48(time(NULL)); 21 | 22 | parse_scalar_params(argc, argv); 23 | 24 | double st, et; 25 | 26 | H2Pack_p h2pack; 27 | 28 | H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol); 29 | 30 | H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox); 31 | 32 | int max_leaf_points = 0; 33 | DTYPE max_leaf_size = 0.0; 34 | H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size); 35 | 36 | H2P_dense_mat_p *pp; 37 | st = get_wtime_sec(); 38 | H2P_generate_proxy_point_ID_file( 39 | h2pack, test_params.krnl_param, test_params.krnl_eval, 40 | test_params.pp_fname, &pp 41 | ); 42 | et = get_wtime_sec(); 43 | printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st); 44 | 45 | H2P_build( 46 | h2pack, pp, test_params.BD_JIT, 
test_params.krnl_param, 47 | test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops 48 | ); 49 | 50 | int n_check_pt = 50000, check_pt_s; 51 | if (n_check_pt >= test_params.n_point) 52 | { 53 | n_check_pt = test_params.n_point; 54 | check_pt_s = 0; 55 | } else { 56 | srand(time(NULL)); 57 | check_pt_s = rand() % (test_params.n_point - n_check_pt); 58 | } 59 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 60 | 61 | DTYPE *x, *y0, *y1; 62 | x = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 63 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt); 64 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 65 | assert(x != NULL && y0 != NULL && y1 != NULL); 66 | for (int i = 0; i < test_params.krnl_mat_size; i++) 67 | { 68 | //x[i] = (DTYPE) pseudo_randn(); 69 | x[i] = (DTYPE) drand48() - 0.5; 70 | } 71 | 72 | // Get reference results 73 | direct_nbody( 74 | test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 75 | test_params.coord, test_params.n_point, test_params.n_point, x, 76 | test_params.coord + check_pt_s, test_params.n_point, n_check_pt, y0 77 | ); 78 | 79 | // Warm up, reset timers, and test the matvec performance 80 | H2P_matvec(h2pack, x, y1); 81 | H2P_reset_timers(h2pack); 82 | //__itt_resume(); 83 | for (int i = 0; i < 10; i++) 84 | H2P_matvec(h2pack, x, y1); 85 | //__itt_pause(); 86 | 87 | H2P_print_statistic(h2pack); 88 | 89 | // Verify H2 matvec results 90 | DTYPE y0_norm = 0.0, err_norm = 0.0; 91 | for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++) 92 | { 93 | DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i]; 94 | y0_norm += y0[i] * y0[i]; 95 | err_norm += diff * diff; 96 | } 97 | y0_norm = DSQRT(y0_norm); 98 | err_norm = DSQRT(err_norm); 99 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 100 | 101 | // 
Store H2 matrix data to file 102 | int store_to_file = 0; 103 | printf("Store H2 matrix data to file? 1-yes, 0-no : "); 104 | scanf("%d", &store_to_file); 105 | if (store_to_file) 106 | { 107 | char meta_json_fname[1024]; 108 | char aux_json_fname[1024]; 109 | char binary_fname[1024]; 110 | printf("Enter meta JSON file name: "); 111 | scanf("%s", meta_json_fname); 112 | printf("Enter auxiliary JSON file name: "); 113 | scanf("%s", aux_json_fname); 114 | printf("Enter binary data file name: "); 115 | scanf("%s", binary_fname); 116 | H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname); 117 | printf("done\n"); 118 | } 119 | 120 | free(x); 121 | free(y0); 122 | free(y1); 123 | free_aligned(test_params.coord); 124 | H2P_destroy(&h2pack); 125 | 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /extra/test_H2_scalar_samplept.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | //#include 10 | 11 | #include "H2Pack.h" 12 | #include "H2Pack_kernels.h" 13 | 14 | #include "parse_scalar_params.h" 15 | #include "direct_nbody.h" 16 | 17 | // Copy from MATLAB code 18 | int sample_approx_rank(const DTYPE tau, const DTYPE reltol) 19 | { 20 | int r = 1, r_tmp; 21 | if (reltol < 2e-1) r = 2; 22 | if (reltol < 2e-2) r = 3; 23 | if (reltol < 2e-3) r = 4; 24 | if (reltol < 2e-4) 25 | { 26 | r_tmp = 2.0 * DFLOOR(DLOG(reltol) / DLOG(tau)) - 15.0; 27 | if (r_tmp < 20.0) r_tmp = 20.0; 28 | r = (int) DCEIL(DSQRT(r_tmp)); 29 | } 30 | if (reltol < 7e-7) 31 | { 32 | r_tmp = 2.0 * DFLOOR(DLOG(reltol) / DLOG(tau)) - 10.0; 33 | if (r_tmp < 20.0) r_tmp = 20.0; 34 | r = (int) DCEIL(DSQRT(r_tmp)); 35 | } 36 | if (reltol < 7e-9) 37 | { 38 | r_tmp = 2.0 * DFLOOR(DLOG(reltol) / DLOG(tau)); 39 | if (r_tmp < 90.0) r_tmp = 90.0; 40 | r = (int) DCEIL(DSQRT(r_tmp)); 41 | } 42 | return r; 43 | } 44 | 
45 | int main(int argc, char **argv) 46 | { 47 | //__itt_pause(); 48 | srand48(time(NULL)); 49 | 50 | printf("For this sample point example program, please enter an arbitrary proxy point file name if asked\n\n"); 51 | parse_scalar_params(argc, argv); 52 | 53 | double st, et; 54 | 55 | H2Pack_p h2pack; 56 | 57 | H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol); 58 | 59 | H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox); 60 | 61 | int max_leaf_points = 0; 62 | DTYPE max_leaf_size = 0.0; 63 | H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size); 64 | 65 | DTYPE tau = 0.7; // Separation threshold 66 | #if 0 67 | int approx_rank, approx_rank0; 68 | approx_rank0 = sample_approx_rank(tau, test_params.rel_tol); 69 | if (argc >= 9) approx_rank = atoi(argv[8]); 70 | else 71 | { 72 | printf("Sample approx rank (suggested %d): ", approx_rank0); 73 | scanf("%d", &approx_rank); 74 | } 75 | #endif 76 | 77 | H2P_dense_mat_p *sample_pt; 78 | st = get_wtime_sec(); 79 | H2P_select_sample_point( 80 | h2pack, test_params.krnl_param, test_params.krnl_eval, 81 | tau, &sample_pt 82 | ); 83 | et = get_wtime_sec(); 84 | printf("H2Pack select sample points used %.3lf (s)\n", et - st); 85 | 86 | H2P_build_with_sample_point( 87 | h2pack, sample_pt, test_params.BD_JIT, test_params.krnl_param, 88 | test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops 89 | ); 90 | 91 | int n_check_pt = 50000, check_pt_s; 92 | if (n_check_pt >= test_params.n_point) 93 | { 94 | n_check_pt = test_params.n_point; 95 | check_pt_s = 0; 96 | } else { 97 | srand(time(NULL)); 98 | check_pt_s = rand() % (test_params.n_point - n_check_pt); 99 | } 100 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 101 | 102 | DTYPE *x, *y0, *y1; 103 | x = (DTYPE*) 
malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 104 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt); 105 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 106 | assert(x != NULL && y0 != NULL && y1 != NULL); 107 | for (int i = 0; i < test_params.krnl_mat_size; i++) 108 | { 109 | //x[i] = (DTYPE) pseudo_randn(); 110 | x[i] = (DTYPE) drand48() - 0.5; 111 | } 112 | 113 | // Get reference results 114 | direct_nbody( 115 | test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 116 | test_params.coord, test_params.n_point, test_params.n_point, x, 117 | test_params.coord + check_pt_s, test_params.n_point, n_check_pt, y0 118 | ); 119 | 120 | // Warm up, reset timers, and test the matvec performance 121 | H2P_matvec(h2pack, x, y1); 122 | H2P_reset_timers(h2pack); 123 | //__itt_resume(); 124 | for (int i = 0; i < 10; i++) 125 | H2P_matvec(h2pack, x, y1); 126 | //__itt_pause(); 127 | 128 | H2P_print_statistic(h2pack); 129 | 130 | // Verify H2 matvec results 131 | DTYPE y0_norm = 0.0, err_norm = 0.0; 132 | for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++) 133 | { 134 | DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i]; 135 | y0_norm += y0[i] * y0[i]; 136 | err_norm += diff * diff; 137 | } 138 | y0_norm = DSQRT(y0_norm); 139 | err_norm = DSQRT(err_norm); 140 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 141 | 142 | // Store H2 matrix data to file 143 | int store_to_file = 0; 144 | printf("Store H2 matrix data to file? 
1-yes, 0-no : "); 145 | scanf("%d", &store_to_file); 146 | if (store_to_file) 147 | { 148 | char meta_json_fname[1024]; 149 | char aux_json_fname[1024]; 150 | char binary_fname[1024]; 151 | printf("Enter meta JSON file name: "); 152 | scanf("%s", meta_json_fname); 153 | printf("Enter auxiliary JSON file name: "); 154 | scanf("%s", aux_json_fname); 155 | printf("Enter binary data file name: "); 156 | scanf("%s", binary_fname); 157 | H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname); 158 | printf("done\n"); 159 | } 160 | 161 | free(x); 162 | free(y0); 163 | free(y1); 164 | free_aligned(test_params.coord); 165 | H2P_destroy(&h2pack); 166 | 167 | return 0; 168 | } 169 | -------------------------------------------------------------------------------- /extra/test_HSS_scalar.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | //#include 10 | 11 | #include "H2Pack.h" 12 | #include "H2Pack_kernels.h" 13 | 14 | #include "parse_scalar_params.h" 15 | #include "direct_nbody.h" 16 | 17 | #include "debug.h" 18 | 19 | int main(int argc, char **argv) 20 | { 21 | //__itt_pause(); 22 | srand48(time(NULL)); 23 | 24 | parse_scalar_params(argc, argv); 25 | 26 | double st, et; 27 | 28 | H2Pack_p h2pack; 29 | 30 | H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol); 31 | H2P_run_HSS(h2pack); 32 | 33 | H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox); 34 | 35 | int max_leaf_points = 0; 36 | DTYPE max_leaf_size = 0.0; 37 | H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size); 38 | 39 | H2P_dense_mat_p *pp; 40 | st = get_wtime_sec(); 41 | H2P_generate_proxy_point_ID_file( 42 | h2pack, test_params.krnl_param, test_params.krnl_eval, 43 | test_params.pp_fname, &pp 44 | ); 45 | et 
= get_wtime_sec(); 46 | printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st); 47 | 48 | H2P_build( 49 | h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 50 | test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops 51 | ); 52 | 53 | int n_check_pt = 50000, check_pt_s; 54 | if (n_check_pt >= test_params.n_point) 55 | { 56 | n_check_pt = test_params.n_point; 57 | check_pt_s = 0; 58 | } else { 59 | srand(time(NULL)); 60 | check_pt_s = rand() % (test_params.n_point - n_check_pt); 61 | } 62 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 63 | 64 | DTYPE *x0, *x1, *y0, *y1; 65 | x0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 66 | x1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 67 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt); 68 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 69 | assert(x0 != NULL && x1 != NULL && y0 != NULL && y1 != NULL); 70 | for (int i = 0; i < test_params.krnl_mat_size; i++) 71 | { 72 | //x0[i] = (DTYPE) pseudo_randn(); 73 | x0[i] = (DTYPE) drand48() - 0.5; 74 | } 75 | 76 | // Get reference results 77 | direct_nbody( 78 | test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 79 | test_params.coord, test_params.n_point, test_params.n_point, x0, 80 | test_params.coord + check_pt_s, test_params.n_point, n_check_pt, y0 81 | ); 82 | 83 | // Warm up, reset timers, and test the matvec performance 84 | H2P_matvec(h2pack, x0, y1); 85 | H2P_reset_timers(h2pack); 86 | //__itt_resume(); 87 | for (int i = 0; i < 10; i++) 88 | H2P_matvec(h2pack, x0, y1); 89 | //__itt_pause(); 90 | 91 | 92 | // Verify HSS matvec results 93 | DTYPE ref_norm = 0.0, err_norm = 0.0; 94 | for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++) 95 | { 96 | DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i]; 97 | ref_norm += y0[i] * y0[i]; 98 | 
err_norm += diff * diff; 99 | } 100 | ref_norm = DSQRT(ref_norm); 101 | err_norm = DSQRT(err_norm); 102 | printf("For %d validation points: ||y_{HSS} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / ref_norm); 103 | 104 | #if 0 105 | // Test ULV Cholesky factorization 106 | const DTYPE shift = 0; 107 | H2P_HSS_ULV_Cholesky_factorize(h2pack, shift); 108 | 109 | for (int i = 0; i < test_params.krnl_mat_size; i++) y1[i] += shift * x0[i]; 110 | // Warm up, reset timers, and test the ULV solve performance 111 | H2P_HSS_ULV_Cholesky_solve(h2pack, 3, y1, x1); 112 | h2pack->n_ULV_solve = 0; 113 | h2pack->timers[ULV_SLV_TIMER_IDX] = 0.0; 114 | for (int i = 0; i < 10; i++) 115 | H2P_HSS_ULV_Cholesky_solve(h2pack, 3, y1, x1); 116 | ref_norm = 0.0; 117 | err_norm = 0.0; 118 | for (int i = 0; i < test_params.krnl_mat_size; i++) 119 | { 120 | DTYPE diff = x1[i] - x0[i]; 121 | ref_norm += x0[i] * x0[i]; 122 | err_norm += diff * diff; 123 | } 124 | ref_norm = DSQRT(ref_norm); 125 | err_norm = DSQRT(err_norm); 126 | printf("H2P_HSS_ULV_Cholesky_solve relerr = %e\n", err_norm / ref_norm); 127 | #endif 128 | 129 | //dump_HSS(h2pack); 130 | 131 | // Test ULV LU factorization 132 | const DTYPE shift = 0; 133 | H2P_HSS_ULV_LU_factorize(h2pack, shift); 134 | 135 | for (int i = 0; i < test_params.krnl_mat_size; i++) y1[i] += shift * x0[i]; 136 | // Warm up, reset timers, and test the ULV solve performance 137 | H2P_HSS_ULV_LU_solve(h2pack, 3, y1, x1); 138 | h2pack->n_ULV_solve = 0; 139 | h2pack->timers[ULV_SLV_TIMER_IDX] = 0.0; 140 | for (int i = 0; i < 10; i++) 141 | H2P_HSS_ULV_LU_solve(h2pack, 3, y1, x1); 142 | ref_norm = 0.0; 143 | err_norm = 0.0; 144 | for (int i = 0; i < test_params.krnl_mat_size; i++) 145 | { 146 | DTYPE diff = x1[i] - x0[i]; 147 | ref_norm += x0[i] * x0[i]; 148 | err_norm += diff * diff; 149 | } 150 | ref_norm = DSQRT(ref_norm); 151 | err_norm = DSQRT(err_norm); 152 | printf("H2P_HSS_ULV_LU_solve relerr = %e\n", err_norm / ref_norm); 153 | 154 | 
H2P_print_statistic(h2pack); 155 | 156 | free(x0); 157 | free(x1); 158 | free(y0); 159 | free(y1); 160 | free_aligned(test_params.coord); 161 | H2P_destroy(&h2pack); 162 | 163 | return 0; 164 | } 165 | -------------------------------------------------------------------------------- /extra/test_ID_compress.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "utils.h" 11 | 12 | int main() 13 | { 14 | int nrow, ncol; 15 | printf("matrix size: "); 16 | scanf("%d %d", &nrow, &ncol); 17 | H2P_dense_mat_p A, A0, U; 18 | H2P_dense_mat_init(&A, nrow, ncol); 19 | H2P_dense_mat_init(&A0, nrow, ncol); 20 | 21 | DTYPE A0_fnorm = 0.0; 22 | srand48(time(NULL)); 23 | DTYPE *x1 = (DTYPE*) malloc(sizeof(DTYPE) * nrow); 24 | DTYPE *y1 = (DTYPE*) malloc(sizeof(DTYPE) * nrow); 25 | DTYPE *x2 = (DTYPE*) malloc(sizeof(DTYPE) * ncol); 26 | DTYPE *y2 = (DTYPE*) malloc(sizeof(DTYPE) * ncol); 27 | assert(x1 != NULL && x2 != NULL && y1 != NULL && y2 != NULL); 28 | for (int i = 0; i < nrow; i++) 29 | { 30 | x1[i] = (DTYPE) drand48(); 31 | y1[i] = (DTYPE) drand48(); 32 | } 33 | for (int i = 0; i < ncol; i++) 34 | { 35 | x2[i] = (DTYPE) (drand48() + 0.6); 36 | y2[i] = (DTYPE) (drand48() + 0.4); 37 | } 38 | for (int irow = 0; irow < nrow; irow++) 39 | { 40 | DTYPE *A_irow = A->data + irow * ncol; 41 | DTYPE *A0_irow = A0->data + irow * ncol; 42 | for (int icol = 0; icol < ncol; icol++) 43 | { 44 | DTYPE dx = x1[irow] - x2[icol]; 45 | DTYPE dy = y1[irow] - y2[icol]; 46 | DTYPE d = DSQRT(dx * dx + dy * dy); 47 | A_irow[icol] = 1.0 / d; 48 | A0_irow[icol] = A_irow[icol]; 49 | A0_fnorm += A_irow[icol] * A_irow[icol]; 50 | } 51 | } 52 | A0_fnorm = DSQRT(A0_fnorm); 53 | 54 | /* 55 | FILE *ouf = fopen("A.csv", "w"); 56 | for (int irow = 0; irow < nrow; irow++) 57 | { 58 | DTYPE *A_irow = A->data + irow * ncol; 59 | for (int icol = 0; icol < 
ncol - 1; icol++) 60 | { 61 | fprintf(ouf, "%.15lf, ", A_irow[icol]); 62 | //printf("%e ", A_irow[icol]); 63 | } 64 | fprintf(ouf, "%.15lf\n", A_irow[ncol - 1]); 65 | //printf("%e\n", A_irow[ncol - 1]); 66 | } 67 | fclose(ouf); 68 | */ 69 | 70 | H2P_int_vec_p J; 71 | H2P_int_vec_init(&J, nrow); 72 | DTYPE tol_norm; 73 | printf("norm_rel_tol: "); 74 | scanf(DTYPE_FMTSTR, &tol_norm); 75 | int n_thread = omp_get_max_threads(); 76 | int *ID_buff = (int*) malloc(sizeof(int) * A->nrow * 4); 77 | DTYPE *QR_buff = (DTYPE*) malloc(sizeof(DTYPE) * A->nrow); 78 | assert(ID_buff != NULL && QR_buff != NULL); 79 | H2P_ID_compress(A, QR_REL_NRM, &tol_norm, &U, J, n_thread, QR_buff, ID_buff, 1); // Warm up 80 | double ut = 0.0; 81 | for (int i = 0; i < 10; i++) 82 | { 83 | memcpy(A->data, A0->data, sizeof(DTYPE) * nrow * ncol); 84 | A->nrow = nrow; 85 | A->ncol = ncol; 86 | A->ld = ncol; 87 | double st = get_wtime_sec(); 88 | H2P_ID_compress(A, QR_REL_NRM, &tol_norm, &U, J, n_thread, QR_buff, ID_buff, 1); 89 | double et = get_wtime_sec(); 90 | ut += et - st; 91 | } 92 | printf("U rank = %d, average used time = %.8lf (s)\n", U->ncol, ut / 10.0); 93 | fflush(stdout); 94 | 95 | /* 96 | ouf = fopen("U.csv", "w"); 97 | for (int irow = 0; irow < U->nrow; irow++) 98 | { 99 | DTYPE *U_irow = U->data + irow * U->ncol; 100 | for (int icol = 0; icol < U->ncol - 1; icol++) 101 | { 102 | fprintf(ouf, "%.15lf, ", U_irow[icol]); 103 | //printf("% .4lf ", U_irow[icol]); 104 | } 105 | fprintf(ouf, "%.15lf\n", U_irow[U->ncol - 1]); 106 | //printf("% .4lf \n", U_irow[U->ncol - 1]); 107 | } 108 | fclose(ouf); 109 | */ 110 | 111 | //printf("A skeleton rows: "); 112 | //for (int i = 0; i < U->ncol; i++) printf("%d ", J[i]); 113 | //printf("\n"); 114 | 115 | DTYPE *AJ = (DTYPE*) malloc(sizeof(DTYPE) * ncol * U->ncol); 116 | for (int i = 0; i < U->ncol; i++) 117 | memcpy(AJ + i * ncol, A0->data + J->data[i] * ncol, sizeof(DTYPE) * ncol); 118 | CBLAS_GEMM( 119 | CblasRowMajor, CblasNoTrans, CblasNoTrans, 
nrow, ncol, U->ncol, 120 | 1.0, U->data, U->ncol, AJ, ncol, -1.0, A0->data, A0->ncol 121 | ); 122 | DTYPE res_fnorm = 0.0; 123 | for (int i = 0; i < nrow * ncol; i++) 124 | res_fnorm += A0->data[i] * A0->data[i]; 125 | res_fnorm = DSQRT(res_fnorm); 126 | printf("||A - A_{ID}||_fro / ||A||_fro = %e\n", res_fnorm / A0_fnorm); 127 | 128 | free(ID_buff); 129 | free(QR_buff); 130 | free(x1); 131 | free(y1); 132 | free(x2); 133 | free(y2); 134 | H2P_int_vec_destroy(&J); 135 | H2P_dense_mat_destroy(&U); 136 | H2P_dense_mat_destroy(&A); 137 | H2P_dense_mat_destroy(&A0); 138 | return 0; 139 | } -------------------------------------------------------------------------------- /extra/test_ID_compress_dim.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack_aux_structs.h" 10 | #include "H2Pack_ID_compress.h" 11 | #include "utils.h" 12 | 13 | void RPY_kernel_3d( 14 | const DTYPE *coord0, const int ld0, const int n0, 15 | const DTYPE *coord1, const int ld1, const int n1, 16 | const int dim, DTYPE *mat, const int ldm 17 | ) 18 | { 19 | const DTYPE a = 1.0, eta = 1.0; 20 | const DTYPE C = 1.0 / (6.0 * M_PI * a * eta); 21 | const DTYPE aa = a * a; 22 | const DTYPE a2 = 2.0 * a; 23 | const DTYPE aa2 = aa * 2.0; 24 | const DTYPE aa_2o3 = aa2 / 3.0; 25 | const DTYPE C_075 = C * 0.75; 26 | const DTYPE C_9o32oa = C * 9.0 / 32.0 / a; 27 | const DTYPE C_3o32oa = C * 3.0 / 32.0 / a; 28 | for (int i = 0; i < n0; i++) 29 | { 30 | DTYPE x0 = coord0[i]; 31 | DTYPE y0 = coord0[i + ld0]; 32 | DTYPE z0 = coord0[i + ld0 * 2]; 33 | for (int j = 0; j < n1; j++) 34 | { 35 | DTYPE r0 = x0 - coord1[j]; 36 | DTYPE r1 = y0 - coord1[j + ld1]; 37 | DTYPE r2 = z0 - coord1[j + ld1 * 2]; 38 | DTYPE s2 = r0 * r0 + r1 * r1 + r2 * r2; 39 | DTYPE s = DSQRT(s2); 40 | DTYPE inv_s = 1.0 / s; 41 | r0 *= inv_s; 42 | r1 *= inv_s; 43 | r2 *= inv_s; 44 | DTYPE t1, t2; 45 | if 
(s < a2) 46 | { 47 | t1 = C - C_9o32oa * s; 48 | t2 = C_3o32oa * s; 49 | } else { 50 | t1 = C_075 / s * (1 + aa_2o3 / s2); 51 | t2 = C_075 / s * (1 - aa2 / s2); 52 | } 53 | int base = 3 * i * ldm + 3 * j; 54 | #define krnl(k, l) mat[base + k * ldm + l] 55 | krnl(0, 0) = t2 * r0 * r0 + t1; 56 | krnl(0, 1) = t2 * r0 * r1; 57 | krnl(0, 2) = t2 * r0 * r2; 58 | krnl(1, 0) = t2 * r1 * r0; 59 | krnl(1, 1) = t2 * r1 * r1 + t1; 60 | krnl(1, 2) = t2 * r1 * r2; 61 | krnl(2, 0) = t2 * r2 * r0; 62 | krnl(2, 1) = t2 * r2 * r1; 63 | krnl(2, 2) = t2 * r2 * r2 + t1; 64 | } 65 | } 66 | } 67 | 68 | 69 | int main() 70 | { 71 | int nrow, ncol, kdim = 3; 72 | printf("matrix size: "); 73 | scanf("%d%d", &nrow, &ncol); 74 | int A_nrow = nrow * kdim; 75 | int A_ncol = ncol * kdim; 76 | DTYPE tol_norm; 77 | printf("norm_rel_tol: "); 78 | scanf(DTYPE_FMTSTR, &tol_norm); 79 | 80 | H2P_dense_mat_p A, A0, U; 81 | H2P_int_vec_p J; 82 | H2P_dense_mat_init(&A, A_nrow, A_ncol); 83 | H2P_dense_mat_init(&A0, A_nrow, A_ncol); 84 | H2P_int_vec_init(&J, A_nrow); 85 | 86 | DTYPE *coord0 = (DTYPE*) malloc(sizeof(DTYPE) * A_nrow); 87 | DTYPE *coord1 = (DTYPE*) malloc(sizeof(DTYPE) * A_ncol); 88 | assert(coord0 != NULL && coord1 != NULL); 89 | DTYPE *x0 = coord0, *x1 = coord1; 90 | DTYPE *y0 = coord0 + nrow, *y1 = coord1 + ncol; 91 | DTYPE *z0 = coord0 + nrow * 2, *z1 = coord1 + ncol * 2; 92 | for (int i = 0; i < nrow; i++) 93 | { 94 | x0[i] = (DTYPE) drand48(); 95 | y0[i] = (DTYPE) drand48(); 96 | z0[i] = (DTYPE) drand48(); 97 | } 98 | for (int i = 0; i < ncol; i++) 99 | { 100 | x1[i] = (DTYPE) (drand48() + 1.9); 101 | y1[i] = (DTYPE) (drand48() + 0.8); 102 | z1[i] = (DTYPE) (drand48() + 0.9); 103 | } 104 | 105 | RPY_kernel_3d( 106 | coord0, nrow, nrow, 107 | coord1, ncol, ncol, 108 | 1, A->data, A_ncol 109 | ); 110 | memcpy(A0->data, A->data, sizeof(DTYPE) * A_nrow * A_ncol); 111 | DTYPE A0_fnorm = 0.0; 112 | for (int i = 0; i < A_nrow * A_ncol; i++) 113 | A0_fnorm += A->data[i] * A->data[i]; 114 | 115 | 
int n_thread = omp_get_max_threads(); 116 | int QR_buff_size = (2 * kdim + 2) * A->ncol + (kdim + 1) * A->nrow; 117 | int *ID_buff = (int *) malloc(sizeof(int) * A->nrow * 4); 118 | DTYPE *QR_buff = (DTYPE *) malloc(sizeof(DTYPE) * QR_buff_size); 119 | double st = get_wtime_sec(); 120 | H2P_ID_compress( 121 | A, QR_REL_NRM, &tol_norm, &U, J, 122 | n_thread, QR_buff, ID_buff, kdim 123 | ); 124 | double ut = get_wtime_sec() - st; 125 | printf("H2P_ID_compress used %.3lf s\n", ut); 126 | 127 | DTYPE *AJ = (DTYPE*) malloc(sizeof(DTYPE) * U->ncol * A_ncol); 128 | for (int i = 0; i < J->length; i++) 129 | { 130 | int i30 = i * 3 + 0; 131 | int i31 = i * 3 + 1; 132 | int i32 = i * 3 + 2; 133 | int j30 = J->data[i] * 3 + 0; 134 | int j31 = J->data[i] * 3 + 1; 135 | int j32 = J->data[i] * 3 + 2; 136 | memcpy(AJ + i30*A_ncol, A0->data + j30*A_ncol, sizeof(DTYPE) * A_ncol); 137 | memcpy(AJ + i31*A_ncol, A0->data + j31*A_ncol, sizeof(DTYPE) * A_ncol); 138 | memcpy(AJ + i32*A_ncol, A0->data + j32*A_ncol, sizeof(DTYPE) * A_ncol); 139 | } 140 | CBLAS_GEMM( 141 | CblasRowMajor, CblasNoTrans, CblasNoTrans, A_nrow, A_ncol, U->ncol, 142 | 1.0, U->data, U->ncol, AJ, A_ncol, -1.0, A0->data, A_ncol 143 | ); 144 | DTYPE res_fnorm = 0.0; 145 | for (int i = 0; i < A_nrow * A_ncol; i++) 146 | res_fnorm += A0->data[i] * A0->data[i]; 147 | res_fnorm = DSQRT(res_fnorm); 148 | printf("U rank = %d (%d column blocks)\n", U->ncol, J->length); 149 | printf("||A - A_{ID}||_fro / ||A||_fro = %e\n", res_fnorm / A0_fnorm); 150 | 151 | free(QR_buff); 152 | free(ID_buff); 153 | free(coord0); 154 | free(coord1); 155 | H2P_int_vec_destroy(&J); 156 | H2P_dense_mat_destroy(&U); 157 | H2P_dense_mat_destroy(&A); 158 | H2P_dense_mat_destroy(&A0); 159 | return 0; 160 | } -------------------------------------------------------------------------------- /extra/test_kernel_SIMD.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 
#include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "H2Pack_kernels.h" 11 | 12 | #include "parse_scalar_params.h" 13 | #include "direct_nbody.h" 14 | 15 | static void Gaussian_3D_eval_std_d(KRNL_EVAL_PARAM) 16 | { 17 | EXTRACT_3D_COORD(); 18 | const DTYPE *param_ = (DTYPE*) param; 19 | const DTYPE l = param_[0]; 20 | for (int i = 0; i < n0; i++) 21 | { 22 | DTYPE *mat_irow = mat + i * ldm; 23 | const DTYPE x0_i = x0[i]; 24 | const DTYPE y0_i = y0[i]; 25 | const DTYPE z0_i = z0[i]; 26 | //#pragma novector 27 | #pragma omp simd 28 | for (int j = 0; j < n1; j++) 29 | { 30 | DTYPE dx = x0_i - x1[j]; 31 | DTYPE dy = y0_i - y1[j]; 32 | DTYPE dz = z0_i - z1[j]; 33 | DTYPE r2 = dx * dx + dy * dy + dz * dz; 34 | mat_irow[j] = exp(-l * r2); 35 | } 36 | } 37 | } 38 | 39 | static void Gaussian_3D_bimv_std_d(KRNL_BIMV_PARAM) 40 | { 41 | EXTRACT_3D_COORD(); 42 | const DTYPE *param_ = (DTYPE*) param; 43 | const DTYPE l = param_[0]; 44 | for (int i = 0; i < n0; i += 2) 45 | { 46 | const DTYPE x0_i0 = x0[i]; 47 | const DTYPE y0_i0 = y0[i]; 48 | const DTYPE z0_i0 = z0[i]; 49 | const DTYPE x0_i1 = x0[i + 1]; 50 | const DTYPE y0_i1 = y0[i + 1]; 51 | const DTYPE z0_i1 = z0[i + 1]; 52 | const DTYPE xin1_i0 = x_in_1[i]; 53 | const DTYPE xin1_i1 = x_in_1[i + 1]; 54 | DTYPE sum_i0 = 0.0, sum_i1 = 0.0; 55 | //#pragma novector 56 | #pragma omp simd 57 | for (int j = 0; j < n1; j++) 58 | { 59 | DTYPE d0, d1, r20, r21; 60 | 61 | d0 = x0_i0 - x1[j]; 62 | d1 = x0_i1 - x1[j]; 63 | r20 = d0 * d0; 64 | r21 = d1 * d1; 65 | 66 | d0 = y0_i0 - y1[j]; 67 | d1 = y0_i1 - y1[j]; 68 | r20 += d0 * d0; 69 | r21 += d1 * d1; 70 | 71 | d0 = z0_i0 - z1[j]; 72 | d1 = z0_i1 - z1[j]; 73 | r20 += d0 * d0; 74 | r21 += d1 * d1; 75 | 76 | r20 = exp(-l * r20); 77 | r21 = exp(-l * r21); 78 | 79 | sum_i0 += r20 * x_in_0[j]; 80 | sum_i1 += r21 * x_in_0[j]; 81 | x_out_1[j] += (r20 * xin1_i0 + r21 * xin1_i1); 82 | } 83 | x_out_0[i] += sum_i0; 84 | x_out_0[i+1] += sum_i1; 85 | } 86 | } 87 | 88 | int 
main(int argc, char **argv) 89 | { 90 | //__itt_pause(); 91 | srand48(time(NULL)); 92 | 93 | parse_scalar_params(argc, argv); 94 | test_params.krnl_eval = Gaussian_3D_eval_std_d; 95 | test_params.krnl_bimv = Gaussian_3D_bimv_std_d; 96 | 97 | double st, et; 98 | 99 | H2Pack_p h2pack; 100 | 101 | H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol); 102 | 103 | H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox); 104 | 105 | int max_leaf_points = 0; 106 | DTYPE max_leaf_size = 0.0; 107 | H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size); 108 | 109 | H2P_dense_mat_p *pp; 110 | st = get_wtime_sec(); 111 | H2P_generate_proxy_point_ID_file( 112 | h2pack, test_params.krnl_param, test_params.krnl_eval, 113 | test_params.pp_fname, &pp 114 | ); 115 | et = get_wtime_sec(); 116 | printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st); 117 | 118 | H2P_build( 119 | h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 120 | test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops 121 | ); 122 | 123 | int n_check_pt = 50000, check_pt_s; 124 | if (n_check_pt >= test_params.n_point) 125 | { 126 | n_check_pt = test_params.n_point; 127 | check_pt_s = 0; 128 | } else { 129 | srand(time(NULL)); 130 | check_pt_s = rand() % (test_params.n_point - n_check_pt); 131 | } 132 | printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1); 133 | 134 | DTYPE *x, *y0, *y1; 135 | x = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 136 | y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt); 137 | y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size); 138 | assert(x != NULL && y0 != NULL && y1 != NULL); 139 | for (int i = 0; i < test_params.krnl_mat_size; i++) 140 | { 141 | //x[i] = (DTYPE) pseudo_randn(); 
142 | x[i] = (DTYPE) drand48() - 0.5; 143 | } 144 | 145 | // Get reference results 146 | direct_nbody( 147 | test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 148 | test_params.coord, test_params.n_point, test_params.n_point, x, 149 | test_params.coord + check_pt_s, test_params.n_point, n_check_pt, y0 150 | ); 151 | 152 | // Warm up, reset timers, and test the matvec performance 153 | H2P_matvec(h2pack, x, y1); 154 | H2P_reset_timers(h2pack); 155 | for (int i = 0; i < 10; i++) 156 | H2P_matvec(h2pack, x, y1); 157 | 158 | H2P_print_statistic(h2pack); 159 | 160 | // Verify H2 matvec results 161 | DTYPE y0_norm = 0.0, err_norm = 0.0; 162 | for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++) 163 | { 164 | DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i]; 165 | y0_norm += y0[i] * y0[i]; 166 | err_norm += diff * diff; 167 | } 168 | y0_norm = DSQRT(y0_norm); 169 | err_norm = DSQRT(err_norm); 170 | printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm); 171 | 172 | free(x); 173 | free(y0); 174 | free(y1); 175 | free_aligned(test_params.coord); 176 | H2P_destroy(&h2pack); 177 | 178 | return 0; 179 | } 180 | -------------------------------------------------------------------------------- /extra/test_scalar_matmul.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack.h" 10 | #include "H2Pack_kernels.h" 11 | #include "H2Pack_utils.h" 12 | 13 | #include "parse_scalar_params.h" 14 | #include "direct_nbody.h" 15 | #include "test_H2_matmul.h" 16 | 17 | int main(int argc, char **argv) 18 | { 19 | srand48(time(NULL)); 20 | 21 | parse_scalar_params(argc, argv); 22 | 23 | H2Pack_p h2pack; 24 | double st, et; 25 | 26 | H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol); 27 | 28 | 
H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox); 29 | 30 | int max_leaf_points = 0; 31 | DTYPE max_leaf_size = 0.0; 32 | H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size); 33 | 34 | H2P_dense_mat_p *pp; 35 | st = get_wtime_sec(); 36 | H2P_generate_proxy_point_ID_file( 37 | h2pack, test_params.krnl_param, test_params.krnl_eval, 38 | test_params.pp_fname, &pp 39 | ); 40 | et = get_wtime_sec(); 41 | printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st); 42 | 43 | H2P_build( 44 | h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 45 | test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops 46 | ); 47 | 48 | int n_vecs[10] = {2, 2, 4, 8, 12, 16, 20, 24, 28, 32}; 49 | for (int i = 0; i < 10; i++) 50 | test_H2_matmul(h2pack, n_vecs[i]); 51 | 52 | h2pack->n_matvec = 0; // Skip printing matvec timings 53 | H2P_print_statistic(h2pack); 54 | 55 | free_aligned(test_params.coord); 56 | H2P_destroy(&h2pack); 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /pyh2pack/example.py: -------------------------------------------------------------------------------- 1 | import pyh2pack 2 | import numpy as np 3 | 4 | ''' 5 | NOTE: 6 | In Jupyter notebook, the outputs of `print_statistics/print_setting' might be redirected to terminals and will not be properly shown. 7 | Solution to this problem is to use package 'wurlitzer' 8 | Run `%load_ext wurlitzer` in Jupyeter. 
9 | ''' 10 | 11 | N = 80000 12 | krnl_dim = 1 13 | pt_dim = 3 14 | coord = np.random.uniform(0, 1, size=(pt_dim, N)) 15 | x = np.random.normal(size=(krnl_dim*N)) 16 | 17 | 18 | ''' 19 | Test without precomputed proxy points 20 | ''' 21 | # build 22 | krnl_param = np.array([1, -0.5]) 23 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param) 24 | # matvec 25 | y = A.matvec(x) 26 | # partial direct matvec 27 | start_pt = 8000 28 | end_pt = 9999 29 | z = A.direct_matvec(x, start_pt, end_pt) 30 | # print the matvec error in the partial results 31 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z)) 32 | # statistic info of pyh2pack performance 33 | A.print_statistic() 34 | A.print_setting() 35 | A.clean() 36 | 37 | 38 | 39 | ''' 40 | Test with precomputed proxy points 41 | ''' 42 | # path to the file of storing proxy points 43 | pp_fname = "./pp_tmp.dat" 44 | # build 45 | krnl_param = np.array([1,-0.5]) 46 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param, pp_filename=pp_fname) 47 | # matvec 48 | y = A.matvec(x) 49 | # partial direct matvec 50 | start_pt = 8000 51 | end_pt = 9999 52 | z = A.direct_matvec(x, start_pt, end_pt) 53 | # print the matvec error in the partial results 54 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z)) 55 | # statistic info of pyh2pack performance 56 | A.print_statistic() 57 | A.clean() 58 | 59 | 60 | ''' 61 | Test with matmul 62 | ''' 63 | # build 64 | krnl_param = np.array([1,-0.5]) 65 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param) 66 | # matmul 67 | nvec = 10 68 | xs = np.random.normal(size=(krnl_dim*N, nvec)) 69 | ys = A.matmul(xs) 70 | # partial direct sum 71 | zs = [] 72 | start_pt = 0 73 | 
end_pt = 999 74 | for i in range(nvec): 75 | zs.append(A.direct_matvec(xs[:,i], start_pt, end_pt)) 76 | zs = np.hstack([z[:,np.newaxis] for z in zs]) 77 | print(np.linalg.norm(ys[start_pt*krnl_dim:(end_pt+1)*krnl_dim, :] - zs, ord='fro') / np.linalg.norm(zs, ord='fro')) 78 | A.print_statistic() 79 | A.clean() 80 | 81 | 82 | 83 | ''' 84 | Test with direct matrix vector multiplication in pyh2pack 85 | ''' 86 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param) 87 | y = A.matvec(x) 88 | 89 | # partial direct matvec by class h2 variable. 90 | start_pt = 0 91 | end_pt = 999 92 | z = A.direct_matvec(x, start_pt, end_pt) 93 | 94 | # direct matvec via package method: kernel_matvec 95 | target_coord = coord[:, start_pt:(end_pt+1)] 96 | z0 = pyh2pack.kernel_matvec(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_dim=pt_dim, krnl_param=krnl_param, source=coord, target=target_coord, x_in=x) 97 | 98 | A_blk = pyh2pack.kernel_block(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_dim=pt_dim, krnl_param=krnl_param, source=coord, target=target_coord) 99 | z1 = np.matmul(A_blk, x) 100 | 101 | # check error 102 | print(np.linalg.norm(z - z0)) 103 | print(np.linalg.norm(z - z1)) 104 | A.print_statistic() 105 | A.clean() 106 | -------------------------------------------------------------------------------- /pyh2pack/example_hss.py: -------------------------------------------------------------------------------- 1 | import pyh2pack 2 | import numpy as np 3 | 4 | ''' 5 | NOTE: 6 | In Jupyter notebook, the outputs of `print_statistics/print_setting' might be redirected to terminals and will not be properly shown. 7 | Solution to this problem is to use package 'wurlitzer' 8 | Run `%load_ext wurlitzer` in Jupyeter. 
9 | ''' 10 | 11 | N = 40000 12 | krnl_dim = 1 13 | pt_dim = 2 14 | coord = np.random.uniform(0, N**(1.0/pt_dim), size=(pt_dim, N)) 15 | x = np.random.normal(size=(krnl_dim*N)) 16 | 17 | ''' 18 | Standard HSS 19 | ''' 20 | 21 | ## build 22 | krnl_param = np.array([1, -0.5]) 23 | A = pyh2pack.HSSMat(kernel="Quadratic_2D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, rel_tol=1e-3, krnl_param=krnl_param) 24 | # A = pyh2pack.HSSMat(kernel="Quadratic_2D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, rank=100, krnl_param=krnl_param) 25 | 26 | 27 | ## matvec 28 | y = A.matvec(x) 29 | # partial direct matvec 30 | start_pt = 6000 31 | end_pt = 9999 32 | z = A.direct_matvec(x, start_pt, end_pt) 33 | # print the matvec error in the partial results 34 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z)) 35 | 36 | ## ULV factorization 37 | diag_shift = 0.1 38 | A.factorize(is_cholesky=1, shift=diag_shift) 39 | 40 | ## solve based on ULV decomposition 41 | b = y + diag_shift * x 42 | x0 = A.solve(b) 43 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x0) / np.linalg.norm(x))) 44 | 45 | ## partial solve, A = LU, apply inv(L) first and then apply inv(U) 46 | z = A.solve(b, op="L") 47 | x1 = A.solve(z, op="U") 48 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x1) / np.linalg.norm(x))) 49 | 50 | ## statistic info of pyh2pack performance 51 | A.print_statistic() 52 | A.print_setting() 53 | A.clean() 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | ''' 62 | SPD HSS 63 | ''' 64 | 65 | ## build 66 | krnl_param = np.array([1, -0.5]) 67 | A = pyh2pack.HSSMat("Quadratic_2D", krnl_dim, coord, pt_dim, rel_tol=1e-6, krnl_param=krnl_param, spdhss=1, spdhss_shift=0.0, rank=100) 68 | # A = pyh2pack.HSSMat(kernel="Quadratic_2D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, rank=100, krnl_param=krnl_param) 69 | 70 | 71 | ## matvec 72 | y = A.matvec(x) 73 | # partial direct matvec 74 | start_pt = 
6000 75 | end_pt = 9999 76 | z = A.direct_matvec(x, start_pt, end_pt) 77 | # print the matvec error in the partial results 78 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z)) 79 | 80 | ## ULV factorization 81 | diag_shift = 0.0 82 | A.factorize(is_cholesky=1, shift=diag_shift) 83 | 84 | ## solve based on ULV decomposition 85 | b = y + diag_shift * x 86 | x0 = A.solve(b) 87 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x0) / np.linalg.norm(x))) 88 | 89 | ## partial solve, A = LU, apply inv(L) first and then apply inv(U) 90 | z = A.solve(b, op="L") 91 | x1 = A.solve(z, op="U") 92 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x1) / np.linalg.norm(x))) 93 | 94 | ## statistic info of pyh2pack performance 95 | A.print_statistic() 96 | A.print_setting() 97 | A.clean() 98 | 99 | -------------------------------------------------------------------------------- /pyh2pack/example_samplept.py: -------------------------------------------------------------------------------- 1 | import pyh2pack 2 | import numpy as np 3 | 4 | N = 80000 5 | krnl_dim = 1 6 | pt_dim = 3 7 | coord = np.random.uniform(0, 1, size=(pt_dim, N)) 8 | x = np.random.normal(size=(krnl_dim*N)) 9 | 10 | # build 11 | krnl_param = np.array([0.5]) 12 | A = pyh2pack.H2Mat(kernel="Gaussian_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param, sample_pt=1) 13 | # Coulomb kernel does not have krnl_param 14 | #A = pyh2pack.H2Mat(kernel="Coulomb_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-6, sample_pt=1) 15 | 16 | # show build settings 17 | A.print_setting() 18 | 19 | # matvec 20 | y = A.matvec(x) 21 | 22 | # partial direct matvec 23 | start_pt = 8000 24 | end_pt = 9999 25 | z = A.direct_matvec(x, start_pt, end_pt) 26 | 27 | # print the matvec relative error in the partial results 28 | relerr = 
np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z) 29 | print("H2 matvec relative error = %e\n" % relerr) 30 | 31 | # statistic info of pyh2pack performance 32 | A.print_statistic() 33 | 34 | # clean out 35 | A.clean() 36 | -------------------------------------------------------------------------------- /pyh2pack/readme.md: -------------------------------------------------------------------------------- 1 | ## Building and Installing PyH2Pack 2 | 3 | ### Intel compiler (ICC) + Intel MKL 4 | Use this command to compile: 5 | ```shell 6 | LDSHARED="icc -shared" CC=icc python3 setup_icc.py install 7 | ``` 8 | Before running the python code, you need to manually preload the following MKL file: 9 | ```shell 10 | # Check if $MKLROOT is set correctly 11 | # ls $MKLROOT/lib/intel64/libmkl_rt.so 12 | export LD_PRELOAD=$MKLROOT/lib/intel64/libmkl_rt.so 13 | ``` 14 | 15 | ### GNU compiler (GCC) + OpenBLAS 16 | 17 | Install or compile OpenBLAS first, then modify `setup.py` and update variable `OPENBLAS_INSTALL_DIR` according to the location OpenBLAS is installed. Use this command to compile: 18 | 19 | ```shell 20 | CC=gcc python3 setup.py install 21 | ``` 22 | 23 | If you see an error message like: 24 | 25 | ```text 26 | copying build/lib.linux-x86_64-3.8/pyh2pack.cpython-38-x86_64-linux-gnu.so -> /usr/local/lib/python3.8/dist-packages 27 | error: could not delete '/usr/local/lib/python3.8/dist-packages/pyh2pack.cpython-38-x86_64-linux-gnu.so': Permission denied 28 | ``` 29 | 30 | Then manually run: 31 | 32 | ```shell 33 | sudo cp build/lib.linux-x86_64-3.8/pyh2pack.cpython-38-x86_64-linux-gnu.so /usr/local/lib/python3.8/dist-packages 34 | ``` 35 | 36 | 37 | 38 | ## Using PyH2Pack 39 | 40 | See `example.py`. 41 | 42 | If you want to try the data-driven sample point method instead of the default proxy point / proxy surface method, see `example_samplept.py`. 
43 | -------------------------------------------------------------------------------- /pyh2pack/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup, Extension 2 | import os 3 | import numpy 4 | 5 | H2PACK_DIR = ".." 6 | OPENBLAS_INSTALL_DIR = "/usr/local/opt/openblas" 7 | #C_DIR = "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include" 8 | 9 | extra_cflags = ["-I"+H2PACK_DIR+"/include"] 10 | extra_cflags += ["-I"+OPENBLAS_INSTALL_DIR+"/include"] 11 | extra_cflags += ["-g", "-std=gnu99", "-O3"] 12 | extra_cflags += ["-DUSE_OPENBLAS", "-fopenmp", "-march=native"] 13 | extra_cflags += ["-Wno-unused-result", "-Wno-unused-function"] 14 | 15 | LIB = [H2PACK_DIR+"/lib/libH2Pack.a", OPENBLAS_INSTALL_DIR+"/lib/libopenblas.a"] 16 | extra_lflags = LIB + ["-g", "-O3", "-fopenmp", "-lm", "-lgfortran"] 17 | 18 | def main(): 19 | setup(name="pyh2pack", 20 | version="1.0.0", 21 | description="Python interface for H2Pack", 22 | author="Hua Huang, Xin Xing, and Edmond Chow", 23 | author_email="xxing02@gmail.com", 24 | ext_modules=[Extension( 25 | name = "pyh2pack", 26 | sources = ["pyh2pack.c"], 27 | include_dirs=[H2PACK_DIR+"/include", numpy.get_include()], 28 | extra_compile_args = extra_cflags, 29 | extra_link_args= extra_lflags, 30 | ) 31 | ] 32 | ) 33 | 34 | if __name__ == "__main__": 35 | main() 36 | -------------------------------------------------------------------------------- /pyh2pack/setup_icc.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup, Extension 2 | import os 3 | import numpy 4 | 5 | H2PACK_DIR = ".." 
6 | 7 | extra_cflags = ["-I"+H2PACK_DIR+"/include"] 8 | extra_cflags += ["-g", "-std=gnu99", "-O3"] 9 | extra_cflags += ["-DUSE_MKL", "-qopenmp", "-xHost", "-mkl"] 10 | 11 | LIB = [H2PACK_DIR+"/lib/libH2Pack.a"] 12 | extra_lflags = LIB + ["-g", "-O3", "-qopenmp", "-L${MKLROOT}/lib/intel64", "-mkl_rt", "-lpthread"] 13 | 14 | def main(): 15 | setup(name="pyh2pack", 16 | version="1.0.0", 17 | description="Python interface for H2Pack", 18 | author="Hua Huang, Xin Xing, and Edmond Chow", 19 | author_email="xxing02@gmail.com", 20 | ext_modules=[Extension( 21 | name = "pyh2pack", 22 | sources = ["pyh2pack.c"], 23 | include_dirs=[H2PACK_DIR+"/include", numpy.get_include()], 24 | extra_compile_args = extra_cflags, 25 | extra_link_args= extra_lflags, 26 | ) 27 | ] 28 | ) 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /src/AFN_precond.h: -------------------------------------------------------------------------------- 1 | #ifndef __AFN_PRECOND_H__ 2 | #define __AFN_PRECOND_H__ 3 | 4 | // Adaptive Factorized Nystrom preconditioner, ref: https://arxiv.org/pdf/2304.05460.pdf 5 | 6 | #include "H2Pack_typedef.h" 7 | 8 | struct AFN_precond 9 | { 10 | int is_nys, is_afn; // Whether to use Nystrom ot AFN 11 | int n; // Size of the kernel matrix, == number of points (does not support krnl_dim > 1 yet) 12 | int n1; // Size of K11 block (== global low-rank approximation rank) 13 | int n2; // == n - n1 14 | int est_rank; // Estimated rank 15 | int *perm; // Permutation array, size n 16 | DTYPE *px, *py; // Size n, permuted x and y in AFN_precond_apply 17 | DTYPE *t1, *t2; // Size n, intermediate vectors in AFN_precond_apply 18 | DTYPE *nys_U; // Size n * n1, row major, Nystrom basis 19 | DTYPE *nys_M; // Size n1, Nystrom eigenvalues + diagonal shift, then scaled 20 | int *afn_G_rowptr; // Size n2 + 1, AFN G matrix CSR row_ptr array 21 | int *afn_GT_rowptr; // Size n2 + 1, AFN G^T matrix CSR row_ptr 
array 22 | int *afn_G_colidx; // Size nnz, AFN G matrix CSR col_idx array 23 | int *afn_GT_colidx; // Size nnz, AFN G^T matrix CSR col_idx array 24 | DTYPE *afn_G_val; // Size nnz, AFN G matrix CSR values array 25 | DTYPE *afn_GT_val; // Size nnz, AFN G^T matrix CSR values array 26 | DTYPE *afn_invK11; // Size n1 * n1, row major, AFN K11^{-1} matrix 27 | DTYPE *afn_K12; // Size n1 * n2, row major, AFN K12 matrix 28 | 29 | // Timers for profiling 30 | int n_apply; 31 | double t_build, t_apply, t_rankest, t_fps, t_K11K12, t_nys; 32 | double t_afn, t_afn_mat, t_afn_knn, t_afn_fsai, t_afn_csr; 33 | }; 34 | typedef struct AFN_precond AFN_precond_s; 35 | typedef struct AFN_precond *AFN_precond_p; 36 | 37 | #ifdef __cplusplus 38 | extern "C" { 39 | #endif 40 | 41 | // Build an AFN preconditioner for a kernel matrix 42 | // Input parameters: 43 | // krnl_eval : Pointer to kernel matrix evaluation function 44 | // krnl_param : Pointer to kernel function parameter array 45 | // npt : Number of points in coord 46 | // pt_dim : Dimension of each point 47 | // coord : Matrix, size pt_dim-by-npt, coordinates of points 48 | // mu : Scalar, diagonal shift of the kernel matrix 49 | // max_k : Maximum global low-rank approximation rank 50 | // ss_npt : Number of points in the sampling set 51 | // fsai_npt : Maximum number of nonzeros in each row of the FSAI matrix 52 | // h2mat : Optional, pointer to an initialized H2Pack struct, used for FSAI KNN search 53 | // Output parameter: 54 | // AFN_precond_ : Pointer to an initialized AFN_precond struct 55 | void AFN_precond_build( 56 | kernel_eval_fptr krnl_eval, void *krnl_param, const int npt, const int pt_dim, 57 | const DTYPE *coord, const DTYPE mu, const int max_k, const int ss_npt, 58 | const int fsai_npt, void *h2mat, AFN_precond_p *AFN_precond_ 59 | ); 60 | 61 | // Destroy an initialized AFN_precond struct 62 | void AFN_precond_destroy(AFN_precond_p *AFN_precond_); 63 | 64 | // Apply an AFN preconditioner to a vector 65 | // Input 
parameters: 66 | // AFN_precond : Pointer to an initialized AFN_precond struct 67 | // x : Input vector, size n 68 | // Output parameter: 69 | // y : Output vector, size n 70 | void AFN_precond_apply(AFN_precond_p AFN_precond, const DTYPE *x, DTYPE *y); 71 | 72 | // Print statistics of an AFN_precond struct 73 | void AFN_precond_print_stat(AFN_precond_p AFN_precond); 74 | 75 | #ifdef __cplusplus 76 | } 77 | #endif 78 | 79 | #endif // End of "#ifndef __AFN_PRECOND_H__" 80 | -------------------------------------------------------------------------------- /src/DAG_task_queue.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #include "DAG_task_queue.h" 8 | 9 | // Initialize a DAG_task_queue structure with a DAG stored in CSR format. 10 | // DAG(i, j) is nonzero means that task j relies on task i. If DAG(i, i) is 11 | // nonzero, task i will be skipped. 12 | void DAG_task_queue_init( 13 | const int max_task_id, const int num_dep, const int *DAG_src_ptr, 14 | const int *DAG_dst_idx, DAG_task_queue_p *tq_ 15 | ) 16 | { 17 | DAG_task_queue_p tq = (DAG_task_queue_p) malloc(sizeof(DAG_task_queue_s)); 18 | assert(tq != NULL); 19 | 20 | // Allocate arrays in DAG_task_queue 21 | tq->DAG_src_ptr = (int*) malloc(sizeof(int) * (max_task_id + 1)); 22 | tq->DAG_dst_idx = (int*) malloc(sizeof(int) * num_dep); 23 | tq->indeg = (int*) malloc(sizeof(int) * max_task_id); 24 | tq->curr_indeg = (int*) malloc(sizeof(int) * max_task_id); 25 | tq->task_queue = (int*) malloc(sizeof(int) * max_task_id); 26 | assert(tq->DAG_src_ptr != NULL); 27 | assert(tq->DAG_dst_idx != NULL); 28 | assert(tq->indeg != NULL); 29 | assert(tq->curr_indeg != NULL); 30 | assert(tq->task_queue != NULL); 31 | 32 | // Copy DAG CSR matrix, count DAG vertex indegree and number of actual tasks 33 | if (num_dep != DAG_src_ptr[max_task_id]) 34 | { 35 | fprintf(stderr, "ERROR: num_dep != DAG_src_ptr[max_task_id] \n"); 
36 | return; 37 | } 38 | tq->max_task_id = max_task_id; 39 | tq->num_task = max_task_id; 40 | memcpy(tq->DAG_src_ptr, DAG_src_ptr, sizeof(int) * (max_task_id + 1)); 41 | memcpy(tq->DAG_dst_idx, DAG_dst_idx, sizeof(int) * num_dep); 42 | memset(tq->indeg, 0, sizeof(int) * max_task_id); 43 | for (int i = 0; i < max_task_id; i++) 44 | { 45 | for (int j = tq->DAG_src_ptr[i]; j < tq->DAG_src_ptr[i + 1]; j++) 46 | { 47 | int dst = tq->DAG_dst_idx[j]; 48 | if (dst == i) tq->num_task--; // Task i relies on task i 49 | tq->indeg[dst]++; 50 | } 51 | } 52 | 53 | DAG_task_queue_reset(tq); 54 | 55 | *tq_ = tq; 56 | } 57 | 58 | // Destroy a DAG_task_queue structure 59 | void DAG_task_queue_destroy(DAG_task_queue_p *tq_) 60 | { 61 | DAG_task_queue_p tq = *tq_; 62 | if (tq == NULL) return; 63 | free(tq->DAG_src_ptr); 64 | free(tq->DAG_dst_idx); 65 | free(tq->indeg); 66 | free(tq->curr_indeg); 67 | free(tq->task_queue); 68 | free(tq); 69 | *tq_ = NULL; 70 | } 71 | 72 | // Get a new task from a DAG_task_queue structure and update its task queue. 73 | // This function can be called by multiple threads at the same time. 
74 | int DAG_task_queue_get_task(DAG_task_queue_p tq) 75 | { 76 | if (tq == NULL) return -1; 77 | 78 | // Get current task queue head index and increment it 79 | // If all tasks are finished, return directly 80 | int task_head = __atomic_fetch_add(&tq->task_head, 1, __ATOMIC_SEQ_CST); 81 | if (task_head >= tq->num_task) return -1; 82 | 83 | // Atomic load the task id, task_id = -1 means the task_head-th task is not 84 | // available yet, otherwise we have a valid task_id and return 85 | int task_id = __atomic_load_n(&tq->task_queue[task_head], __ATOMIC_SEQ_CST); 86 | while (task_id == -1) 87 | { 88 | //usleep(10); 89 | task_id = __atomic_load_n(&tq->task_queue[task_head], __ATOMIC_SEQ_CST); 90 | } 91 | //if (task_id == -1) printf("[Warning] task_head = %d, task_id = -1\n", task_head); 92 | return task_id; 93 | } 94 | 95 | // Finish a task and push new available tasks to a DAG_task_queue task queue. 96 | // This function can be called by multiple threads at the same time. 97 | void DAG_task_queue_finish_task(DAG_task_queue_p tq, const int task_id) 98 | { 99 | if (tq == NULL) return; 100 | for (int j = tq->DAG_src_ptr[task_id]; j < tq->DAG_src_ptr[task_id + 1]; j++) 101 | { 102 | // For a destination vertex, subtract its current indegree count by 1 103 | // and get its new indegree count to see if it is available now 104 | int dst = tq->DAG_dst_idx[j]; 105 | int dst_indeg = __atomic_sub_fetch(tq->curr_indeg + dst, 1, __ATOMIC_SEQ_CST); 106 | 107 | // If the destination vertex is now available, push it to task queue 108 | if (dst_indeg == 0) 109 | { 110 | int task_tail = __atomic_fetch_add(&tq->task_tail, 1, __ATOMIC_SEQ_CST); 111 | __atomic_store_n(&tq->task_queue[task_tail], dst, __ATOMIC_SEQ_CST); 112 | } 113 | //if (dst_indeg < 0) printf("Warning: from task %d, set %d indeg = %d\n", task_id, dst, dst_indeg); 114 | } 115 | } 116 | 117 | // Reset the task queue in a DAG_task_queue structure. 
118 | void DAG_task_queue_reset(DAG_task_queue_p tq) 119 | { 120 | if (tq == NULL) return; 121 | tq->task_head = 0; 122 | tq->task_tail = 0; 123 | for (int i = 0; i < tq->max_task_id; i++) 124 | { 125 | tq->curr_indeg[i] = tq->indeg[i]; 126 | tq->task_queue[i] = -1; // Mark the i-th queue slot as unavailable 127 | if (tq->indeg[i] == 0) // Task i has no dependency, it is available immediately 128 | { 129 | tq->task_queue[tq->task_tail] = i; 130 | tq->task_tail++; 131 | } 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /src/DAG_task_queue.h: -------------------------------------------------------------------------------- 1 | #ifndef __DAG_TASK_QUEUE_H__ 2 | #define __DAG_TASK_QUEUE_H__ 3 | 4 | struct DAG_task_queue 5 | { 6 | int max_task_id; // Max task id + 1 7 | int num_task; // Number of actual tasks 8 | int task_head; // Head index of currently available tasks in the queue 9 | int task_tail; // Tail index of currently available tasks in the queue 10 | int *DAG_src_ptr; // Size max_task_id+1, DAG CSR matrix row_ptr array 11 | int *DAG_dst_idx; // Size DAG_src_ptr[max_task_id] (number of dependencies), DAG CSR matrix col_idx array 12 | int *indeg; // Size max_task_id, indegree of DAG vertexes 13 | int *curr_indeg; // Size max_task_id, remaining indegree of DAG vertexes while tasks are running 14 | int *task_queue; // Size max_task_id, task queue, -1 marks a slot that is not available yet 15 | }; 16 | typedef struct DAG_task_queue DAG_task_queue_s; 17 | typedef struct DAG_task_queue* DAG_task_queue_p; 18 | 19 | #ifdef __cplusplus 20 | extern "C" { 21 | #endif 22 | 23 | // Initialize a DAG_task_queue structure with a DAG stored in CSR format. 24 | // DAG(i, j) is nonzero means that task j relies on task i. If DAG(i, i) is 25 | // nonzero, task i will be skipped. 
26 | // Input parameters: 27 | // max_task_id : Max task id + 1, tasks are indexed from 0 to max_task_id-1 28 | // num_dep : Number of dependencies (nonzeros in DAG matrix) 29 | // DAG_src_ptr : Size max_task_id+1, CSR matrix row_ptr array 30 | // DAG_dst_idx : Size num_dep, CSR matrix col_idx array 31 | // Output parameter: 32 | // *tq_ : Pointer to an initialized DAG_task_queue structure 33 | void DAG_task_queue_init( 34 | const int max_task_id, const int num_dep, const int *DAG_src_ptr, 35 | const int *DAG_dst_idx, DAG_task_queue_p *tq_ 36 | ); 37 | 38 | // Destroy a DAG_task_queue structure. 39 | // Input parameter: 40 | // tq_ : Pointer to the DAG_task_queue structure to be destroyed, *tq_ is set to NULL afterwards 41 | void DAG_task_queue_destroy(DAG_task_queue_p *tq_); 42 | 43 | // Get a new task from a DAG_task_queue structure and update its task queue. 44 | // This function can be called by multiple threads at the same time. 45 | // Input parameter: 46 | // tq : Target DAG_task_queue structure 47 | // Output parameters: 48 | // tq : Target DAG_task_queue structure with updated task queue info 49 | // <return> : Index of the new task. -1 means all tasks have been handed out or tq is NULL. 50 | int DAG_task_queue_get_task(DAG_task_queue_p tq); 51 | 52 | // Finish a task and push new available tasks to a DAG_task_queue task queue. 53 | // This function can be called by multiple threads at the same time. 54 | // Input parameters: 55 | // tq : Target DAG_task_queue structure 56 | // task_id : Index of the finished task 57 | // Output parameter: 58 | // tq : Target DAG_task_queue structure with updated task queue info 59 | void DAG_task_queue_finish_task(DAG_task_queue_p tq, const int task_id); 60 | 61 | // Reset the task queue in a DAG_task_queue structure. 
62 | // Input parameter: 63 | // tq : Target DAG_task_queue structure 64 | // Output parameter: 65 | // tq : Target DAG_task_queue structure with updated task queue info 66 | void DAG_task_queue_reset(DAG_task_queue_p tq); 67 | 68 | #ifdef __cplusplus 69 | } 70 | #endif 71 | 72 | #endif // End of "#ifndef __DAG_TASK_QUEUE_H__" 73 | 74 | -------------------------------------------------------------------------------- /src/GCC-OpenBLAS.make: -------------------------------------------------------------------------------- 1 | CC = gcc 2 | USE_MKL = 0 3 | USE_OPENBLAS = 1 4 | 5 | include common.make 6 | 7 | # GCC 10 need to manually specify using SVE, -march=native is not enough 8 | # On A64FX SVE vector bits = 512, on other SVE supported processors this value might be different 9 | USE_AARCH64_SVE = 0 10 | SVE_VECTOR_BITS = 512 11 | ifeq ($(strip $(USE_AARCH64_SVE)), 1) 12 | CFLAGS := $(subst -march=native, -march=armv8.2-a+sve -msve-vector-bits=$(SVE_VECTOR_BITS), $(CFLAGS)) 13 | endif -------------------------------------------------------------------------------- /src/H2Pack.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_H__ 2 | #define __H2PACK_H__ 3 | 4 | // H2Pack configurations 5 | #include "H2Pack_config.h" 6 | 7 | // H2Pack data structure 8 | #include "H2Pack_typedef.h" 9 | 10 | // H2Pack auxiliary data structures 11 | #include "H2Pack_aux_structs.h" 12 | 13 | // H2Pack hierarchical point partitioning 14 | #include "H2Pack_partition.h" 15 | 16 | // H2Pack hierarchical point partitioning for periodic system 17 | #include "H2Pack_partition_periodic.h" 18 | 19 | // H2Pack interpolative decomposition compression 20 | #include "H2Pack_ID_compress.h" 21 | 22 | // H2Pack generate proxy points 23 | #include "H2Pack_gen_proxy_point.h" 24 | 25 | // H2Pack build H2/HSS representation 26 | #include "H2Pack_build.h" 27 | 28 | // H2Pack build H2 representation for periodic system 29 | #include 
"H2Pack_build_periodic.h" 30 | 31 | // H2Pack H2/HSS fast matrix-vector multiplication 32 | #include "H2Pack_matvec.h" 33 | 34 | // H2Pack H2 fast matrix-vector multiplication for periodic system 35 | #include "H2Pack_matvec_periodic.h" 36 | 37 | // H2Pack H2/HSS fast matrix-matrix multiplication 38 | #include "H2Pack_matmul.h" 39 | 40 | // H2Pack H2 fast matrix-matrix multiplication for periodic system 41 | #include "H2Pack_matmul_periodic.h" 42 | 43 | // H2Pack HSS ULV decomposition and solve 44 | #include "H2Pack_HSS_ULV.h" 45 | 46 | // H2Pack SPDHSS H2 build 47 | #include "H2Pack_SPDHSS_H2.h" 48 | 49 | // H2Pack file IO 50 | #include "H2Pack_file_IO.h" 51 | 52 | // H2Pack build H2/HSS representation with sample points 53 | #include "H2Pack_build_with_sample_point.h" 54 | 55 | // Linear algebra library (BLAS, LAPACK) wrapper header 56 | #include "linalg_lib_wrapper.h" 57 | 58 | // Vector wrapper function wrapper 59 | #include "ASTER/include/aster.h" 60 | 61 | // Helper functions 62 | #include "utils.h" 63 | 64 | #endif 65 | -------------------------------------------------------------------------------- /src/H2Pack_HSS_ULV.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_HSS_ULV_H__ 2 | #define __H2PACK_HSS_ULV_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // Construct the ULV LU factorization for a HSS matrix 12 | // Input parameters: 13 | // h2pack : H2Pack structure with constructed HSS representation 14 | // shift : Shift coefficient k to make (A + k * I) non-singular 15 | // Output parameter: 16 | // h2pack : H2Pack structure with ULV LU factorization 17 | void H2P_HSS_ULV_LU_factorize(H2Pack_p h2pack, const DTYPE shift); 18 | 19 | // Solve the linear system A_{HSS} * x = b using the HSS ULV LU factorization, 20 | // where A_{HSS} = L_{HSS} * U_{HSS}. 
21 | // Input parameters: 22 | // h2pack : H2Pack structure with ULV LU factorization 23 | // op : Operation type, 1, 2, or 3 24 | // b : Size >= h2pack->krnl_mat_size, right-hand side vector 25 | // Output parameter: 26 | // x : Size >= h2pack->krnl_mat_size, solution vector. 27 | // If op == 1, x satisfies L_{HSS} * x = b. 28 | // If op == 2, x satisfies U_{HSS} * x = b. 29 | // If op == 3, x satisfies A_{HSS} * x = b. 30 | void H2P_HSS_ULV_LU_solve(H2Pack_p h2pack, const int op, const DTYPE *b, DTYPE *x); 31 | 32 | // Construct the ULV Cholesky factorization for a HSS matrix 33 | // Input parameters: 34 | // h2pack : H2Pack structure with constructed HSS representation 35 | // shift : Shift coefficient k to make (A + k * I) S.P.D. 36 | // Output parameter: 37 | // h2pack : H2Pack structure with ULV Cholesky factorization 38 | void H2P_HSS_ULV_Cholesky_factorize(H2Pack_p h2pack, const DTYPE shift); 39 | 40 | // Solve the linear system A_{HSS} * x = b using the HSS ULV Cholesky factorization, 41 | // where A_{HSS} = L_{HSS} * L_{HSS}^T. 42 | // Input parameters: 43 | // h2pack : H2Pack structure with ULV Cholesky factorization 44 | // op : Operation type, 1, 2, or 3 45 | // b : Size >= h2pack->krnl_mat_size, right-hand side vector 46 | // Output parameter: 47 | // x : Size >= h2pack->krnl_mat_size, solution vector. 48 | // If op == 1, x satisfies L_{HSS}^T * x = b. 49 | // If op == 2, x satisfies L_{HSS} * x = b. 50 | // If op == 3, x satisfies A_{HSS} * x = b. 
51 | void H2P_HSS_ULV_Cholesky_solve(H2Pack_p h2pack, const int op, const DTYPE *b, DTYPE *x); 52 | 53 | #ifdef __cplusplus 54 | } 55 | #endif 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /src/H2Pack_ID_compress.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_ID_COMPRESS_H__ 2 | #define __H2PACK_ID_COMPRESS_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_aux_structs.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // Interpolative Decomposition (ID) using partial QR over rows of a target 12 | // matrix. Partial pivoting QR may need to be upgraded to SRRQR later. 13 | // Given an m*n matrix A, an rank-k ID approximation of A is of form 14 | // A = U * A(J, :) 15 | // where J is a row index subset of size k, and U is a m*k matrix (if 16 | // SRRQR is used, entries of U are bounded by a parameter 'f'). A(J,:) 17 | // and U are usually called the skeleton and projection matrix. 18 | // Input parameters: 19 | // A : Target matrix, stored in row major 20 | // stop_type : Partial QR stop criteria: QR_RANK, QR_REL_NRM, or QR_ABS_NRM 21 | // stop_param : Pointer to partial QR stop parameter 22 | // n_thread : Number of threads used in this function 23 | // QR_buff : Working buffer for partial pivoting QR. If kdim == 1, size A->nrow. 24 | // If kdim > 1, size (2*kdim+2)*A->ncol + (kdim+1)*A->nrow. 25 | // ID_buff : Size 4 * A->nrow, working buffer for ID compression 26 | // kdim : Dimension of tensor kernel's return (column block size) 27 | // Output parameters: 28 | // U_ : Projection matrix, will be initialized in this function. If U_ == NULL, 29 | // the projection matrix will not be calculated. 
30 | // J : Row indices of the skeleton A 31 | void H2P_ID_compress( 32 | H2P_dense_mat_p A, const int stop_type, void *stop_param, H2P_dense_mat_p *U_, 33 | H2P_int_vec_p J, const int n_thread, DTYPE *QR_buff, int *ID_buff, const int kdim 34 | ); 35 | 36 | #ifdef __cplusplus 37 | } 38 | #endif 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /src/H2Pack_SPDHSS_H2.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_SPDHSS_H2_H__ 2 | #define __H2PACK_SPDHSS_H2_H__ 3 | 4 | #include "H2Pack_typedef.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | // Construct an SPD HSS matrix from a H2 matrix 11 | // Input parameters: 12 | // max_rank : Maximum rank of the HSS matrix 13 | // reltol : Relative tolerance in column-pivoted QR 14 | // shift : Diagonal shifting 15 | // h2mat : Constructed H2 matrix 16 | // Output parameter: 17 | // *hssmat_ : The constructed SPD HSS matrix, A_{HSS} ~= A_{H2} + shift * I 18 | void H2P_SPDHSS_H2_build( 19 | const int max_rank, const DTYPE reltol, const DTYPE shift, 20 | H2Pack_p h2mat, H2Pack_p *hssmat_ 21 | ); 22 | 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | 27 | #endif 28 | 29 | -------------------------------------------------------------------------------- /src/H2Pack_build.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_BUILD_H__ 2 | #define __H2PACK_BUILD_H__ 3 | 4 | #include "H2Pack_typedef.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | // Build H2 representation with a kernel function 11 | // Input parameters: 12 | // h2pack : H2Pack structure with point partitioning info 13 | // pp : Array of proxy points for each level 14 | // BD_JIT : 0 or 1, if B and D matrices are computed just-in-time in matvec 15 | // krnl_param : Pointer to kernel function parameter array 16 | // krnl_eval : Pointer to kernel matrix evaluation function 17 
| // krnl_bimv : Pointer to kernel matrix bi-matvec function 18 | // krnl_bimv_flops : FLOPs needed in kernel bi-matvec 19 | // Output parameter: 20 | // h2pack : H2Pack structure with H2 representation matrices 21 | void H2P_build( 22 | H2Pack_p h2pack, H2P_dense_mat_p *pp, const int BD_JIT, void *krnl_param, 23 | kernel_eval_fptr krnl_eval, kernel_bimv_fptr krnl_bimv, const int krnl_bimv_flops 24 | ); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/H2Pack_build_periodic.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "H2Pack_config.h" 10 | #include "H2Pack_typedef.h" 11 | #include "H2Pack_aux_structs.h" 12 | #include "H2Pack_build_periodic.h" 13 | #include "H2Pack_utils.h" 14 | #include "utils.h" 15 | 16 | // Build periodic block for root node 17 | void H2P_build_periodic_block(H2Pack_p h2pack) 18 | { 19 | int pt_dim = h2pack->pt_dim; 20 | int xpt_dim = h2pack->xpt_dim; 21 | int krnl_dim = h2pack->krnl_dim; 22 | int root_idx = h2pack->root_idx; 23 | int n_lattice = h2pack->n_lattice; 24 | void *krnl_param = h2pack->krnl_param; 25 | void *pkrnl_param = h2pack->pkrnl_param; 26 | DTYPE *enbox0_width = h2pack->enbox + (root_idx * (2 * pt_dim) + pt_dim); 27 | DTYPE *per_lattices = h2pack->per_lattices; 28 | H2P_dense_mat_p root_J_coord = h2pack->J_coord[root_idx]; 29 | H2P_dense_mat_p root_J_coord_s = h2pack->tb[0]->mat0; 30 | H2P_dense_mat_p krnl_mat_blk = h2pack->tb[0]->mat1; 31 | kernel_eval_fptr krnl_eval = h2pack->krnl_eval; 32 | kernel_eval_fptr pkrnl_eval = h2pack->pkrnl_eval; 33 | 34 | int n_point_root = root_J_coord->ncol; 35 | int per_blk_size = n_point_root * krnl_dim; 36 | DTYPE *per_blk = (DTYPE*) malloc_aligned(sizeof(DTYPE) * per_blk_size * per_blk_size, 64); 37 | ASSERT_PRINTF(per_blk != NULL, "Failed to 
allocate periodic block of size %d^2\n", per_blk_size); 38 | 39 | // O = pkernel({root_J_coord, root_J_coord}); 40 | pkrnl_eval( 41 | root_J_coord->data, root_J_coord->ld, root_J_coord->ncol, 42 | root_J_coord->data, root_J_coord->ld, root_J_coord->ncol, 43 | pkrnl_param, per_blk, per_blk_size 44 | ); 45 | DTYPE shift[8] = {0, 0, 0, 0, 0, 0, 0, 0}; 46 | H2P_dense_mat_resize(krnl_mat_blk, per_blk_size, per_blk_size); 47 | H2P_dense_mat_resize(root_J_coord_s, xpt_dim, n_point_root); 48 | copy_matrix( 49 | sizeof(DTYPE), xpt_dim, n_point_root, root_J_coord->data, root_J_coord->ld, 50 | root_J_coord_s->data, root_J_coord_s->ld, 0 51 | ); 52 | for (int l = 0; l < n_lattice; l++) 53 | { 54 | // shift = lattice(l, 1 : pt_dim) .* root_box(pt_dim+1 : 2 * pt_dim); 55 | // shift = [shift, zeros(1, xpt_dim - pt_dim)]; 56 | DTYPE *lattice_l = per_lattices + l * pt_dim; 57 | for (int j = 0; j < pt_dim; j++) shift[j] = enbox0_width[j] * lattice_l[j]; 58 | // root_J_coord_s = coord_shift(root_J_coord, shift, 1); 59 | H2P_shift_coord(root_J_coord_s, shift, 1.0); 60 | // O = O - kernel({root_J_coord, root_J_coord_s}); 61 | krnl_eval( 62 | root_J_coord->data, root_J_coord->ld, root_J_coord->ncol, 63 | root_J_coord_s->data, root_J_coord_s->ld, root_J_coord->ncol, 64 | krnl_param, krnl_mat_blk->data, krnl_mat_blk->ld 65 | ); 66 | #pragma omp simd 67 | for (int i = 0; i < per_blk_size * per_blk_size; i++) 68 | per_blk[i] -= krnl_mat_blk->data[i]; 69 | // Reset root_J_coord_s = root_J_coord 70 | H2P_shift_coord(root_J_coord_s, shift, -1.0); 71 | } 72 | 73 | h2pack->per_blk = per_blk; 74 | } 75 | 76 | // Build H2 representation with a regular kernel function and 77 | // a periodic system kernel (Ewald summation) function 78 | void H2P_build_periodic( 79 | H2Pack_p h2pack, H2P_dense_mat_p *pp, const int BD_JIT, 80 | void *krnl_param, kernel_eval_fptr krnl_eval, 81 | void *pkrnl_param, kernel_eval_fptr pkrnl_eval, 82 | kernel_mv_fptr krnl_mv, const int krnl_mv_flops 83 | ) 84 | { 85 | 
double st, et; 86 | double *timers = h2pack->timers; 87 | 88 | if (pp == NULL) 89 | { 90 | ERROR_PRINTF("You need to provide a set of proxy points.\n"); 91 | return; 92 | } 93 | 94 | if (krnl_eval == NULL) 95 | { 96 | ERROR_PRINTF("You need to provide a valid krnl_eval().\n"); 97 | return; 98 | } 99 | if (pkrnl_eval == NULL) { ERROR_PRINTF("You need to provide a valid pkrnl_eval().\n"); return; } // pkrnl_eval() is always called when building the periodic block in step 4 100 | if (BD_JIT != 1) 101 | { 102 | ERROR_PRINTF("Only support BD_JIT=1 in this function for the moment.\n"); 103 | return; 104 | } 105 | 106 | h2pack->pp = pp; 107 | h2pack->BD_JIT = BD_JIT; 108 | h2pack->krnl_param = krnl_param; 109 | h2pack->krnl_eval = krnl_eval; 110 | h2pack->pkrnl_param = pkrnl_param; 111 | h2pack->pkrnl_eval = pkrnl_eval; 112 | h2pack->krnl_mv = krnl_mv; 113 | h2pack->krnl_bimv_flops = krnl_mv_flops - 2; 114 | if (BD_JIT == 1 && krnl_mv == NULL) 115 | WARNING_PRINTF("krnl_eval() will be used in BD_JIT matvec. For better performance, consider using a krnl_mv().\n"); 116 | 117 | // 1. Build projection matrices and skeleton row sets 118 | st = get_wtime_sec(); 119 | H2P_build_H2_UJ_proxy(h2pack); 120 | et = get_wtime_sec(); 121 | timers[U_BUILD_TIMER_IDX] = et - st; 122 | 123 | // 2. Generate H2 generator matrices metadata 124 | st = get_wtime_sec(); 125 | H2P_generate_B_metadata(h2pack); 126 | et = get_wtime_sec(); 127 | timers[B_BUILD_TIMER_IDX] = et - st; 128 | 129 | // 3. Generate H2 dense blocks metadata 130 | st = get_wtime_sec(); 131 | H2P_generate_D_metadata(h2pack); 132 | et = get_wtime_sec(); 133 | timers[D_BUILD_TIMER_IDX] = et - st; 134 | 135 | // 4. Build periodic block for root node, add its timing to B build timing 136 | st = get_wtime_sec(); 137 | H2P_build_periodic_block(h2pack); 138 | et = get_wtime_sec(); 139 | timers[B_BUILD_TIMER_IDX] += et - st; // Accumulate, do not overwrite the step 2 timing 140 | 141 | // 5. 
Set up forward and backward permutation indices 142 | int n_point = h2pack->n_point; 143 | int krnl_dim = h2pack->krnl_dim; 144 | int *coord_idx = h2pack->coord_idx; 145 | int *fwd_pmt_idx = (int*) malloc(sizeof(int) * n_point * krnl_dim); 146 | int *bwd_pmt_idx = (int*) malloc(sizeof(int) * n_point * krnl_dim); 147 | for (int i = 0; i < n_point; i++) 148 | { 149 | for (int j = 0; j < krnl_dim; j++) 150 | { 151 | fwd_pmt_idx[i * krnl_dim + j] = coord_idx[i] * krnl_dim + j; 152 | bwd_pmt_idx[coord_idx[i] * krnl_dim + j] = i * krnl_dim + j; 153 | } 154 | } 155 | h2pack->fwd_pmt_idx = fwd_pmt_idx; 156 | h2pack->bwd_pmt_idx = bwd_pmt_idx; 157 | } 158 | -------------------------------------------------------------------------------- /src/H2Pack_build_periodic.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_BUILD_PERIODIC_H__ 2 | #define __H2PACK_BUILD_PERIODIC_H__ 3 | 4 | #include "H2Pack_typedef.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | // Build H2 representation with a regular kernel function and 11 | // a periodic system kernel (Ewald summation) function 12 | // Input parameters: 13 | // h2pack : H2Pack structure with point partitioning info 14 | // pp : Array of proxy points for each level 15 | // BD_JIT : 0 or 1, if B and D matrices are computed just-in-time in matvec 16 | // krnl_param : Pointer to kernel function parameter array 17 | // krnl_eval : Pointer to kernel matrix evaluation function 18 | // pkrnl_param : Pointer to periodic system kernel (Ewald summation) function parameter array 19 | // pkrnl_eval : Pointer to periodic system kernel (Ewald summation) matrix evaluation function 20 | // krnl_mv : Pointer to kernel matvec function 21 | // krnl_mv_flops : FLOPs needed in kernel bi-matvec 22 | // Output parameter: 23 | // h2pack : H2Pack structure with H2 representation matrices 24 | void H2P_build_periodic( 25 | H2Pack_p h2pack, H2P_dense_mat_p *pp, const int BD_JIT, 26 | 
void *krnl_param, kernel_eval_fptr krnl_eval, 27 | void *pkrnl_param, kernel_eval_fptr pkrnl_eval, 28 | kernel_mv_fptr krnl_mv, const int krnl_mv_flops 29 | ); 30 | 31 | #ifdef __cplusplus 32 | } 33 | #endif 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /src/H2Pack_build_with_sample_point.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_BUILD_WITH_SAMPLE_POINT_H__ 2 | #define __H2PACK_BUILD_WITH_SAMPLE_POINT_H__ 3 | 4 | #include "H2Pack_typedef.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | // Select sample points for constructing H2 projection and skeleton matrices 11 | // This algorithm is based on the MATLAB code provided by the author of the paper 12 | // doi/10.1109/IPDPS47924.2020.00082, but the algorithm is not discussed in the paper 13 | // Input parameters: 14 | // h2pack : Initialized H2Pack structure 15 | // krnl_param : Pointer to kernel function parameter array 16 | // krnl_eval : Pointer to kernel matrix evaluation function 17 | // tau : Separation threshold, usually is 0.7 18 | // Output parameter: 19 | // *sample_points_ : Array of sample points for each node 20 | void H2P_select_sample_point( 21 | H2Pack_p h2pack, const void *krnl_param, kernel_eval_fptr krnl_eval, 22 | const DTYPE tau, H2P_dense_mat_p **sample_points_ 23 | ); 24 | 25 | // Build H2 representation with a kernel function and sample points 26 | // Input parameters: 27 | // h2pack : H2Pack structure with point partitioning info 28 | // sample_pt : Array of sample points for each node 29 | // BD_JIT : 0 or 1, if B and D matrices are computed just-in-time in matvec 30 | // krnl_param : Pointer to kernel function parameter array 31 | // krnl_eval : Pointer to kernel matrix evaluation function 32 | // krnl_bimv : Pointer to kernel matrix bi-matvec function 33 | // krnl_bimv_flops : FLOPs needed in kernel bi-matvec 34 | // Output parameter: 35 | // h2pack : 
H2Pack structure with H2 representation matrices 36 | void H2P_build_with_sample_point( 37 | H2Pack_p h2pack, H2P_dense_mat_p *sample_pt, const int BD_JIT, void *krnl_param, 38 | kernel_eval_fptr krnl_eval, kernel_bimv_fptr krnl_bimv, const int krnl_bimv_flops 39 | ); 40 | 41 | #ifdef __cplusplus 42 | } 43 | #endif 44 | 45 | #endif 46 | -------------------------------------------------------------------------------- /src/H2Pack_config.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_CONFIG_H__ 2 | #define __H2PACK_CONFIG_H__ 3 | 4 | // Parameters used in H2Pack 5 | 6 | #define DOUBLE_SIZE 8 7 | #define FLOAT_SIZE 4 8 | 9 | #ifndef DTYPE_SIZE 10 | #define DTYPE_SIZE DOUBLE_SIZE // Matrix data type: double or float 11 | #endif 12 | 13 | #if DTYPE_SIZE == DOUBLE_SIZE // Macros for double data type 14 | #define DTYPE double // Data type 15 | #define DTYPE_FMTSTR "%lf" // Data type format string 16 | #define DABS fabs // Abs function 17 | #define DLOG log // Natural logarithm function 18 | #define DLOG2 log2 // Base-2 logarithm function 19 | #define DEXP exp // Exponential function 20 | #define DPOW pow // Power function 21 | #define DSQRT sqrt // Sqrt function 22 | #define DSIN sin // Sine function 23 | #define DCOS cos // Cosine function 24 | #define DERF erf // Erf function 25 | #define DERFC erfc // Erfc function 26 | #define DFLOOR floor // Floor function 27 | #define DROUND round // Rounding function 28 | #define DCEIL ceil // Ceiling function 29 | #define DFMOD fmod // Floating point remainder function 30 | #define CBLAS_NRM2 cblas_dnrm2 // CBLAS vector 2-norm 31 | #define CBLAS_DOT cblas_ddot // CBLAS vector dot product 32 | #define CBLAS_GEMV cblas_dgemv // CBLAS matrix-vector multiplication 33 | #define CBLAS_GEMM cblas_dgemm // CBLAS matrix-matrix multiplication 34 | #define CBLAS_SYRK cblas_dsyrk // CBLAS symmetric rank-k update 35 | #define CBLAS_TRSM cblas_dtrsm // CBLAS triangular solve 36 | #define 
CBLAS_TRMM cblas_dtrmm // CBLAS triangular matrix multiplication 37 | #define LAPACK_GETRF LAPACKE_dgetrf // LAPACK LU factorization 38 | #define LAPACK_GETRS LAPACKE_dgetrs // LAPACK linear system solve using LU factorization 39 | #define LAPACK_GETRI LAPACKE_dgetri // LAPACK LU inverse matrix 40 | #define LAPACK_POTRF LAPACKE_dpotrf // LAPACK Cholesky factorization 41 | #define LAPACK_POTRS LAPACKE_dpotrs // LAPACK linear system solve using Cholesky factorization 42 | #define LAPACK_POTRI LAPACKE_dpotri // LAPACK Cholesky inverse matrix 43 | #define LAPACK_GEQRF LAPACKE_dgeqrf // LAPACK QR factorization 44 | #define LAPACK_GEQPF LAPACKE_dgeqpf // LAPACK QR factorization with column pivoting 45 | #define LAPACK_ORGQR LAPACKE_dorgqr // LAPACK QR Q matrix explicit construction 46 | #define LAPACK_ORMQR LAPACKE_dormqr // LAPACK QR Q matrix multiplies another matrix 47 | #define LAPACK_SYEVD LAPACKE_dsyevd // LAPACK eigenvalue decomposition 48 | #define LAPACK_GESVD LAPACKE_dgesvd // LAPACK singular value decomposition 49 | #define N_DTYPE_64B 8 // 8 double == 64 bytes, for alignment 50 | #define SIMD_LEN SIMD_LEN_D // SIMD vector length 51 | #define D_EPS DBL_EPSILON // Double precision machine epsilon 52 | #define ASTER_DTYPE_DOUBLE 53 | #endif 54 | 55 | 56 | #if DTYPE_SIZE == FLOAT_SIZE // Macros for float data type 57 | #define DTYPE float 58 | #define DTYPE_FMTSTR "%f" 59 | #define DABS fabsf 60 | #define DLOG logf 61 | #define DLOG2 log2f 62 | #define DEXP expf 63 | #define DPOW powf 64 | #define DSQRT sqrtf 65 | #define DSIN sinf 66 | #define DCOS cosf 67 | #define DERF erff 68 | #define DERFC erfcf 69 | #define DFLOOR floorf 70 | #define DROUND roundf 71 | #define DFMOD fmodf 72 | #define DCEIL ceilf 73 | #define CBLAS_NRM2 cblas_snrm2 74 | #define CBLAS_DOT cblas_sdot 75 | #define CBLAS_GEMV cblas_sgemv 76 | #define CBLAS_GEMM cblas_sgemm 77 | #define CBLAS_SYRK cblas_ssyrk 78 | #define CBLAS_TRSM cblas_strsm 79 | #define CBLAS_TRMM cblas_strmm 80 | #define 
LAPACK_GETRF LAPACKE_sgetrf 81 | #define LAPACK_GETRS LAPACKE_sgetrs 82 | #define LAPACK_GETRI LAPACKE_sgetri 83 | #define LAPACK_POTRF LAPACKE_spotrf 84 | #define LAPACK_POTRS LAPACKE_spotrs 85 | #define LAPACK_POTRI LAPACKE_spotri 86 | #define LAPACK_GEQRF LAPACKE_sgeqrf 87 | #define LAPACK_GEQPF LAPACKE_sgeqpf 88 | #define LAPACK_ORGQR LAPACKE_sorgqr 89 | #define LAPACK_ORMQR LAPACKE_sormqr 90 | #define LAPACK_SYEVD LAPACKE_ssyevd 91 | #define LAPACK_GESVD LAPACKE_sgesvd 92 | #define N_DTYPE_64B 16 93 | #define SIMD_LEN SIMD_LEN_S 94 | #define D_EPS FLT_EPSILON 95 | #define ASTER_DTYPE_FLOAT 96 | #endif 97 | 98 | #define QR_RANK 0 // Partial QR stop criteria: maximum rank 99 | #define QR_REL_NRM 1 // Partial QR stop criteria: maximum relative column 2-norm 100 | #define QR_ABS_NRM 2 // Partial QR stop criteria: maximum absolute column 2-norm 101 | 102 | #define ALIGN_SIZE 64 // Memory allocation alignment 103 | #define ALPHA_H2 0.999999 // Admissible coefficient for H2, == 1 here 104 | #define ALPHA_HSS -0.000001 // Admissible coefficient for HSS, == 0 here 105 | 106 | #define BD_NTASK_THREAD 10 // Average number of tasks each thread has in B & D build 107 | 108 | #include "linalg_lib_wrapper.h" 109 | #include "ASTER/include/aster.h" 110 | 111 | #endif 112 | -------------------------------------------------------------------------------- /src/H2Pack_file_IO.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_FILE_IO_H__ 2 | #define __H2PACK_FILE_IO_H__ 3 | 4 | #include "H2Pack_typedef.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | // Store a constructed H2 representation to a set of files 11 | // Input parameters: 12 | // h2pack : H2Pack structure after calling H2P_build() 13 | // meta_json_fname : Metadata JSON file name 14 | // aux_json_fname : Auxiliary JSON file name 15 | // binary_fname : Binary data file name 16 | void H2P_store_to_file( 17 | H2Pack_p h2pack, const char 
*meta_json_fname, 18 | const char *aux_json_fname, const char *binary_fname 19 | ); 20 | 21 | // Load a constructed H2 representation from a set of files 22 | // Input parameters: 23 | // meta_json_fname : Metadata JSON file name 24 | // aux_json_fname : Auxiliary JSON file name, can be NULL 25 | // binary_fname : Binary data file name 26 | // BD_JIT : If H2Pack should use just-in-time matvec mode, 0 or 1 27 | // krnl_param : Pointer to the krnl_eval parameter buffer 28 | // krnl_eval : Pointer to the kernel matrix evaluation function, can be NULL 29 | // krnl_bimv : Pointer to the kernel matrix bi-matvec function, can be NULL 30 | // krnl_bimv_flops : Number of flops required for each bi-matvec operation, for performance statistic only 31 | // Output parameter: 32 | // *h2pack_ : H2Pack structure constructed from given files 33 | // Notes: 34 | // If only meta_json_fname and binary_fname are valid non-empty values, the constructed 35 | // H2Pack matrix can only be used to perform H2P_matvec(). Performing other operations 36 | // may crash the program. 
37 | void H2P_read_from_file( 38 | H2Pack_p *h2pack_, const char *meta_json_fname, const char *aux_json_fname, 39 | const char *binary_fname, const int BD_JIT, void *krnl_param, 40 | kernel_eval_fptr krnl_eval, kernel_bimv_fptr krnl_bimv, const int krnl_bimv_flops 41 | ); 42 | 43 | #ifdef __cplusplus 44 | } 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /src/H2Pack_gen_proxy_point.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_GEN_PROXY_POINT_H__ 2 | #define __H2PACK_GEN_PROXY_POINT_H__ 3 | 4 | #include "H2Pack_typedef.h" 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | void H2P_generate_proxy_point_ID( 11 | const int pt_dim, const int krnl_dim, const DTYPE reltol, const int max_level, const int min_level, 12 | DTYPE max_L, const void *krnl_param, kernel_eval_fptr krnl_eval, H2P_dense_mat_p **pp_ 13 | ); 14 | 15 | // Calculate the enclosing box of a given set of points and adjust it if a proxy point file is provided 16 | // Input parameters: 17 | // pt_dim : Point dimension 18 | // n_point : Number of points 19 | // coord : Size pt_dim-by-n_point, each column is a point coordinate 20 | // fname : Proxy point file name, can be NULL 21 | // Output parameter: 22 | // enbox_ : Box that encloses all points in this node. 23 | // enbox[0 : pt_dim-1] is the corner with the smallest 24 | // x/y/z/... coordinates. enbox[pt_dim : 2*pt_dim-1] are 25 | // the sizes of this box.
26 | void H2P_calc_enclosing_box(const int pt_dim, const int n_point, const DTYPE *coord, const char *fname, DTYPE **enbox_); 27 | 28 | // Write a set of proxy points to a text file 29 | // Input parameters: 30 | // fname : File name 31 | // pt_dim : Point dimension 32 | // reltol : Proxy point selection relative error tolerance 33 | // L3_nlayer : Y box exterior boundary size factor 34 | // minL : Radius of the minimal proxy point set (pp[0]) 35 | // num_pp : Number of proxy point sets 36 | // pp : Proxy point sets. Radius of pp[i] should == 2 * radius of pp[i-1] 37 | void H2P_write_proxy_point_file( 38 | const char *fname, const int pt_dim, const DTYPE reltol, const int L3_nlayer, 39 | const DTYPE minL, const int num_pp, H2P_dense_mat_p *pp 40 | ); 41 | 42 | // Generate proxy points for constructing H2 projection and skeleton matrices using 43 | // ID compress, also try to load proxy points from a file and update this file 44 | // Input parameters: 45 | // h2pack : Initialized H2Pack structure 46 | // krnl_param : Pointer to kernel function parameter array 47 | // krnl_eval : Pointer to kernel matrix evaluation function 48 | // fname : Proxy point file name, if == NULL or that file cannot be found, compute all proxy points 49 | // Output parameter: 50 | // pp_ : Array of proxy points for each level 51 | void H2P_generate_proxy_point_ID_file( 52 | H2Pack_p h2pack, const void *krnl_param, kernel_eval_fptr krnl_eval, 53 | const char *fname, H2P_dense_mat_p **pp_ 54 | ); 55 | 56 | // Generate uniformly distributed proxy points on a box surface for constructing 57 | // H2 projection and skeleton matrices for SOME kernel functions. 58 | // This function is isolated because if the enclosing box for all points is fixed, 59 | // we only need to generate proxy points once and use them repeatedly.
60 | // Input parameters: 61 | // pt_dim : Dimension of point coordinate 62 | // xpt_dim : Dimension of extended point coordinate (for RPY xpt_dim == pt_dim+1, otherwise set xpt_dim == pt_dim) 63 | // min_npt : Minimum number of proxy points on the box surface 64 | // max_level : Maximum level (inclusive) of an H2 tree (root level == 0) 65 | // min_level : Minimum level that needs proxy points 66 | // max_L : The size of the root node's enclosing box 67 | // Output parameter: 68 | // pp_ : Array of proxy points for each level 69 | void H2P_generate_proxy_point_surface( 70 | const int pt_dim, const int xpt_dim, const int min_npt, const int max_level, 71 | const int min_level, DTYPE max_L, H2P_dense_mat_p **pp_ 72 | ); 73 | 74 | #ifdef __cplusplus 75 | } 76 | #endif 77 | 78 | #endif 79 | -------------------------------------------------------------------------------- /src/H2Pack_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_KERNELS_H__ 2 | #define __H2PACK_KERNELS_H__ 3 | 4 | #include "H2Pack_2D_kernels.h" 5 | 6 | #include "H2Pack_3D_kernels.h" 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /src/H2Pack_matmul.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_MATMUL_H__ 2 | #define __H2PACK_MATMUL_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // H2 representation multiplies a dense general matrix 12 | // Input parameters: 13 | // h2pack : H2Pack structure with H2 representation matrices 14 | // layout : CblasRowMajor/CblasColMajor if x & y are stored in row/column-major style 15 | // n_vec : Number of column vectors in mat_x 16 | // mat_x : Size >= h2pack->krnl_mat_size * ldx if layout == CblasRowMajor, 17 | // size >= n_vec * ldx if layout == CblasColMajor, 18 | // input dense matrix, the leading
h2pack->krnl_mat_size-by-n_vec part of 19 | // mat_x will be used 20 | // ldx : Leading dimension of mat_x, must >= n_vec if layout == CblasRowMajor, 21 | // must >= h2pack->krnl_mat_size if layout == CblasColMajor 22 | // ldy : Leading dimension of mat_y, the same requirement as ldx 23 | // Output parameter: 24 | // mat_y : Size is the same as mat_x, output dense matrix, mat_y := A_{H2} * mat_x 25 | void H2P_matmul( 26 | H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 27 | const DTYPE *mat_x, const int ldx, DTYPE *mat_y, const int ldy 28 | ); 29 | 30 | // Permute rows of the multiplicand matrix from the original point ordering to 31 | // the sorted point ordering inside H2Pack (forward), or vice versa (backward) 32 | // for the output matrix. 33 | // These two functions will be called automatically in H2P_matmul(), you 34 | // don't need to manually call them. We just provide the interface here. 35 | // h2pack : H2Pack structure with H2 representation matrices 36 | // layout : CblasRowMajor/CblasColMajor if x & y are stored in row/column-major style 37 | // n_vec : Number of column vectors in mat_x 38 | // mat_x : Size >= h2pack->krnl_mat_size * ldx if layout == CblasRowMajor, 39 | // size >= n_vec * ldx if layout == CblasColMajor, 40 | // dense matrix to be permuted, the leading h2pack->krnl_mat_size-by-n_vec 41 | // part of mat_x will be used 42 | // ldx : Leading dimension of mat_x, must >= n_vec if layout == CblasRowMajor, 43 | // must >= h2pack->krnl_mat_size if layout == CblasColMajor 44 | // ldp : Leading dimension of pmt_mat_x, the same requirement as ldx 45 | // Output parameter: 46 | // pmt_mat_x : Size is the same as mat_x, permuted dense matrix 47 | void H2P_permute_matrix_row_forward( 48 | H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 49 | const DTYPE *mat_x, const int ldx, DTYPE *pmt_mat_x, const int ldp 50 | ); 51 | void H2P_permute_matrix_row_backward( 52 | H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 53
| const DTYPE *mat_x, const int ldx, DTYPE *pmt_mat_x, const int ldp 54 | ); 55 | 56 | #ifdef __cplusplus 57 | } 58 | #endif 59 | 60 | #endif 61 | -------------------------------------------------------------------------------- /src/H2Pack_matmul_periodic.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_MATMUL_PERIODIC_H__ 2 | #define __H2PACK_MATMUL_PERIODIC_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // H2 representation multiplies a dense general matrix, for a periodic system 12 | // Input parameters: 13 | // h2pack : H2Pack structure with H2 representation matrices 14 | // layout : CblasRowMajor/CblasColMajor if x & y are stored in row/column-major style 15 | // n_vec : Number of column vectors in mat_x 16 | // mat_x : Size >= h2pack->krnl_mat_size * ldx, input dense matrix, the leading 17 | // h2pack->krnl_mat_size-by-n_vec part of mat_x will be used 18 | // ldx : Leading dimension of mat_x, should >= n_vec if layout == CblasRowMajor, 19 | // should >= h2pack->krnl_mat_size if layout == CblasColMajor 20 | // ldy : Leading dimension of mat_y, the same requirement as ldx 21 | // Output parameter: 22 | // mat_y : Size is the same as mat_x, output dense matrix, mat_y := A_{H2} * mat_x 23 | void H2P_matmul_periodic( 24 | H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 25 | const DTYPE *mat_x, const int ldx, DTYPE *mat_y, const int ldy 26 | ); 27 | 28 | #ifdef __cplusplus 29 | } 30 | #endif 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /src/H2Pack_matvec.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_MATVEC_H__ 2 | #define __H2PACK_MATVEC_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // H2 representation
multiplies a column vector 12 | // Input parameters: 13 | // h2pack : H2Pack structure with H2 representation matrices 14 | // x : Input dense vector 15 | // Output parameter: 16 | // y : Output dense vector 17 | void H2P_matvec(H2Pack_p h2pack, const DTYPE *x, DTYPE *y); 18 | 19 | // Permute the multiplicand vector from the original point ordering to the 20 | // sorted point ordering inside H2Pack (forward), or vice versa (backward) 21 | // for the output vector. 22 | // These two functions will be called automatically in H2P_matvec(), you 23 | // don't need to manually call them. We just provide the interface here. 24 | // Input parameters: 25 | // h2pack : H2Pack structure with H2 representation matrices 26 | // x : Vector to be permuted 27 | // Output parameter: 28 | // pmt_x : Permuted vector 29 | void H2P_permute_vector_forward (H2Pack_p h2pack, const DTYPE *x, DTYPE *pmt_x); 30 | void H2P_permute_vector_backward(H2Pack_p h2pack, const DTYPE *x, DTYPE *pmt_x); 31 | 32 | #ifdef __cplusplus 33 | } 34 | #endif 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /src/H2Pack_matvec_periodic.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_MATVEC_PERIODIC_H__ 2 | #define __H2PACK_MATVEC_PERIODIC_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // H2 representation multiplies a column vector, for a periodic system 12 | // Input parameters: 13 | // h2pack : H2Pack structure with H2 representation matrices 14 | // x : Input dense vector 15 | // Output parameter: 16 | // y : Output dense vector 17 | void H2P_matvec_periodic(H2Pack_p h2pack, const DTYPE *x, DTYPE *y); 18 | 19 | #ifdef __cplusplus 20 | } 21 | #endif 22 | 23 | #endif 24 | -------------------------------------------------------------------------------- /src/H2Pack_partition.h:
-------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_PARTITION_H__ 2 | #define __H2PACK_PARTITION_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // Hierarchical point partitioning for H2 / HSS construction 12 | // Input parameters: 13 | // h2pack : H2Pack structure initialized using H2P_init() 14 | // n_point : Number of points for the kernel matrix 15 | // coord : Matrix, size h2pack->pt_dim * n_point, each column is a point coordinate 16 | // max_leaf_points : Maximum number of points in a leaf node's box. If <= 0, will use 200 for 17 | // 2D points and 400 for other dimensions 18 | // max_leaf_size : Maximum size of a leaf node's box. If == 0, max_leaf_points 19 | // will be the only restriction. 20 | // Output parameter: 21 | // h2pack : H2Pack structure with point partitioning info 22 | void H2P_partition_points( 23 | H2Pack_p h2pack, const int n_point, const DTYPE *coord, 24 | int max_leaf_points, DTYPE max_leaf_size 25 | ); 26 | 27 | // Calculate reduced (in)admissible pairs for HSS 28 | // Input parameter: 29 | // h2pack : H2Pack structure after calling H2P_partition_points() 30 | // Output parameter: 31 | // h2pack : H2Pack structure with reduced (in)admissible pairs for HSS 32 | void H2P_HSS_calc_adm_inadm_pairs(H2Pack_p h2pack); 33 | 34 | #ifdef __cplusplus 35 | } 36 | #endif 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /src/H2Pack_partition_periodic.h: -------------------------------------------------------------------------------- 1 | #ifndef __H2PACK_PARTITION_PERIODIC_H__ 2 | #define __H2PACK_PARTITION_PERIODIC_H__ 3 | 4 | #include "H2Pack_config.h" 5 | #include "H2Pack_typedef.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | // Hierarchical point partitioning for periodic system H2 construction 12 | // Input parameters: 13 | // h2pack : H2Pack
structure initialized using H2P_init() 14 | // n_point : Number of points for the kernel matrix 15 | // coord : Matrix, size h2pack->pt_dim * n_point, each column is a point coordinate 16 | // max_leaf_points : Maximum number of points in a leaf node's box. If <= 0, will use 200 for 17 | // 2D points and 400 for other dimensions 18 | // max_leaf_size : Maximum size of a leaf node's box. If == 0, max_leaf_points 19 | // will be the only restriction. 20 | // unit_cell : Array, size 2 * h2pack->pt_dim, unit cell of the periodic system, 21 | // == the largest enclosing box for all points 22 | // Output parameter: 23 | // h2pack : H2Pack structure with point partitioning info 24 | void H2P_partition_points_periodic( 25 | H2Pack_p h2pack, const int n_point, const DTYPE *coord, int max_leaf_points, 26 | DTYPE max_leaf_size, DTYPE *unit_cell 27 | ); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /src/ICC-MKL.make: -------------------------------------------------------------------------------- 1 | CC = icc 2 | USE_MKL = 1 3 | USE_OPENBLAS = 0 4 | 5 | include common.make -------------------------------------------------------------------------------- /src/common.make: -------------------------------------------------------------------------------- 1 | LIB_A = libH2Pack.a 2 | LIB_SO = libH2Pack.so 3 | 4 | C_SRCS = $(wildcard *.c) 5 | C_OBJS = $(C_SRCS:.c=.c.o) 6 | 7 | AR = ar rcs 8 | DEFS = 9 | INCS = 10 | CFLAGS = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS) 11 | 12 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1) 13 | CFLAGS += -qopenmp -xHost 14 | endif 15 | 16 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1) 17 | CFLAGS += -fopenmp -march=native -Wno-unused-result -Wno-unused-function 18 | endif 19 | 20 | ifeq ($(strip $(USE_MKL)), 1) 21 | DEFS += -DUSE_MKL 22 | CFLAGS += -mkl 23 | endif 24 | 25 | # If you use OpenBLAS, modify OPENBLAS_INSTALL_DIR here 26 |
OPENBLAS_INSTALL_DIR = ../../OpenBLAS-git/install 27 | ifeq ($(strip $(USE_OPENBLAS)), 1) 28 | DEFS += -DUSE_OPENBLAS 29 | INCS += -I$(OPENBLAS_INSTALL_DIR)/include 30 | endif 31 | 32 | # Delete the default old-fashioned double-suffix rules 33 | .SUFFIXES: 34 | 35 | .SECONDARY: $(C_OBJS) 36 | .PHONY: all install clean 37 | all: install 38 | 39 | install: $(LIB_A) $(LIB_SO) 40 | mkdir -p ../lib 41 | mkdir -p ../include 42 | mkdir -p ../include/ASTER/include 43 | cp -u $(LIB_A) ../lib/$(LIB_A) 44 | cp -u $(LIB_SO) ../lib/$(LIB_SO) 45 | cp -u *.h ../include/ 46 | cp -u ASTER/include/*.h ../include/ASTER/include 47 | 48 | $(LIB_A): $(C_OBJS) 49 | $(AR) $@ $^ 50 | 51 | $(LIB_SO): $(C_OBJS) 52 | $(CC) -shared -o $@ $^ 53 | 54 | %.c.o: %.c 55 | $(CC) $(CFLAGS) -c $< -o $@ 56 | 57 | clean: 58 | rm -f $(C_OBJS) $(LIB_A) $(LIB_SO) 59 | -------------------------------------------------------------------------------- /src/linalg_lib_wrapper.h: -------------------------------------------------------------------------------- 1 | #ifndef __LINALG_LIB_WRAPPER_H__ 2 | #define __LINALG_LIB_WRAPPER_H__ 3 | 4 | // Wrapper for linear algebra library (BLAS, LAPACK) 5 | 6 | #if !defined(USE_MKL) && !defined(USE_OPENBLAS) 7 | #define USE_OPENBLAS 8 | #endif 9 | 10 | #ifdef USE_MKL 11 | #include <mkl.h> 12 | #define BLAS_SET_NUM_THREADS mkl_set_num_threads 13 | #endif 14 | 15 | #ifdef USE_OPENBLAS 16 | #include <cblas.h> 17 | #include <lapacke.h> 18 | #define BLAS_SET_NUM_THREADS openblas_set_num_threads 19 | #endif 20 | 21 | #endif 22 | 23 | --------------------------------------------------------------------------------