├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── examples
    ├── AFN_precond
    │   ├── GCC-OpenBLAS.make
    │   ├── ICC-MKL.make
    │   ├── IE_diag_quad.h
    │   ├── Nys_precond.c
    │   ├── Nys_precond.h
    │   ├── common.make
    │   ├── precond_test_utils.h
    │   ├── test_AFN.c
    │   ├── test_AFN_IE.c
    │   └── test_Nys.c
    ├── GCC-OpenBLAS.make
    ├── ICC-MKL.make
    ├── PCG
    │   ├── pcg.c
    │   └── pcg.h
    ├── SPDHSS-H2
    │   ├── CSRPlus.c
    │   ├── CSRPlus.h
    │   ├── FSAI_precond.c
    │   ├── FSAI_precond.h
    │   ├── GCC-OpenBLAS.make
    │   ├── ICC-MKL.make
    │   ├── LRD_precond.c
    │   ├── LRD_precond.h
    │   ├── block_jacobi_precond.c
    │   ├── block_jacobi_precond.h
    │   ├── common.make
    │   ├── example_SPDHSSH2.c
    │   ├── example_SPDHSSH2_tol.c
    │   ├── example_regularHSS.c
    │   ├── parse_scalar_params.h
    │   ├── pcg_tests.c
    │   ├── pcg_tests.h
    │   ├── point_set
    │   │   ├── 3Dball_80000.csv
    │   │   └── 3Dsphere_80000.csv
    │   ├── test_FSAI.c
    │   └── test_FSAI_IE.c
    ├── common.make
    ├── direct_nbody.h
    ├── example_H2.c
    ├── example_H2_RPY.c
    ├── example_H2_tensor.c
    ├── example_HSS.c
    ├── example_read_H2_file.c
    ├── meta_json_to_txt.py
    └── meta_txt_to_json.py
├── extra
    ├── GCC-OpenBLAS.make
    ├── ICC-MKL.make
    ├── common.make
    ├── debug.h
    ├── direct_nbody.h
    ├── parse_scalar_params.h
    ├── parse_tensor_params.h
    ├── rand_3D_sphere_points.m
    ├── src-obsolete
    │   ├── H2P_build_H2_UJ_proxy_levelup.c
    │   └── H2P_generate_proxy_point_ID.c
    ├── test_H2_accuracy.c
    ├── test_H2_matmul.h
    ├── test_H2_scalar.c
    ├── test_H2_scalar_samplept.c
    ├── test_HSS_scalar.c
    ├── test_ID_compress.c
    ├── test_ID_compress_dim.c
    ├── test_kernel_SIMD.c
    └── test_scalar_matmul.c
├── pyh2pack
    ├── example.py
    ├── example_hss.py
    ├── example_samplept.py
    ├── pyh2pack.c
    ├── pyh2pack.h
    ├── pyh2pack_kernel.h
    ├── readme.md
    ├── setup.py
    └── setup_icc.py
└── src
    ├── AFN_precond.c
    ├── AFN_precond.h
    ├── DAG_task_queue.c
    ├── DAG_task_queue.h
    ├── GCC-OpenBLAS.make
    ├── H2Pack.h
    ├── H2Pack_2D_kernels.h
    ├── H2Pack_3D_kernels.h
    ├── H2Pack_HSS_ULV.c
    ├── H2Pack_HSS_ULV.h
    ├── H2Pack_ID_compress.c
    ├── H2Pack_ID_compress.h
    ├── H2Pack_SPDHSS_H2.c
    ├── H2Pack_SPDHSS_H2.h
    ├── H2Pack_aux_structs.c
    ├── H2Pack_aux_structs.h
    ├── H2Pack_build.c
    ├── H2Pack_build.h
    ├── H2Pack_build_periodic.c
    ├── H2Pack_build_periodic.h
    ├── H2Pack_build_with_sample_point.c
    ├── H2Pack_build_with_sample_point.h
    ├── H2Pack_config.h
    ├── H2Pack_file_IO.c
    ├── H2Pack_file_IO.h
    ├── H2Pack_gen_proxy_point.c
    ├── H2Pack_gen_proxy_point.h
    ├── H2Pack_kernels.h
    ├── H2Pack_matmul.c
    ├── H2Pack_matmul.h
    ├── H2Pack_matmul_periodic.c
    ├── H2Pack_matmul_periodic.h
    ├── H2Pack_matvec.c
    ├── H2Pack_matvec.h
    ├── H2Pack_matvec_periodic.c
    ├── H2Pack_matvec_periodic.h
    ├── H2Pack_partition.c
    ├── H2Pack_partition.h
    ├── H2Pack_partition_periodic.c
    ├── H2Pack_partition_periodic.h
    ├── H2Pack_typedef.c
    ├── H2Pack_typedef.h
    ├── H2Pack_utils.c
    ├── H2Pack_utils.h
    ├── ICC-MKL.make
    ├── common.make
    ├── linalg_lib_wrapper.h
    ├── utils.c
    └── utils.h


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Prerequisites
 2 | *.d
 3 | 
 4 | # Object files
 5 | *.o
 6 | *.ko
 7 | *.obj
 8 | *.elf
 9 | 
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 | 
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 | 
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 | 
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 | 
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 | 
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 | 
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 | 
54 | .vscode/
55 | lib/
56 | include/
57 | install/
58 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/ASTER"]
2 | 	path = src/ASTER
3 | 	url = https://github.com/huanghua1994/ASTER
4 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Hua Huang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/examples/AFN_precond/GCC-OpenBLAS.make:
--------------------------------------------------------------------------------
1 | CC           = gcc
2 | USE_MKL      = 0
3 | USE_OPENBLAS = 1
4 | # common.make reads CC, USE_MKL and USE_OPENBLAS to pick compiler flags and the BLAS library
5 | include common.make


--------------------------------------------------------------------------------
/examples/AFN_precond/ICC-MKL.make:
--------------------------------------------------------------------------------
1 | CC           = icc
2 | USE_MKL      = 1
3 | USE_OPENBLAS = 0
4 | # common.make reads CC, USE_MKL and USE_OPENBLAS to pick compiler flags and the BLAS library
5 | include common.make


--------------------------------------------------------------------------------
/examples/AFN_precond/Nys_precond.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <assert.h>
 5 | #include <omp.h>
 6 | 
 7 | #include "Nys_precond.h"
 8 | #include "H2Pack_utils.h"
 9 | 
10 | // In AFN_precond.c
11 | void Nys_precond_build_(
12 |     const DTYPE mu, const int n1, const int n2, DTYPE *K11, 
13 |     DTYPE *K12, DTYPE **nys_M_, DTYPE **nys_U_
14 | );
15 | void Nys_precond_apply_(
16 |     const int n1, const int n, const DTYPE *nys_M, const DTYPE *nys_U, 
17 |     const DTYPE *x, DTYPE *y, DTYPE *t
18 | );
19 | 
20 | // Build a randomized Nystrom preconditioner for a kernel matrix; requires 0 < nys_k <= npt
21 | void Nys_precond_build(
22 |     kernel_eval_fptr krnl_eval, void *krnl_param, const int npt, const int pt_dim, 
23 |     const DTYPE *coord, const DTYPE mu, const int nys_k, Nys_precond_p *Nys_precond_
24 | )
25 | {
26 |     ASSERT_PRINTF(nys_k > 0 && nys_k <= npt, "Nystrom rank nys_k must satisfy 0 < nys_k <= npt\n");
27 |     Nys_precond_p Nys_precond = (Nys_precond_p) calloc(1, sizeof(Nys_precond_s));
28 | 
29 |     // 1. Randomly select nys_k points from npt points; perm lists them first, then all others
30 |     int n = npt, n1 = nys_k, n2 = npt - nys_k;
31 |     int *perm = (int *) malloc(sizeof(int) * n);
32 |     uint8_t *flag = (uint8_t *) malloc(sizeof(uint8_t) * n);
33 |     DTYPE *coord_perm = (DTYPE *) malloc(sizeof(DTYPE) * npt * pt_dim);
34 |     ASSERT_PRINTF(Nys_precond != NULL && perm != NULL && flag != NULL && coord_perm != NULL,
35 |         "Failed to allocate Nystrom preconditioner work buffers\n");
36 |     H2P_rand_sample(npt, nys_k, perm, flag);
37 |     memset(flag, 0, sizeof(uint8_t) * n);
38 |     for (int i = 0; i < n1; i++) flag[perm[i]] = 1;
39 |     int idx = n1;
40 |     for (int i = 0; i < n; i++)
41 |         if (flag[i] == 0) perm[idx++] = i;
42 |     H2P_gather_matrix_columns(coord, npt, coord_perm, npt, pt_dim, perm, npt);
43 | 
44 |     // 2. Build K11 and K12 blocks
45 |     DTYPE *coord_n1 = coord_perm;
46 |     DTYPE *coord_n2 = coord_perm + n1;
47 |     DTYPE *K11 = (DTYPE *) malloc(sizeof(DTYPE) * n1 * n1);
48 |     DTYPE *K12 = (DTYPE *) malloc(sizeof(DTYPE) * n1 * n2);
49 |     int n_thread = omp_get_max_threads();
50 |     ASSERT_PRINTF(K11 != NULL && K12 != NULL, "Failed to allocate Nystrom preconditioner K11/K12 buffers\n");
51 |     H2P_eval_kernel_matrix_OMP(
52 |         krnl_eval, krnl_param, 
53 |         coord_n1, n, n1, coord_n1, n, n1, 
54 |         K11, n1, n_thread
55 |     );
56 |     H2P_eval_kernel_matrix_OMP(
57 |         krnl_eval, krnl_param, 
58 |         coord_n1, n, n1, coord_n2, n, n2, 
59 |         K12, n2, n_thread
60 |     );
61 |     free(coord_perm);
62 |     free(flag);
63 | 
64 |     // 3. Build U and M matrices
65 |     Nys_precond->n    = n;
66 |     Nys_precond->n1   = n1;
67 |     Nys_precond->perm = perm;
68 |     Nys_precond->t    = (DTYPE*) malloc(sizeof(DTYPE) * n);
69 |     Nys_precond->px   = (DTYPE*) malloc(sizeof(DTYPE) * n);
70 |     Nys_precond->py   = (DTYPE*) malloc(sizeof(DTYPE) * n);
71 |     ASSERT_PRINTF(Nys_precond->t && Nys_precond->px && Nys_precond->py, "Failed to allocate Nystrom apply buffers\n");
72 |     Nys_precond_build_(mu, n1, n2, K11, K12, &Nys_precond->M, &Nys_precond->U);
73 |     *Nys_precond_ = Nys_precond;
74 | }
75 | 
76 | // Apply a Nystrom preconditioner to a vector: gather x through perm, apply, scatter into y
77 | void Nys_precond_apply(Nys_precond_p Nys_precond, const DTYPE *x, DTYPE *y)
78 | {
79 |     const int  n_pt   = Nys_precond->n;
80 |     const int *p_idx  = Nys_precond->perm;
81 |     DTYPE     *x_perm = Nys_precond->px;
82 |     DTYPE     *y_perm = Nys_precond->py;
83 |     for (int k = 0; k < n_pt; k++) x_perm[k] = x[p_idx[k]];
84 |     Nys_precond_apply_(Nys_precond->n1, n_pt, Nys_precond->M, Nys_precond->U, x_perm, y_perm, Nys_precond->t);
85 |     for (int k = 0; k < n_pt; k++) y[p_idx[k]] = y_perm[k];
86 | }
87 | 
88 | // Destroy an initialized Nys_precond struct: free every owned buffer and reset the handle
89 | void Nys_precond_destroy(Nys_precond_p *Nys_precond_)
90 | {
91 |     if (Nys_precond_ == NULL || *Nys_precond_ == NULL) return;
92 |     Nys_precond_p p = *Nys_precond_;
93 |     free(p->perm);
94 |     free(p->M);   free(p->U);
95 |     free(p->t);   free(p->px);   free(p->py);  // px/py are allocated in Nys_precond_build() too
96 |     free(p);
97 |     *Nys_precond_ = NULL;  // a second destroy call on the same handle becomes a no-op
98 | }
99 | 


--------------------------------------------------------------------------------
/examples/AFN_precond/Nys_precond.h:
--------------------------------------------------------------------------------
 1 | #ifndef __NYS_PRECOND_H__
 2 | #define __NYS_PRECOND_H__
 3 | 
 4 | #include "H2Pack.h"
 5 | 
 6 | struct Nys_precond
 7 | {
 8 |     int n;          // Size of the kernel matrix, == number of points
 9 |     int n1;         // Size of K11 block (== Nystrom approximation rank)
10 |     int *perm;      // Permutation array, size n
11 |     DTYPE *t;       // Size n, intermediate vector used in Nys_precond_apply()
12 |     DTYPE *px, *py; // Size n, permuted x and y used in Nys_precond_apply()
13 |     DTYPE *U;       // Size n * n1, row major, Nystrom basis
14 |     DTYPE *M;       // Size n1, Nystrom eigenvalues + diagonal shift, then scaled
15 | };
16 | typedef struct Nys_precond  Nys_precond_s;
17 | typedef struct Nys_precond* Nys_precond_p;
18 | 
19 | #ifdef __cplusplus
20 | extern "C" {
21 | #endif
22 | 
23 | // Build a randomized Nystrom preconditioner for a kernel matrix
24 | // Input parameters:
25 | //   krnl_eval  : Pointer to kernel matrix evaluation function
26 | //   krnl_param : Pointer to kernel function parameter array
27 | //   npt        : Number of points in coord
28 | //   pt_dim     : Dimension of each point
29 | //   coord      : Matrix, size pt_dim-by-npt, coordinates of points
30 | //   mu         : Scalar, diagonal shift of the kernel matrix
31 | //   nys_k      : Nystrom approximation rank
32 | // Output parameter:
33 | //   Nys_precond_ : Pointer to an initialized Nys_precond struct
34 | void Nys_precond_build(
35 |     kernel_eval_fptr krnl_eval, void *krnl_param, const int npt, const int pt_dim, 
36 |     const DTYPE *coord, const DTYPE mu, const int nys_k, Nys_precond_p *Nys_precond_
37 | );
38 | 
39 | // Apply a Nystrom preconditioner to a vector
40 | void Nys_precond_apply(Nys_precond_p Nys_precond, const DTYPE *x, DTYPE *y);
41 | 
42 | // Destroy an initialized Nys_precond struct
43 | void Nys_precond_destroy(Nys_precond_p *Nys_precond_);
44 | 
45 | #ifdef __cplusplus
46 | }
47 | #endif
48 | 
49 | #endif
50 | 


--------------------------------------------------------------------------------
/examples/AFN_precond/common.make:
--------------------------------------------------------------------------------
 1 | H2PACK_DIR = ../..
 2 | # CFLAGS is kept recursively expanded (=) so the DEFS/INCS additions made below take effect
 3 | DEFS    = 
 4 | INCS    = -I$(H2PACK_DIR)/include
 5 | CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
 6 | LDFLAGS = -g -O3 -fopenmp
 7 | LIBS    = 
 8 | # Compiler-specific flags, selected by matching the output of "$(CC) --version"
 9 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1)
10 | CFLAGS  += -fopenmp -xHost
11 | endif
12 | 
13 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1)
14 | CFLAGS  += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
15 | LIBS    += -lgfortran -lm
16 | endif
17 | 
18 | ifeq ($(strip $(USE_MKL)), 1)
19 | DEFS    += -DUSE_MKL
20 | CFLAGS  += -mkl=parallel
21 | LIBS    += -mkl
22 | endif
23 | 
24 | ifeq ($(strip $(USE_OPENBLAS)), 1)
25 | OPENBLAS_INSTALL_DIR = ../../../OpenBLAS-git/install
26 | DEFS    += -DUSE_OPENBLAS
27 | INCS    += -I$(OPENBLAS_INSTALL_DIR)/include
28 | LIBS    += -L$(OPENBLAS_INSTALL_DIR)/lib -lopenblas
29 | endif
30 | 
31 | C_SRCS  := $(wildcard *.c)
32 | C_OBJS  := $(C_SRCS:.c=.c.o)
33 | EXES    = test_AFN.exe test_Nys.exe test_AFN_IE.exe
34 | SHARED_OBJS = Nys_precond.c.o ../PCG/pcg.c.o 
35 | 
36 | # Delete the default old-fashioned double-suffix rules
37 | .SUFFIXES:
38 | .PHONY: all clean
39 | .SECONDARY: $(C_OBJS) $(SHARED_OBJS)
40 | 
41 | all: $(EXES)
42 | 
43 | %.c.o: %.c
44 | 	$(CC) $(CFLAGS) -c $< -o $@
45 | 
46 | %.exe: %.c.o $(SHARED_OBJS) $(H2PACK_DIR)/lib/libH2Pack.a
47 | 	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
48 | 
49 | clean:
50 | 	rm -f $(EXES) $(C_OBJS) $(SHARED_OBJS)
51 | 


--------------------------------------------------------------------------------
/examples/AFN_precond/precond_test_utils.h:
--------------------------------------------------------------------------------
  1 | #ifndef __PRECOND_TEST_UTILS_H__
  2 | #define __PRECOND_TEST_UTILS_H__
  3 | 
  4 | #include <stdio.h>
  5 | #include "H2Pack_typedef.h"
  6 | #include "H2Pack_kernels.h"
  7 | #include "H2Pack.h"
  8 | #include "../PCG/pcg.h"
  9 | 
 10 | static DTYPE shift_ = 0.0;
 11 | static int n_ = 0;
 12 | // Compute x = H2P_matvec(h2pack_, b) + shift_ * b, i.e. the H2 matvec plus a diagonal shift
 13 | static void H2Pack_matvec_diagshift(const void *h2pack_, const DTYPE *b, DTYPE *x)
 14 | {
 15 |     H2P_matvec((H2Pack_p) h2pack_, b, x);
 16 |     const int mat_size = ((H2Pack_p) h2pack_)->krnl_mat_size;
 17 |     #pragma omp simd
 18 |     for (int row = 0; row < mat_size; row++) x[row] += shift_ * b[row];
 19 | }
 20 | // Pick the eval / bimv functions and bimv flop count for kernel kid; pt_dim == 3 selects 3D, else 2D
 21 | static void select_kernel(
 22 |     const int kid, const int pt_dim, const DTYPE kp, const DTYPE mu, const int npt, 
 23 |     kernel_eval_fptr *krnl_eval_, kernel_bimv_fptr *krnl_bimv_, int *krnl_bimv_flops_
 24 | )
 25 | {
 26 |     shift_ = mu;   // diagonal shift later read by H2Pack_matvec_diagshift()
 27 |     n_ = npt;
 28 |     kernel_eval_fptr krnl_eval = NULL;
 29 |     kernel_bimv_fptr krnl_bimv = NULL;
 30 |     int krnl_bimv_flops = 0;
 31 |     switch (kid)   // a kid outside 1..4 leaves the outputs as NULL / NULL / 0
 32 |     {
 33 |         case 1:
 34 |         {
 35 |             if (pt_dim == 3)
 36 |             {
 37 |                 krnl_eval = Gaussian_3D_eval_intrin_t;
 38 |                 krnl_bimv = Gaussian_3D_krnl_bimv_intrin_t;
 39 |                 krnl_bimv_flops = Gaussian_3D_krnl_bimv_flop;
 40 |             } else {
 41 |                 krnl_eval = Gaussian_2D_eval_intrin_t;
 42 |                 krnl_bimv = Gaussian_2D_krnl_bimv_intrin_t;
 43 |                 krnl_bimv_flops = Gaussian_2D_krnl_bimv_flop;
 44 |             }
 45 |             printf("Test kernel: Gaussian    k(x, y) = exp(-l * |x-y|^2), l = %.4f\n", kp);
 46 |             break;
 47 |         }
 48 |         case 2:
 49 |         {
 50 |             if (pt_dim == 3)
 51 |             {
 52 |                 krnl_eval = Expon_3D_eval_intrin_t;
 53 |                 krnl_bimv = Expon_3D_krnl_bimv_intrin_t;
 54 |                 krnl_bimv_flops = Expon_3D_krnl_bimv_flop;
 55 |             } else {
 56 |                 krnl_eval = Expon_2D_eval_intrin_t;
 57 |                 krnl_bimv = Expon_2D_krnl_bimv_intrin_t;
 58 |                 krnl_bimv_flops = Expon_2D_krnl_bimv_flop;
 59 |             }
 60 |             printf("Test kernel: Exponential k(x, y) = exp(-l * |x-y|), l = %.4f\n", kp);
 61 |             break;
 62 |         }
 63 |         case 3:
 64 |         {
 65 |             if (pt_dim == 3)
 66 |             {
 67 |                 krnl_eval = Matern32_3D_eval_intrin_t;
 68 |                 krnl_bimv = Matern32_3D_krnl_bimv_intrin_t;
 69 |                 krnl_bimv_flops = Matern32_3D_krnl_bimv_flop;
 70 |             } else {
 71 |                 krnl_eval = Matern32_2D_eval_intrin_t;
 72 |                 krnl_bimv = Matern32_2D_krnl_bimv_intrin_t;
 73 |                 krnl_bimv_flops = Matern32_2D_krnl_bimv_flop;
 74 |             }
 75 |             printf("Test kernel: 3/2 Matern  k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|, l = %.4f\n", kp);
 76 |             break;
 77 |         }
 78 |         case 4:
 79 |         {
 80 |             if (pt_dim == 3)
 81 |             {
 82 |                 krnl_eval = Matern52_3D_eval_intrin_t;
 83 |                 krnl_bimv = Matern52_3D_krnl_bimv_intrin_t;
 84 |                 krnl_bimv_flops = Matern52_3D_krnl_bimv_flop;  // flop count must match the 5/2 kernel
 85 |             } else {
 86 |                 krnl_eval = Matern52_2D_eval_intrin_t;
 87 |                 krnl_bimv = Matern52_2D_krnl_bimv_intrin_t;
 88 |                 krnl_bimv_flops = Matern52_2D_krnl_bimv_flop;  // flop count must match the 5/2 kernel
 89 |             }
 90 |             printf("Test kernel: 5/2 Matern  k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|, l = %.4f\n", kp);
 91 |             break;
 92 |         }
 93 |     } 
 94 |     *krnl_eval_ = krnl_eval;
 95 |     *krnl_bimv_ = krnl_bimv;
 96 |     *krnl_bimv_flops_ = krnl_bimv_flops;
 97 | }
 98 | // Build the H2 representation of the kernel matrix on the given points and print the timings
 99 | static void H2mat_build(
100 |     const int npt, const int pt_dim, DTYPE *coord, DTYPE reltol, kernel_eval_fptr krnl_eval, 
101 |     kernel_bimv_fptr krnl_bimv, int krnl_bimv_flops, void *krnl_param, H2Pack_p *h2mat_
102 | )
103 | {
104 |     int krnl_dim = 1, BD_JIT = 1;
105 |     H2Pack_p h2 = NULL;
106 |     H2P_dense_mat_p *proxy_pts = NULL;
107 |     double t_begin, t_end;
108 |     printf("Building H2 representation with reltol = %.4e for kernel matrix...\n", reltol);
109 |     H2P_init(&h2, pt_dim, krnl_dim, QR_REL_NRM, &reltol);
110 |     H2P_calc_enclosing_box(pt_dim, npt, coord, NULL, &h2->root_enbox);
111 |     H2P_partition_points(h2, npt, coord, 0, 0);
112 |     t_begin = get_wtime_sec();
113 |     H2P_generate_proxy_point_ID_file(h2, krnl_param, krnl_eval, NULL, &proxy_pts);
114 |     t_end = get_wtime_sec();
115 |     printf("H2Pack proxy point selection time = %.3f s\n", t_end - t_begin);
116 |     t_begin = get_wtime_sec();
117 |     H2P_build(h2, proxy_pts, BD_JIT, krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops);
118 |     t_end = get_wtime_sec();
119 |     printf("H2Pack build time = %.3f s\n", t_end - t_begin);
120 |     H2P_print_statistic(h2);
121 |     H2P_dense_mat_destroy(proxy_pts);
122 |     printf("\n");
123 |     *h2mat_ = h2;
124 | }
125 | // Run PCG on A * x = b from a zero initial guess, with a fixed-seed random right-hand side b
126 | static void test_PCG(
127 |     matvec_fptr Ax, void *Ax_param, matvec_fptr invMx, void *invMx_param,
128 |     const int n, const int max_iter, const DTYPE CG_reltol
129 | )
130 | {
131 |     const int verbose = 1;
132 |     const size_t vec_bytes = sizeof(DTYPE) * n;
133 |     DTYPE *rhs = malloc(vec_bytes);
134 |     DTYPE *sol = malloc(vec_bytes);
135 |     int   pcg_flag, pcg_iter;
136 |     DTYPE pcg_relres;
137 |     srand(126);  // Fixed seed, kept to match Tianshi's code
138 |     for (int k = 0; k < n; k++) rhs[k] = (rand() / (DTYPE) RAND_MAX) - 0.5;
139 |     for (int k = 0; k < n; k++) sol[k] = 0.0;
140 |     // The NULL argument fills pcg()'s res_vec parameter
141 |     pcg(
142 |         n, CG_reltol, max_iter, 
143 |         Ax, Ax_param, rhs, invMx, invMx_param, sol,
144 |         &pcg_flag, &pcg_relres, &pcg_iter, NULL, verbose
145 |     );
146 |     free(rhs);
147 |     free(sol);
148 | }
149 | 
150 | #endif
151 | 


--------------------------------------------------------------------------------
/examples/AFN_precond/test_AFN.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "precond_test_utils.h"
 11 | #include "AFN_precond.h"
 12 | #include "../PCG/pcg.h"
 13 | // Test driver: solve (K(X, X) + mu * I) * x = b with PCG using the AFN preconditioner
 14 | int main(int argc, char **argv)
 15 | {
 16 |     // Parse command line arguments
 17 |     int kid, npt, pt_dim, max_k, ss_npt, fsai_npt, fast_knn;
 18 |     DTYPE mu, kp, *coord = NULL;
 19 |     void *krnl_param = &kp;
 20 |     kernel_eval_fptr krnl_eval = NULL;
 21 |     kernel_bimv_fptr krnl_bimv = NULL;
 22 |     int krnl_bimv_flops = 0;
 23 |     if (argc < 10)
 24 |     {
 25 |         printf("Usage: %s kid kp mu npt pt_dim max_k ss_npt fsai_npt fast_knn coord_bin\n", argv[0]);
 26 |         printf("  - kid       [int]    : Kernel function ID\n");
 27 |         printf("                         1 - Gaussian    k(x, y) = exp(-l * |x-y|^2)\n");
 28 |         printf("                         2 - Exponential k(x, y) = exp(-l * |x-y|)\n");
 29 |         printf("                         3 - 3/2 Matern  k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|\n");
 30 |         printf("                         4 - 5/2 Matern  k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|\n");
 31 |         printf("  - kp        [double] : Kernel function parameter (l)\n");
 32 |         printf("  - mu        [double] : Kernel matrix diagonal shift\n");
 33 |         printf("  - npt       [int]    : Number of points\n");
 34 |         printf("  - pt_dim    [int]    : Point dimension\n");
 35 |         printf("  - max_k     [int]    : Maximum global low-rank approximation rank\n");
 36 |         printf("  - ss_npt    [int]    : Number of points in the sample set\n");
 37 |         printf("  - fsai_npt  [int]    : Maximum number of nonzeros in each row of the AFN FSAI matrix\n");
 38 |         printf("  - fast_knn  [0 or 1] : If AFN FSAI should use fast approximated KNN instead of exact KNN\n");
 39 |         printf("  - coord_bin [str]    : (Optional) Binary file containing the coordinates, size pt_dim * npt,\n");
 40 |         printf("                         row major, each column is a point coordinate\n");
 41 |         return 255;
 42 |     } 
 43 |     kid      = atoi(argv[1]);
 44 |     kp       = atof(argv[2]);
 45 |     mu       = atof(argv[3]);
 46 |     npt      = atoi(argv[4]);
 47 |     pt_dim   = atoi(argv[5]);
 48 |     max_k    = atoi(argv[6]);
 49 |     ss_npt   = atoi(argv[7]);
 50 |     fsai_npt = atoi(argv[8]);
 51 |     fast_knn = atoi(argv[9]);
 52 |     if (kid < 1 || kid > 4) kid = 1;
 53 |     if (pt_dim < 2 || pt_dim > 3) pt_dim = 3;
 54 |     coord    = (DTYPE*) malloc(sizeof(DTYPE) * npt * pt_dim);  // sized with the clamped pt_dim
 55 |     if (argc >= 11) {
 56 |         FILE *inf = fopen(argv[10], "rb");
 57 |         if (inf == NULL) { printf("Cannot open coordinate file %s\n", argv[10]); free(coord); return 255; }
 58 |         fread(coord, sizeof(DTYPE), npt * pt_dim, inf);
 59 |         fclose(inf);
 60 |     } else {
 61 |         srand(814);  // Match with Tianshi's code
 62 |         DTYPE scale = DPOW((DTYPE) npt, 1.0 / (DTYPE) pt_dim);
 63 |         for (int i = 0; i < npt * pt_dim; i++) coord[i] = scale * (rand() / (DTYPE)(RAND_MAX));
 64 |     }
 65 |     select_kernel(kid, pt_dim, kp, mu, npt, &krnl_eval, &krnl_bimv, &krnl_bimv_flops);
 66 |     printf("Point set: %d points in %d-D\n", npt, pt_dim);
 67 |     printf("Linear system to solve: (K(X, X) + %.4f * I) * x = b\n", mu);
 68 |     printf("\nAFN preconditioner parameters:\n");
 69 |     printf("- Maximum Nystrom approximation rank     = %d\n", max_k);
 70 |     printf("- Maximum Rank estimation sampled points = %d\n", ss_npt);
 71 |     printf("- Maximum FSAI matrix nonzeros per row   = %d\n", fsai_npt);
 72 |     printf("- Fast KNN for FSAI sparsity pattern     = %s\n", fast_knn ? "Yes" : "No");
 73 |     printf("\n");
 74 |     
 75 |     // Build H2 matrix
 76 |     double st, et;
 77 |     H2Pack_p h2mat = NULL;
 78 |     DTYPE h2_reltol = 1e-8;
 79 |     H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat);
 80 | 
 81 |     // Build AFN preconditioner
 82 |     printf("Building AFN preconditioner...\n");
 83 |     st = get_wtime_sec();
 84 |     AFN_precond_p AFN_precond = NULL;
 85 |     void *h2mat_ = (fast_knn) ? (void *) h2mat : NULL;
 86 |     AFN_precond_build(krnl_eval, krnl_param, npt, pt_dim, coord, mu, max_k, ss_npt, fsai_npt, h2mat_, &AFN_precond);
 87 |     et = get_wtime_sec();
 88 |     printf("AFN_precond build time = %.3lf s\n", et - st);
 89 |     printf("AFN estimated kernel matrix rank = %d, ", AFN_precond->est_rank);
 90 |     printf("will use %s\n", (AFN_precond->est_rank >= max_k) ? "AFN" : "Nystrom");
 91 | 
 92 |     // PCG test
 93 |     DTYPE CG_reltol = 1e-4;
 94 |     int max_iter = 400;
 95 |     test_PCG(
 96 |         H2Pack_matvec_diagshift, (void *) h2mat, 
 97 |         (matvec_fptr) AFN_precond_apply, (void *) AFN_precond, 
 98 |         npt, max_iter, CG_reltol
 99 |     );
100 | 
101 |     // Print AFN preconditioner statistics and clean up
102 |     AFN_precond_print_stat(AFN_precond);
103 |     printf("\n");
104 |     free(coord);
105 |     H2P_destroy(&h2mat);
106 |     AFN_precond_destroy(&AFN_precond);
107 |     return 0;
108 | }
109 | 


--------------------------------------------------------------------------------
/examples/AFN_precond/test_Nys.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <math.h>
 5 | #include <assert.h>
 6 | #include <time.h>
 7 | #include <omp.h>
 8 | 
 9 | #include "Nys_precond.h"
10 | #include "precond_test_utils.h"
11 | #include "../PCG/pcg.h"
12 | // Test driver: solve (K(X, X) + mu * I) * x = b with PCG using the Nystrom preconditioner
13 | int main(int argc, char **argv)
14 | {
15 |     // Parse command line arguments
16 |     int kid, npt, pt_dim, nys_k;
17 |     DTYPE mu, kp, *coord = NULL;
18 |     void *krnl_param = &kp;
19 |     kernel_eval_fptr krnl_eval = NULL;
20 |     kernel_bimv_fptr krnl_bimv = NULL;
21 |     int krnl_bimv_flops = 0;
22 |     if (argc < 7)
23 |     {
24 |         printf("Usage: %s kid kp mu npt pt_dim nys_k coord_bin\n", argv[0]);
25 |         printf("  - kid       [int]    : Kernel function ID\n");
26 |         printf("                         1 - Gaussian    k(x, y) = exp(-l * |x-y|^2)\n");
27 |         printf("                         2 - Exponential k(x, y) = exp(-l * |x-y|)\n");
28 |         printf("                         3 - 3/2 Matern  k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|\n");
29 |         printf("                         4 - 5/2 Matern  k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|\n");
30 |         printf("  - kp        [double] : Kernel function parameter (l)\n");
31 |         printf("  - mu        [double] : Kernel matrix diagonal shift\n");
32 |         printf("  - npt       [int]    : Number of points\n");
33 |         printf("  - pt_dim    [int]    : Point dimension\n");
34 |         printf("  - nys_k     [int]    : Randomized Nystrom approximation rank\n");
35 |         printf("  - coord_bin [str]    : (Optional) Binary file containing the coordinates, size pt_dim * npt,\n");
36 |         printf("                         row major, each column is a point coordinate\n");
37 |         return 255;
38 |     } 
39 |     kid      = atoi(argv[1]);
40 |     kp       = atof(argv[2]);
41 |     mu       = atof(argv[3]);
42 |     npt      = atoi(argv[4]);
43 |     pt_dim   = atoi(argv[5]);
44 |     nys_k    = atoi(argv[6]);
45 |     if (kid < 1 || kid > 4) kid = 1;
46 |     if (pt_dim < 2 || pt_dim > 3) pt_dim = 3;
47 |     coord    = (DTYPE*) malloc(sizeof(DTYPE) * npt * pt_dim);  // sized with the clamped pt_dim
48 |     if (argc >= 8) {
49 |         FILE *inf = fopen(argv[7], "rb");
50 |         if (inf == NULL) { printf("Cannot open coordinate file %s\n", argv[7]); free(coord); return 255; }
51 |         fread(coord, sizeof(DTYPE), npt * pt_dim, inf);
52 |         fclose(inf);
53 |     } else {
54 |         srand(814);  // Match with Tianshi's code
55 |         DTYPE scale = DPOW((DTYPE) npt, 1.0 / (DTYPE) pt_dim);
56 |         for (int i = 0; i < npt * pt_dim; i++) coord[i] = scale * (rand() / (DTYPE)(RAND_MAX));
57 |     }
58 |     select_kernel(kid, pt_dim, kp, mu, npt, &krnl_eval, &krnl_bimv, &krnl_bimv_flops);
59 |     printf("Point set: %d points in %d-D\n", npt, pt_dim);
60 |     printf("Linear system to solve: (K(X, X) + %.4f * I) * x = b\n", mu);
61 |     printf("Nystrom approximation rank: %d\n", nys_k);
62 | 
63 |     // Build H2 matrix
64 |     double st, et;
65 |     H2Pack_p h2mat = NULL;
66 |     DTYPE h2_reltol = 1e-8;
67 |     H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat);
68 | 
69 |     // Build Randomized Nystrom preconditioner
70 |     printf("Building randomize Nystrom preconditioner...\n");
71 |     st = get_wtime_sec();
72 |     Nys_precond_p Nys_precond = NULL;
73 |     Nys_precond_build(krnl_eval, krnl_param, npt, pt_dim, coord, mu, nys_k, &Nys_precond);
74 |     et = get_wtime_sec();
75 |     printf("Nys_precond build time = %.3lf s\n", et - st);
76 | 
77 |     // PCG test
78 |     DTYPE CG_reltol = 1e-4;
79 |     int max_iter = 400;
80 |     test_PCG(
81 |         H2Pack_matvec_diagshift, (void *) h2mat, 
82 |         (matvec_fptr) Nys_precond_apply, (void *) Nys_precond, 
83 |         npt, max_iter, CG_reltol
84 |     );
85 | 
86 |     // Clean up
87 |     printf("\n");
88 |     free(coord);
89 |     H2P_destroy(&h2mat);
90 |     Nys_precond_destroy(&Nys_precond);
91 |     return 0;
92 | }


--------------------------------------------------------------------------------
/examples/GCC-OpenBLAS.make:
--------------------------------------------------------------------------------
 1 | CC           = gcc
 2 | USE_MKL      = 0
 3 | USE_OPENBLAS = 1
 4 | 
 5 | include common.make
 6 | 
 7 | # GCC 10 needs SVE to be enabled explicitly; -march=native alone is not enough.
 8 | # On A64FX the SVE vector length is 512 bits; other SVE-capable processors may use a different value.
 9 | USE_AARCH64_SVE = 0
10 | SVE_VECTOR_BITS = 512
11 | ifeq ($(strip $(USE_AARCH64_SVE)), 1)
12 | CFLAGS := $(subst -march=native, -march=armv8.2-a+sve -msve-vector-bits=$(SVE_VECTOR_BITS), $(CFLAGS))
13 | endif


--------------------------------------------------------------------------------
/examples/ICC-MKL.make:
--------------------------------------------------------------------------------
1 | CC           = icc
2 | USE_MKL      = 1
3 | USE_OPENBLAS = 0
4 | # Compiler / BLAS selection above must be set before the shared rules are pulled in
5 | include common.make


--------------------------------------------------------------------------------
/examples/PCG/pcg.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | #include <math.h>
  6 | #include <omp.h>
  7 | 
  8 | #include "pcg.h"
  9 | 
 10 | // Left preconditioned Conjugate Gradient for solving A * x = b
 11 | // x holds the initial guess on entry and the computed solution on exit. The
 12 | // iteration stops once ||r||_2 <= tol * ||b||_2 or after max_iter iterations.
 13 | // Outputs:
 14 | //   *flag_   : 0 if the stopping criterion was met, 1 otherwise
 15 | //   *relres_ : Recurrence residual ||r||_2 / ||b||_2 at the last step; when
 16 | //              ||b||_2 == 0 the absolute ||r||_2 is returned to avoid 0 / 0
 17 | //   *iter_   : Number of iterations performed
 18 | //   res_vec  : If not NULL, res_vec[k] receives ||r||_2 after iteration k + 1
 19 | // Progress and timing information is printed only when print_level > 0.
 20 | void pcg(
 21 |     const int n, const DTYPE tol, const int max_iter, 
 22 |     const matvec_fptr Ax,    const void *Ax_param,    const DTYPE *b, 
 23 |     const matvec_fptr invMx, const void *invMx_param, DTYPE *x, 
 24 |     int *flag_, DTYPE *relres_, int *iter_, DTYPE *res_vec, int print_level
 25 | )
 26 | {
 27 |     size_t vec_msize = sizeof(DTYPE) * n;
 28 |     DTYPE *r = (DTYPE*) malloc(vec_msize);
 29 |     DTYPE *z = (DTYPE*) malloc(vec_msize);
 30 |     DTYPE *p = (DTYPE*) malloc(vec_msize);
 31 |     DTYPE *s = (DTYPE*) malloc(vec_msize);
 32 |     assert(r != NULL && z != NULL && p != NULL && s != NULL);
 33 | 
 34 |     double st, st0, et, et0;
 35 |     double t_Ax = 0, t_invMx = 0, t_vec = 0;
 36 |     st0 = omp_get_wtime();
 37 | 
 38 |     // r = b - A * x;
 39 |     st = omp_get_wtime();
 40 |     Ax(Ax_param, x, r);
 41 |     et = omp_get_wtime();
 42 |     t_Ax += et - st;
 43 | 
 44 |     st = omp_get_wtime();
 45 |     #pragma omp simd
 46 |     for (int i = 0; i < n; i++) r[i] = b[i] - r[i];
 47 |     
 48 |     // b_2norm = norm(b, 2);
 49 |     // r_2norm = norm(r, 2);
 50 |     // rn_stop = b_2norm * tol;
 51 |     // Both sums are loop-carried, so they are declared as simd reductions
 52 |     DTYPE b_2norm = 0.0, r_2norm = 0.0, rn_stop;
 53 |     #pragma omp simd reduction(+:b_2norm, r_2norm)
 54 |     for (int i = 0; i < n; i++)
 55 |     {
 56 |         b_2norm += b[i] * b[i];
 57 |         r_2norm += r[i] * r[i];
 58 |     }
 59 |     b_2norm = DSQRT(b_2norm);
 60 |     r_2norm = DSQRT(r_2norm);
 61 |     rn_stop = b_2norm * tol;
 62 |     // Denominator for every reported relative residual; falls back to 1 when b == 0
 63 |     const DTYPE b_scale = (b_2norm > 0.0) ? b_2norm : 1.0;
 64 |     et = omp_get_wtime();
 65 |     t_vec += et - st;
 66 | 
 67 |     if (print_level > 0)
 68 |     {        
 69 |         printf("\nPCG: ||b||_2 = %e, initial ||r||_2 = %e, stopping ||r||_2 = %e\n", b_2norm, r_2norm, rn_stop);
 70 |         printf("PCG: Max number of iterations: %d\n", max_iter);
 71 |         printf("Iter      Residual norm   Relative res.     \n");
 72 |     }
 73 | 
 74 |     int iter = 0;
 75 |     DTYPE alpha, beta, rho0, tmp, rho = 1.0;
 76 |     while (iter < max_iter && r_2norm > rn_stop)
 77 |     {
 78 |         // z = M \ r;
 79 |         st = omp_get_wtime();
 80 |         if (invMx != NULL) invMx(invMx_param, r, z);
 81 |         else memcpy(z, r, vec_msize);
 82 |         et = omp_get_wtime();
 83 |         t_invMx += et - st;
 84 | 
 85 |         // rho0 = rho;
 86 |         // rho  = r' * z;
 87 |         // beta = rho / rho0;
 88 |         st = omp_get_wtime();
 89 |         rho0 = rho;
 90 |         rho  = 0.0;
 91 |         #pragma omp simd reduction(+:rho)
 92 |         for (int i = 0; i < n; i++) rho += r[i] * z[i];
 93 |         beta = rho / rho0;
 94 | 
 95 |         // p = z + beta * p; or p = z; (beta is not used in the first iteration)
 96 |         if (iter == 0) memcpy(p, z, vec_msize);
 97 |         else
 98 |         {
 99 |             #pragma omp simd
100 |             for (int i = 0; i < n; i++) p[i] = z[i] + beta * p[i];
101 |         }
102 |         et = omp_get_wtime();
103 |         t_vec += et - st;
104 | 
105 |         // s = A * p;
106 |         // alpha = rho / (p' * s);
107 |         st = omp_get_wtime();
108 |         Ax(Ax_param, p, s);
109 |         et = omp_get_wtime();
110 |         t_Ax += et - st;
111 | 
112 |         st = omp_get_wtime();
113 |         tmp = 0.0;
114 |         #pragma omp simd reduction(+:tmp)
115 |         for (int i = 0; i < n; i++) tmp += p[i] * s[i];
116 |         alpha = rho / tmp;
117 | 
118 |         // x = x + alpha * p;
119 |         // r = r - alpha * s;
120 |         r_2norm = 0.0;
121 |         #pragma omp simd reduction(+:r_2norm)
122 |         for (int i = 0; i < n; i++) 
123 |         {
124 |             x[i] += alpha * p[i];
125 |             r[i] -= alpha * s[i];
126 |             r_2norm += r[i] * r[i];
127 |         }
128 |         r_2norm = DSQRT(r_2norm);
129 |         if (res_vec != NULL) res_vec[iter] = r_2norm;
130 |         iter++;
131 |         et = omp_get_wtime();
132 |         t_vec += et - st;
133 | 
134 |         if (print_level > 0) printf("%4d      %5.4e      %5.4e\n", iter, r_2norm, r_2norm / b_scale);
135 |     }  // End of while
136 |     *flag_   = (r_2norm <= rn_stop) ? 0 : 1;
137 |     *relres_ = r_2norm / b_scale;
138 |     *iter_   = iter;
139 |     et0 = omp_get_wtime();
140 | 
141 |     // Sanity check: recompute the true residual b - A * x (not included in the timings)
142 |     Ax(Ax_param, x, r);
143 |     r_2norm = 0.0;
144 |     #pragma omp simd reduction(+:r_2norm)
145 |     for (int i = 0; i < n; i++)
146 |     {
147 |         r[i] = b[i] - r[i];
148 |         r_2norm += r[i] * r[i];
149 |     }
150 |     r_2norm = DSQRT(r_2norm);
151 | 
152 |     if (print_level > 0)
153 |     {
154 |         printf("PCG: Final relres = %e\n", r_2norm / b_scale);
155 |         if (*flag_ == 0) printf("PCG: Converged in %d iterations, %.2f seconds\n", iter, et0 - st0);
156 |         else printf("PCG: Reached maximum number of iterations, %.2f seconds\n", et0 - st0);
157 |         printf("PCG: time for Ax, invMx, vector operations: %.2f, %.2f, %.2f seconds\n\n", t_Ax, t_invMx, t_vec);
158 |     }
159 | 
160 |     free(r);
161 |     free(z);
162 |     free(p);
163 |     free(s);
164 | }
165 | 


--------------------------------------------------------------------------------
/examples/PCG/pcg.h:
--------------------------------------------------------------------------------
 1 | #ifndef __PCG_H__
 2 | #define __PCG_H__
 3 | 
 4 | #ifndef DTYPE
 5 | #define DTYPE double
 6 | #define DSQRT sqrt
 7 | #endif
 8 | 
 9 | #ifdef __cplusplus
10 | extern "C" {
11 | #endif
12 | 
13 | // Operator callback computing b := A * x; param is an opaque pointer to the operator's data
14 | typedef void (*matvec_fptr) (const void *param, const DTYPE *x, DTYPE *b);
15 | 
16 | // Left preconditioned Conjugate Gradient for solving A * x = b
17 | // Reference: Iterative Methods for Sparse Linear Systems (2nd Edition), algorithm 9.1
18 | // Input parameters:
19 | //   n           : Size of the matrix
20 | //   tol         : Relative residual tolerance, stop when ||r||_2 <= tol * ||b||_2
21 | //   max_iter    : Maximum number of iterations
22 | //   Ax          : Function pointer for calculating A * x
23 | //   Ax_param    : Pointer to Ax function parameters
24 | //   b           : Size n, right-hand side vector
25 | //   invMx       : Function pointer for applying preconditioner M^{-1} * r, 
26 | //                 NULL pointer means no preconditioning
27 | //   invMx_param : Pointer to invMx function parameters
28 | //   x           : Size n, initial guess vector
29 | //   print_level : Values > 0 print the iteration history and timings, otherwise silent
30 | // Output parameters:
31 | //   x        : Size n, solution vector
32 | //   *flag_   : 0 == converged, 1 == not converged
33 | //   *relres_ : Residual vector relative 2-norm at last step
34 | //   *iter_   : Number of iterations performed
35 | //   res_vec  : Size >= max_iter, residual vector 2-norms (not relative) at each iteration, 
36 | //              NULL pointer means these values will not be recorded
37 | void pcg(
38 |     const int n, const DTYPE tol, const int max_iter, 
39 |     const matvec_fptr Ax,    const void *Ax_param,    const DTYPE *b, 
40 |     const matvec_fptr invMx, const void *invMx_param, DTYPE *x, 
41 |     int *flag_, DTYPE *relres_, int *iter_, DTYPE *res_vec, int print_level
42 | );
43 | 
44 | #ifdef __cplusplus
45 | }
46 | #endif
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/CSRPlus.h:
--------------------------------------------------------------------------------
 1 | // ---------------------------------------------------------------
 2 | // @brief  : CSRPlus matrix header file 
 3 | // @author : Hua Huang <huangh223@gatech.edu>
 4 | //           Edmond Chow <echow@cc.gatech.edu>
 5 | // 
 6 | //     Copyright (c) 2017-2020 Georgia Institute of Technology      
 7 | // ----------------------------------------------------------------
 8 | 
 9 | #ifndef __CSRPLUS_H__
10 | #define __CSRPLUS_H__
11 | 
12 | struct CSRP_mat
13 | {
14 |     // CSR matrix with block partitioning for multithreaded SpMV; standard CSR arrays and parameters
15 |     int    nrow, ncol, nnz;  // Number of rows, columns and non-zeros
16 |     int    *row_ptr;    // CSR row pointer array (presumably size nrow+1 -- TODO confirm)
17 |     int    *col;        // Column indices of the non-zeros
18 |     double *val;        // Values of the non-zeros
19 |     
20 |     // CSRPlus task partitioning information, filled by CSRP_partition_multithread()
21 |     int    nblk;        // Number of non-zero blocks 
22 |     int    nthread;     // Number of threads
23 |     int    *nnz_spos;   // First nnz of a block
24 |     int    *nnz_epos;   // Last  nnz (included) of a block
25 |     int    *first_row;  // First row of a block
26 |     int    *last_row;   // Last  row of a block
27 |     int    *fr_intact;  // If the first row of a block is intact
28 |     int    *lr_intact;  // If the last  row of a block is intact
29 |     double *fr_res;     // Partial result of the first row 
30 |     double *lr_res;     // Partial result of the last  row
31 | };
32 | 
33 | typedef struct CSRP_mat  CSRP_mat_s;
34 | typedef struct CSRP_mat* CSRP_mat_p;
35 | 
36 | #ifdef __cplusplus
37 | extern "C" {
38 | #endif
39 | 
40 | // Initialize a CSRP_mat structure using a COO matrix
41 | // Note: This function assumes that the input COO matrix is not sorted
42 | // Input parameters:
43 | //   nrow, ncol, nnz : Number of rows, columns and non-zeros
44 | //   row, col, val   : Row indices, column indices and values of non-zeros
45 | // Output parameter:
46 | //   *csrp_mat_  : Pointer to an initialized CSRP_mat structure
47 | void CSRP_init_with_COO_mat(
48 |     const int nrow, const int ncol, const int nnz, const int *row,
49 |     const int *col, const double *val, CSRP_mat_p *csrp_mat_
50 | );
51 | 
52 | // Free a CSRP_mat structure
53 | // Input parameter:
54 | //   *csrp_mat_ : Pointer to a CSRP_mat structure
55 | void CSRP_free(CSRP_mat_p *csrp_mat_);
56 | 
57 | // Partition a CSR matrix into multiple blocks with the same nnz
58 | // for multiple threads execution of SpMV
59 | // Input parameters:
60 | //   csrp_mat : Pointer to a CSRP_mat structure
61 | //   nblk     : Number of non-zero blocks
62 | //   nthread  : Number of threads to be used in SpMV later
63 | // Output parameter:
64 | //   csrp_mat : Pointer to a CSRP_mat structure with partitioning information
65 | void CSRP_partition_multithread(CSRP_mat_p csrp_mat, const int nblk, const int nthread);
66 | 
67 | // Use first-touch policy to optimize the storage of CSR arrays in a CSRP_mat structure
68 | // Input:
69 | //   csrp_mat : Pointer to a CSRP_mat structure
70 | // Output:
71 | //   csrp_mat : Pointer to a CSRP_mat structure with NUMA optimized storage
72 | void CSRP_optimize_NUMA(CSRP_mat_p csrp_mat);
73 | 
74 | // Perform OpenMP parallelized CSR SpMV with a CSRP_mat structure
75 | // Input parameters:
76 | //   csrp_mat : Pointer to an initialized and partitioned CSRP_mat structure
77 | //   x        : Input vector
78 | // Output parameter:
79 | //   y : Output vector, will be overwritten by csrp_mat * x 
80 | void CSRP_SpMV(CSRP_mat_p csrp_mat, const double *x, double *y);
81 | 
82 | #ifdef __cplusplus
83 | }
84 | #endif
85 | 
86 | #endif
87 | 
88 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/FSAI_precond.h:
--------------------------------------------------------------------------------
 1 | #ifndef __FSAI_PRECOND_H__
 2 | #define __FSAI_PRECOND_H__
 3 | 
 4 | #include "H2Pack.h"
 5 | #include "CSRPlus.h"
 6 | 
 7 | struct FSAI_precond
 8 | {
 9 |     int   mat_size;     // Size of the matrix to be preconditioned
10 |     int  *fwd_pmt;      // Forward permutation index array for input vector
11 |     int   *bwd_pmt;     // Backward permutation index array for output vector
12 |     DTYPE *x0;          // Size mat_size, storing G * b in FSAI_precond_apply()
13 |     DTYPE *pmt_b;       // Size mat_size, storing the input vector after permutation
14 |     DTYPE *pmt_x;       // Size mat_size, storing the output vector before permutation
15 |     CSRP_mat_p G, Gt;   // FSAI constructed matrix and its transpose
16 | 
17 |     // Statistic info (see FSAI_precond_print_stat())
18 |     int    n_apply;
19 |     double t_apply, t_build;
20 |     double mem_MB;
21 | };
22 | typedef struct FSAI_precond  FSAI_precond_s;
23 | typedef struct FSAI_precond* FSAI_precond_p;
24 | 
25 | #ifdef __cplusplus
26 | extern "C" {
27 | #endif
28 | 
29 | // Construct a FSAI_precond from a H2Pack structure
30 | // Input parameters:
31 | //   h2pack : Constructed H2Pack structure
32 | //   rank   : Number of nearest neighbors 
33 | //   shift  : Diagonal shifting of the target matrix
34 | // Output parameter:
35 | //   *precond_ : Constructed FSAI_precond structure
36 | void H2P_build_FSAI_precond(H2Pack_p h2pack, const int rank, const DTYPE shift, FSAI_precond_p *precond_);
37 | 
38 | // Apply FSAI preconditioner, x := M_{FSAI}^{-1} * b
39 | // Input parameters:
40 | //   precond : Constructed FSAI_precond structure
41 | //   b       : Size precond->mat_size, input vector
42 | // Output parameter:
43 | //   x : Size precond->mat_size, output vector
44 | void FSAI_precond_apply(FSAI_precond_p precond, const DTYPE *b, DTYPE *x);
45 | 
46 | // Destroy a FSAI_precond structure
47 | // Input parameter:
48 | //   *precond_ : Pointer to a FSAI_precond structure to be destroyed
49 | void FSAI_precond_destroy(FSAI_precond_p *precond_);
50 | 
51 | // Print statistic info of a FSAI_precond structure
52 | // Input parameter:
53 | //   precond : FSAI_precond structure whose statistic info to be printed
54 | void FSAI_precond_print_stat(FSAI_precond_p precond);
55 | 
56 | #ifdef __cplusplus
57 | }
58 | #endif
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/GCC-OpenBLAS.make:
--------------------------------------------------------------------------------
1 | # Toolchain selection: GCC with the OpenBLAS option enabled and the MKL option disabled
2 | CC           := gcc
3 | USE_MKL      := 0
4 | USE_OPENBLAS := 1
5 | 
6 | include common.make


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/ICC-MKL.make:
--------------------------------------------------------------------------------
1 | # Toolchain selection: icc with the MKL option enabled and the OpenBLAS option disabled
2 | CC           := icc
3 | USE_MKL      := 1
4 | USE_OPENBLAS := 0
5 | 
6 | include common.make


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/LRD_precond.h:
--------------------------------------------------------------------------------
 1 | #ifndef __LRD_PRECOND_H__
 2 | #define __LRD_PRECOND_H__
 3 | 
 4 | #include "H2Pack.h"
 5 | 
 6 | struct LRD_precond
 7 | {
 8 |     int   mat_size; // Size of the matrix to be preconditioned
 9 |     int   rank;     // Rank of the low-rank decomposition
10 |     int   *fwd_pmt; // Forward permutation index array for input vector
11 |     int   *bwd_pmt; // Backward permutation index array for output vector
12 |     DTYPE shift;    // Diagonal shift
13 |     DTYPE *Ut;      // Size rank * mat_size, LRD matrix
14 |     DTYPE *pmt_b;   // Size mat_size, storing the input vector after permutation
15 |     DTYPE *pmt_x;   // Size mat_size, storing the output vector before permutation
16 |     DTYPE *workbuf; // Size rank, working buffer in LRD_precond_apply()
17 | 
18 |     // Statistic info (see LRD_precond_print_stat())
19 |     int    n_apply;
20 |     double t_apply, t_build;
21 |     double mem_MB;
22 | };
23 | typedef struct LRD_precond  LRD_precond_s;
24 | typedef struct LRD_precond* LRD_precond_p;
25 | 
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 | 
30 | // Construct a LRD_precond from a H2Pack structure using Nystrom method with random sampling
31 | // Input parameters:
32 | //   h2pack : Constructed H2Pack structure
33 | //   rank   : Rank of the low-rank decomposition
34 | //   shift  : Diagonal shifting of the target matrix
35 | // Output parameter:
36 | //   *precond_ : Constructed LRD_precond structure
37 | void H2P_build_LRD_precond(H2Pack_p h2pack, const int rank, const DTYPE shift, LRD_precond_p *precond_);
38 | 
39 | // Apply LRD preconditioner, x := M_{LRD}^{-1} * b
40 | // Input parameters:
41 | //   precond : Constructed LRD_precond structure
42 | //   b       : Size precond->mat_size, input vector
43 | // Output parameter:
44 | //   x : Size precond->mat_size, output vector
45 | void LRD_precond_apply(LRD_precond_p precond, const DTYPE *b, DTYPE *x);
46 | 
47 | // Destroy a LRD_precond structure
48 | // Input parameter:
49 | //   *precond_ : Pointer to a LRD_precond structure to be destroyed
50 | void LRD_precond_destroy(LRD_precond_p *precond_);
51 | 
52 | // Print statistic info of a LRD_precond structure
53 | // Input parameter:
54 | //   precond : LRD_precond structure whose statistic info to be printed
55 | void LRD_precond_print_stat(LRD_precond_p precond);
56 | 
57 | #ifdef __cplusplus
58 | }
59 | #endif
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/block_jacobi_precond.h:
--------------------------------------------------------------------------------
 1 | #ifndef __BLOCK_JACOBI_PRECOND_H__
 2 | #define __BLOCK_JACOBI_PRECOND_H__
 3 | 
 4 | #include "H2Pack.h"
 5 | 
 6 | struct block_jacobi_precond
 7 | {
 8 |     int    mat_size;        // Size of the matrix to be preconditioned
 9 |     int    n_block;         // Number of blocks to use
10 |     int    *blk_sizes;      // Size n_block, size of each block
11 |     int    *blk_displs;     // Size n_block+1, start row & column of each block
12 |     int    *fwd_pmt;        // Forward permutation index array for input vector
13 |     int    *bwd_pmt;        // Backward permutation index array for output vector
14 |     size_t *blk_inv_ptr;    // Size n_block, offset of the inverse of each block (presumably into blk_inv -- TODO confirm)
15 |     DTYPE  *pmt_b;          // Size mat_size, storing the input vector after permutation
16 |     DTYPE  *pmt_x;          // Size mat_size, storing the output vector before permutation
17 |     DTYPE  *blk_inv;        // Size unknown, inverse of each block
18 | 
19 |     // Statistic info (see block_jacobi_precond_print_stat())
20 |     int    n_apply;
21 |     double t_apply, t_build;
22 |     double mem_MB;
23 | };
24 | typedef struct block_jacobi_precond  block_jacobi_precond_s;
25 | typedef struct block_jacobi_precond* block_jacobi_precond_p;
26 | 
27 | #ifdef __cplusplus
28 | extern "C" {
29 | #endif
30 | 
31 | // Construct a block_jacobi_precond from a H2Pack structure
32 | // Input parameters:
33 | //   h2pack : Constructed H2Pack structure
34 | //   shift  : Diagonal shifting of the target matrix
35 | // Output parameter:
36 | //   *precond_ : Constructed block_jacobi_precond structure
37 | void H2P_build_block_jacobi_precond(H2Pack_p h2pack, const DTYPE shift, block_jacobi_precond_p *precond_);
38 | 
39 | // Apply block Jacobi preconditioner, x := M_{BJP}^{-1} * b
40 | // Input parameters:
41 | //   precond : Constructed block_jacobi_precond structure
42 | //   b       : Size precond->mat_size, input vector
43 | // Output parameter:
44 | //   x : Size precond->mat_size, output vector
45 | void block_jacobi_precond_apply(block_jacobi_precond_p precond, const DTYPE *b, DTYPE *x);
46 | 
47 | // Destroy a block_jacobi_precond structure
48 | // Input parameter:
49 | //   *precond_ : Pointer to a block_jacobi_precond structure to be destroyed
50 | void block_jacobi_precond_destroy(block_jacobi_precond_p *precond_);
51 | 
52 | // Print statistic info of a block_jacobi_precond structure
53 | // Input parameter:
54 | //   precond : block_jacobi_precond structure whose statistic info to be printed
55 | void block_jacobi_precond_print_stat(block_jacobi_precond_p precond);
56 | 
57 | #ifdef __cplusplus
58 | }
59 | #endif
60 | 
61 | #endif
62 | 
63 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/common.make:
--------------------------------------------------------------------------------
 1 | # Shared build rules for the SPDHSS-H2 examples. The including makefile is
 2 | # expected to set CC, USE_MKL and USE_OPENBLAS before including this file.
 3 | H2PACK_DIR = ../..
 4 | 
 5 | # CFLAGS stays recursively expanded (=) because INCS and DEFS are extended below
 6 | DEFS    = 
 7 | INCS    = -I$(H2PACK_DIR)/include
 8 | CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
 9 | LDFLAGS = -g -O3 -fopenmp
10 | LIBS    = 
11 | 
12 | # Compiler-specific flags, selected by matching the output of "$(CC) --version"
13 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1)
14 | CFLAGS  += -fopenmp -xHost
15 | endif
16 | 
17 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1)
18 | CFLAGS  += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
19 | LIBS    += -lgfortran -lm
20 | endif
21 | 
22 | ifeq ($(strip $(USE_MKL)), 1)
23 | DEFS    += -DUSE_MKL
24 | CFLAGS  += -mkl=parallel
25 | LIBS    += -mkl
26 | endif
27 | 
28 | ifeq ($(strip $(USE_OPENBLAS)), 1)
29 | OPENBLAS_INSTALL_DIR = ../../../OpenBLAS-git/install
30 | DEFS    += -DUSE_OPENBLAS
31 | INCS    += -I$(OPENBLAS_INSTALL_DIR)/include
32 | LIBS    += -L$(OPENBLAS_INSTALL_DIR)/lib -lopenblas
33 | endif
34 | 
35 | # Expanded once (:=) so the directory is not re-globbed on every reference.
36 | # C_SRCS only covers this directory; ../PCG/pcg.c.o is listed in SHARED_OBJS.
37 | C_SRCS  := $(wildcard *.c)
38 | C_OBJS  := $(C_SRCS:.c=.c.o)
39 | EXES    = example_regularHSS.exe example_SPDHSSH2.exe example_SPDHSSH2_tol.exe test_FSAI.exe test_FSAI_IE.exe
40 | SHARED_OBJS = ../PCG/pcg.c.o block_jacobi_precond.c.o LRD_precond.c.o FSAI_precond.c.o CSRPlus.c.o pcg_tests.c.o
41 | 
42 | # Delete the default old-fashion double-suffix rules
43 | .SUFFIXES:
44 | 
45 | # all and clean are commands, not files; a file with one of these names must not mask them
46 | .PHONY: all clean
47 | 
48 | # Keep the object files even when make treats them as intermediate files
49 | .SECONDARY: $(C_OBJS) $(SHARED_OBJS)
50 | 
51 | all: $(EXES)
52 | 
53 | # $< (first prerequisite only) keeps the -c ... -o command valid if prerequisites are added later
54 | %.c.o: %.c
55 | 	$(CC) $(CFLAGS) -c $< -o $@
56 | 
57 | # Each executable links its own object file with the shared objects and libH2Pack.a
58 | %.exe: %.c.o $(SHARED_OBJS) $(H2PACK_DIR)/lib/libH2Pack.a
59 | 	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
60 | 
61 | # SHARED_OBJS is removed as well so that ../PCG/pcg.c.o (not part of C_OBJS) is not left behind
62 | clean:
63 | 	rm -f $(EXES) $(C_OBJS) $(SHARED_OBJS)
64 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/pcg_tests.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | 
  6 | #include "H2Pack.h"
  7 | 
  8 | #include "../PCG/pcg.h"
  9 | #include "block_jacobi_precond.h"
 10 | #include "LRD_precond.h"
 11 | #include "FSAI_precond.h"
 12 | 
 13 | static DTYPE shift_;  // Diagonal shift used by H2Pack_matvec(), assigned in pcg_tests()
 14 | // PCG operator callback: x := H2P_matvec(b) + shift_ * b
 15 | void H2Pack_matvec(const void *h2pack_, const DTYPE *b, DTYPE *x)
 16 | {
 17 |     H2P_matvec((H2Pack_p) h2pack_, b, x);
 18 |     const int n = ((H2Pack_p) h2pack_)->krnl_mat_size;
 19 |     #pragma omp simd
 20 |     for (int k = 0; k < n; k++) x[k] += shift_ * b[k];
 21 | }
 22 | 
 23 | // PCG preconditioner callback forwarding to block_jacobi_precond_apply()
 24 | void block_jacobi_precond(const void *precond_, const DTYPE *b, DTYPE *x)
 25 | {
 26 |     block_jacobi_precond_apply((block_jacobi_precond_p) precond_, b, x);
 27 | }
 28 | 
 29 | // PCG preconditioner callback forwarding to LRD_precond_apply()
 30 | void LRD_precond(const void *precond_, const DTYPE *b, DTYPE *x)
 31 | {
 32 |     LRD_precond_apply((LRD_precond_p) precond_, b, x);
 33 | }
 34 | 
 35 | // PCG preconditioner callback forwarding to FSAI_precond_apply()
 36 | void FSAI_precond(const void *precond_, const DTYPE *b, DTYPE *x)
 37 | {
 38 |     FSAI_precond_apply((FSAI_precond_p) precond_, b, x);
 39 | }
 40 | 
 41 | // PCG preconditioner callback using the HSS ULV Cholesky solve (op 3: meaning not shown here -- TODO confirm)
 42 | void HSS_ULV_Chol_precond(const void *hssmat_, const DTYPE *b, DTYPE *x)
 43 | {
 44 |     H2P_HSS_ULV_Cholesky_solve((H2Pack_p) hssmat_, 3, b, x);
 45 | }
 46 | 
 47 | // Zero the solution vector, run one timed PCG solve with H2Pack_matvec() as the
 48 | // operator and the given preconditioner callback (NULL means none), then print
 49 | // the iteration count, relative residual and elapsed time
 50 | static void pcg_solve_and_report(
 51 |     const int krnl_mat_size, H2Pack_p h2mat, const DTYPE *rhs, DTYPE *sol,
 52 |     const matvec_fptr invMx, const void *invMx_param, 
 53 |     const int max_iter, const DTYPE CG_tol
 54 | )
 55 | {
 56 |     int   conv_flag, n_iter;
 57 |     DTYPE relres;
 58 |     const int pcg_print_level = 1;
 59 | 
 60 |     memset(sol, 0, sizeof(DTYPE) * krnl_mat_size);
 61 |     const double t_start = get_wtime_sec();
 62 |     pcg(
 63 |         krnl_mat_size, CG_tol, max_iter, 
 64 |         H2Pack_matvec, h2mat, rhs, invMx, invMx_param, sol,
 65 |         &conv_flag, &relres, &n_iter, NULL, pcg_print_level
 66 |     );
 67 |     const double t_stop = get_wtime_sec();
 68 |     printf("PCG stopped after %d iterations, relres = %e, used time = %.2lf sec\n", n_iter, relres, t_stop - t_start);
 69 | }
 70 | 
 71 | // Test preconditioned conjugate gradient solver with different preconditioner
 72 | // method == 0 runs every test below in order, method == 1..5 runs only that one
 73 | void pcg_tests(
 74 |     const int krnl_mat_size, H2Pack_p h2mat, H2Pack_p hssmat, const DTYPE shift, 
 75 |     const int max_rank, const int max_iter, const DTYPE CG_tol, const int method
 76 | )
 77 | {
 78 |     const size_t vec_bytes = sizeof(DTYPE) * krnl_mat_size;
 79 |     DTYPE *x = malloc(vec_bytes);
 80 |     DTYPE *y = malloc(vec_bytes);
 81 |     assert(x != NULL && y != NULL);
 82 | 
 83 |     // Random right hand side vector, fixed seed
 84 |     srand48(2);
 85 |     for (int k = 0; k < krnl_mat_size; k++) y[k] = 0.5 - drand48();
 86 | 
 87 |     // H2Pack_matvec() reads the diagonal shift from this file-scope variable
 88 |     shift_ = shift;
 89 |     const int run_all = (method == 0);
 90 | 
 91 |     // 1: no preconditioner
 92 |     if (run_all || method == 1)
 93 |     {
 94 |         printf("\nStarting PCG solve without preconditioner...\n");
 95 |         pcg_solve_and_report(krnl_mat_size, h2mat, y, x, NULL, NULL, max_iter, CG_tol);
 96 |     }
 97 | 
 98 |     // 2: block Jacobi preconditioner
 99 |     if (run_all || method == 2)
100 |     {
101 |         block_jacobi_precond_p bj_precond;
102 |         printf("\nConstructing block Jacobi preconditioner...\n");
103 |         H2P_build_block_jacobi_precond(h2mat, shift, &bj_precond);
104 |         printf("Starting PCG solve with block Jacobi preconditioner...\n");
105 |         pcg_solve_and_report(krnl_mat_size, h2mat, y, x, block_jacobi_precond, bj_precond, max_iter, CG_tol);
106 |         block_jacobi_precond_print_stat(bj_precond);
107 |         block_jacobi_precond_destroy(&bj_precond);
108 |     }
109 | 
110 |     // 3: LRD preconditioner
111 |     if (run_all || method == 3)
112 |     {
113 |         LRD_precond_p lrd_precond;
114 |         printf("\nConstructing LRD preconditioner...\n");
115 |         H2P_build_LRD_precond(h2mat, max_rank, shift, &lrd_precond);
116 |         printf("Starting PCG solve with LRD preconditioner...\n");
117 |         pcg_solve_and_report(krnl_mat_size, h2mat, y, x, LRD_precond, lrd_precond, max_iter, CG_tol);
118 |         LRD_precond_print_stat(lrd_precond);
119 |         LRD_precond_destroy(&lrd_precond);
120 |     }
121 | 
122 |     // 4: FSAI preconditioner
123 |     if (run_all || method == 4)
124 |     {
125 |         FSAI_precond_p fsai_precond;
126 |         printf("\nConstructing FSAI preconditioner...\n");
127 |         H2P_build_FSAI_precond(h2mat, max_rank, shift, &fsai_precond);
128 |         printf("Starting PCG solve with FSAI preconditioner...\n");
129 |         pcg_solve_and_report(krnl_mat_size, h2mat, y, x, FSAI_precond, fsai_precond, max_iter, CG_tol);
130 |         FSAI_precond_print_stat(fsai_precond);
131 |         FSAI_precond_destroy(&fsai_precond);
132 |     }
133 | 
134 |     // 5: SPDHSS preconditioner, applied through the already constructed hssmat
135 |     if (run_all || method == 5)
136 |     {
137 |         printf("\nStarting PCG solve with SPDHSS preconditioner...\n");
138 |         pcg_solve_and_report(krnl_mat_size, h2mat, y, x, HSS_ULV_Chol_precond, hssmat, max_iter, CG_tol);
139 |     }
140 | 
141 |     free(x);
142 |     free(y);
143 | }
144 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/pcg_tests.h:
--------------------------------------------------------------------------------
 1 | #ifndef __PCG_TESTS_H__
 2 | #define __PCG_TESTS_H__
 3 | 
 4 | #include "H2Pack.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | // Test preconditioned conjugate gradient solver with different preconditioner
11 | // Input parameters:
12 | //   krnl_mat_size  : Size of the kernel matrix
13 | //   h2mat          : Constructed H2 matrix
14 | //   hssmat         : Constructed SPDHSS matrix
15 | //   shift          : Diagonal shift
16 | //   max_rank       : Maximum approximation rank for LRD and FSAI
17 | //   max_iter       : Maximum number of PCG iterations
18 | //   CG_tol         : Relative residual tolerance, passed to pcg() as tol
19 | //   method         : Method(s) to be tested: 1-5: no precond, block Jacobi, LRD, 
20 | //                    FSAI, HSS. 0: test all. 
21 | void pcg_tests(
22 |     const int krnl_mat_size, H2Pack_p h2mat, H2Pack_p hssmat, const DTYPE shift, 
23 |     const int max_rank, const int max_iter, const DTYPE CG_tol, const int method
24 | );
25 | 
26 | #ifdef __cplusplus
27 | }
28 | #endif
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/test_FSAI.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <math.h>
 5 | #include <assert.h>
 6 | #include <time.h>
 7 | #include <omp.h>
 8 | 
 9 | #include "FSAI_precond.h"
10 | #include "../AFN_precond/precond_test_utils.h"
11 | #include "../PCG/pcg.h"
12 | 
13 | // FSAI preconditioner test driver: parses the command line, generates or loads
14 | // the point coordinates, builds the H2 matrix and the FSAI preconditioner, and
15 | // runs a PCG test on (K(X, X) + mu * I) * x = b. Returns 255 on usage or input errors.
16 | int main(int argc, char **argv)
17 | {
18 |     // Parse command line arguments
19 |     int kid, npt, pt_dim, fsai_npt, fast_knn;
20 |     DTYPE mu, kp, *coord = NULL;
21 |     void *krnl_param = &kp;
22 |     kernel_eval_fptr krnl_eval = NULL;
23 |     kernel_bimv_fptr krnl_bimv = NULL;
24 |     int krnl_bimv_flops = 0;
25 |     if (argc < 8)
26 |     {
27 |         printf("Usage: %s kid kp mu npt pt_dim fsai_npt fast_knn coord_bin\n", argv[0]);
28 |         printf("  - kid       [int]    : Kernel function ID\n");
29 |         printf("                         1 - Gaussian    k(x, y) = exp(-l * |x-y|^2)\n");
30 |         printf("                         2 - Exponential k(x, y) = exp(-l * |x-y|)\n");
31 |         printf("                         3 - 3/2 Matern  k(x, y) = (1 + l*k) * exp(-l*k), k = sqrt(3) * |x-y|\n");
32 |         printf("                         4 - 5/2 Matern  k(x, y) = (1 + l*k + l^2*k^2/3) * exp(-l*k), k = sqrt(5) * |x-y|\n");
33 |         printf("  - kp        [double] : Kernel function parameter (l)\n");
34 |         printf("  - mu        [double] : Kernel matrix diagonal shift\n");
35 |         printf("  - npt       [int]    : Number of points\n");
36 |         printf("  - pt_dim    [int]    : Point dimension\n");
37 |         printf("  - fsai_npt  [int]    : Maximum number of nonzeros in each row of the AFN FSAI matrix\n");
38 |         printf("  - fast_knn  [0 or 1] : If AFN FSAI should use fast approximated KNN instead of exact KNN\n");
39 |         printf("  - coord_bin [str]    : (Optional) Binary file containing the coordinates, size pt_dim * npt,\n");
40 |         printf("                         row major, each column is a point coordinate\n");
41 |         return 255;
42 |     } 
43 |     kid      = atoi(argv[1]);
44 |     kp       = atof(argv[2]);
45 |     mu       = atof(argv[3]);
46 |     npt      = atoi(argv[4]);
47 |     pt_dim   = atoi(argv[5]);
48 |     fsai_npt = atoi(argv[6]);
49 |     fast_knn = atoi(argv[7]);
50 |     // Sanitize kid and pt_dim BEFORE sizing the coordinate buffer: clamping pt_dim
51 |     // after the allocation could leave the buffer smaller than npt * pt_dim
52 |     if (kid < 1 || kid > 4) kid = 1;
53 |     if (pt_dim < 2 || pt_dim > 3) pt_dim = 3;
54 |     if (npt < 1)
55 |     {
56 |         fprintf(stderr, "npt must be a positive integer\n");
57 |         return 255;
58 |     }
59 |     const size_t n_coord = (size_t) npt * (size_t) pt_dim;
60 |     coord = (DTYPE*) malloc(sizeof(DTYPE) * n_coord);
61 |     if (coord == NULL)
62 |     {
63 |         fprintf(stderr, "Failed to allocate the coordinate array\n");
64 |         return 255;
65 |     }
66 |     if (argc >= 9)
67 |     {
68 |         // A missing or truncated file would leave coord (partly) uninitialized, so stop here
69 |         FILE *inf = fopen(argv[8], "rb");
70 |         size_t n_read = (inf == NULL) ? 0 : fread(coord, sizeof(DTYPE), n_coord, inf);
71 |         if (inf != NULL) fclose(inf);
72 |         if (n_read != n_coord)
73 |         {
74 |             fprintf(stderr, "Failed to read %zu coordinate values from %s\n", n_coord, argv[8]);
75 |             free(coord);
76 |             return 255;
77 |         }
78 |     } else {
79 |         srand(814);  // Match with Tianshi's code
80 |         DTYPE scale = DPOW((DTYPE) npt, 1.0 / (DTYPE) pt_dim);
81 |         for (size_t i = 0; i < n_coord; i++) coord[i] = scale * (rand() / (DTYPE)(RAND_MAX));
82 |     }
83 |     select_kernel(kid, pt_dim, kp, mu, npt, &krnl_eval, &krnl_bimv, &krnl_bimv_flops);
84 |     printf("Point set: %d points in %d-D\n", npt, pt_dim);
85 |     printf("Linear system to solve: (K(X, X) + %.4f * I) * x = b\n", mu);
86 |     printf("\nFSAI preconditioner parameters:\n");
87 |     printf("- Maximum FSAI matrix nonzeros per row = %d\n", fsai_npt);
88 |     printf("- Fast KNN for FSAI sparsity pattern   = %s\n", fast_knn ? "Yes" : "No");
89 |     printf("\n");
90 |     
91 |     // Build H2 matrix
92 |     H2Pack_p h2mat = NULL;
93 |     DTYPE h2_reltol = 1e-8;
94 |     H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat);
95 | 
96 |     // Build FSAI preconditioner
97 |     printf("Building FSAI preconditioner...\n");
98 |     FSAI_precond_p FSAI_precond = NULL;
99 |     H2P_build_FSAI_precond(h2mat, fsai_npt, mu, &FSAI_precond);
100 |     int nnz_upper = fsai_npt * (fsai_npt + 1) / 2 + fsai_npt * (npt - fsai_npt);
101 |     DEBUG_PRINTF("FSAI G matrix nnz = %d, nnz upper bound = %d\n", FSAI_precond->G->nnz, nnz_upper);
102 | 
103 |     // PCG test
104 |     DTYPE CG_reltol = 1e-4;
105 |     int max_iter = 400;
106 |     test_PCG(
107 |         H2Pack_matvec_diagshift, (void *) h2mat, 
108 |         (matvec_fptr) FSAI_precond_apply, (void *) FSAI_precond, 
109 |         npt, max_iter, CG_reltol
110 |     );
111 |     FSAI_precond_print_stat(FSAI_precond);
112 | 
113 |     // Clean up
114 |     printf("\n");
115 |     free(coord);
116 |     H2P_destroy(&h2mat);
117 |     FSAI_precond_destroy(&FSAI_precond);
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/examples/SPDHSS-H2/test_FSAI_IE.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | #include <math.h>
  6 | #include <omp.h>
  7 | 
  8 | #include "H2Pack.h"
  9 | #include "FSAI_precond.h"
 10 | #include "../AFN_precond/IE_diag_quad.h"
 11 | #include "../AFN_precond/precond_test_utils.h"
 12 | 
 13 | // This file follows the test settings in the FLAM library rskelf/test/{ie_cube1.m, ie_square1.m}
 14 | // Solve an integral equation (IE): a_i * u_i + b_i * \sum_{j=1}^n K(x_i, x_j) * c_j * u_j = f_i,
 15 | // with setting a(x_i) == 0, b(x_i) == c(x_i) == 1, K(x, y) is the Laplace kernel. 
 16 | // This test setting is the same as Example 5 in paper DOI:10.1002/cpa.21577
 17 | 
 18 | static DTYPE k_scale;  // Scaling factor for the kernel function
 19 | static void H2Pack_matvec_scale(const void *h2pack_, const DTYPE *b, DTYPE *x)
 20 | {
 21 |     H2Pack_p h2pack = (H2Pack_p) h2pack_;
 22 |     H2P_matvec(h2pack, b, x);
 23 |     #pragma omp simd
 24 |     for (int i = 0; i < h2pack->krnl_mat_size; i++) x[i] *= k_scale;
 25 | }
 26 | 
 27 | int main(int argc, char **argv)
 28 | {
 29 |     // Parse command line arguments
 30 |     int npt, pt_dim, dim_n;
 31 |     int fsai_npt, fast_knn, max_iter;
 32 |     DTYPE mu = 0.0, dv, solve_tol, h, *coord = NULL, *krnl_param = NULL;
 33 |     kernel_eval_fptr krnl_eval = NULL;
 34 |     kernel_bimv_fptr krnl_bimv = NULL;
 35 |     int krnl_bimv_flops = 0;
 36 |     if (argc < 7)
 37 |     {
 38 |         printf("Usage: %s pt_dim dim_n fsai_npt fast_knn solve_tol max_iter\n", argv[0]);
 39 |         printf("  - pt_dim       [int]    : Point dimension, 2 or 3\n");
 40 |         printf("  - dim_n        [int]    : Number of discretization points in each dimension\n");
 41 |         printf("  - fsai_npt     [int]    : FSAI nonzeros per row\n");
 42 |         printf("  - fast_knn     [0 or 1] : If FSAI should use fast approximated KNN instead of exact KNN\n");
 43 |         printf("  - solve_tol    [double] : PCG relative residual tolerance\n");
 44 |         printf("  - max_iter     [int]    : PCG maximum iteration\n");
 45 |         return 255;
 46 |     } 
 47 |     pt_dim       = atoi(argv[1]);
 48 |     dim_n        = atoi(argv[2]);
 49 |     fsai_npt     = atoi(argv[3]);
 50 |     fast_knn     = atoi(argv[4]);
 51 |     solve_tol    = atof(argv[5]);
 52 |     max_iter     = atoi(argv[6]);
 53 |     printf("Point set: %d^%d equal-space points in [0, 1]^%d\n", dim_n, pt_dim, pt_dim);
 54 |     printf("Laplace kernel, K(x, y) = ");
 55 |     if (pt_dim == 2)
 56 |     {
 57 |         krnl_eval = Laplace_2D_eval_intrin_t;
 58 |         krnl_bimv = Laplace_2D_krnl_bimv_intrin_t;
 59 |         krnl_bimv_flops = Laplace_2D_krnl_bimv_flop;
 60 |         // Laplace_2D computes -log(|x - y|), so the scaling factor is 1 / (2 * pi)
 61 |         k_scale = 1.0 / (2.0 * M_PI);
 62 |         dv = diag_quad_2d[dim_n - 1];
 63 |         printf("-1 / (2 * pi) * log(|x - y|), K(x, x) = %e\n", dv);
 64 |     } else {
 65 |         krnl_eval = Coulomb_3D_eval_intrin_t;
 66 |         krnl_bimv = Coulomb_3D_krnl_bimv_intrin_t;
 67 |         krnl_bimv_flops = Coulomb_3D_krnl_bimv_flop;
 68 |         // Coulomb_3D computes 1 / |x - y|, so the scaling factor is 1 / (4 * pi)
 69 |         k_scale = 1.0 / (4.0 * M_PI);
 70 |         dv = diag_quad_3d[dim_n - 1];
 71 |         printf("1 / (4 * pi * |x - y|), K(x, x) = %e\n", dv);
 72 |     }
 73 |     printf("Linear system to solve: K(X, X) * x = b\n");
 74 |     printf("PCG relative residual tolerance = %.2e, max iterations = %d\n", solve_tol, max_iter);
 75 |     printf("\nFSAI nonzeros per row = %d\n", fsai_npt);
 76 |     printf("\nFast KNN for FSAI sparsity pattern = %s\n", fast_knn ? "Yes" : "No");
 77 | 
 78 |     // Generate equal-space grid
 79 |     h = 1.0 / (DTYPE) dim_n;
 80 |     npt = (pt_dim == 2) ? dim_n * dim_n : dim_n * dim_n * dim_n;
 81 |     n_ = npt;
 82 |     coord = (DTYPE*) malloc(sizeof(DTYPE) * pt_dim * npt);
 83 |     if (pt_dim == 2)
 84 |     {
 85 |         for (int i = 0; i < dim_n; i++)
 86 |         {
 87 |             for (int j = 0; j < dim_n; j++)
 88 |             {
 89 |                 int idx = i * dim_n + j;
 90 |                 coord[0 * npt + idx] = h * (i + 1);
 91 |                 coord[1 * npt + idx] = h * (j + 1);
 92 |             }
 93 |         }
 94 |     } else {
 95 |         for (int i = 0; i < dim_n; i++)
 96 |         {
 97 |             for (int j = 0; j < dim_n; j++)
 98 |             {
 99 |                 for (int k = 0; k < dim_n; k++)
100 |                 {
101 |                     int idx = i * dim_n * dim_n + j * dim_n + k;
102 |                     coord[0 * npt + idx] = h * (i + 1);
103 |                     coord[1 * npt + idx] = h * (j + 1);
104 |                     coord[2 * npt + idx] = h * (k + 1);
105 |                 }
106 |             }
107 |         }
108 |     }
109 | 
110 |     // Scale the kernel matrix for area-weighted point interaction (what's this?)
111 |     k_scale = k_scale / (DTYPE) npt;  
112 |     // Since the diagonal value will also be scaled by k_scale, we need to scale it back
113 |     dv = dv / k_scale;
114 |     krnl_param = &dv;
115 | 
116 |     // Build H2 matrix
117 |     double st, et;
118 |     H2Pack_p h2mat = NULL;
119 |     DTYPE h2_reltol = (solve_tol < 1e-8) ? solve_tol : 1e-8;
120 |     H2mat_build(npt, pt_dim, coord, h2_reltol, krnl_eval, krnl_bimv, krnl_bimv_flops, krnl_param, &h2mat);
121 | 
122 |     // Build FSAI preconditioner
123 |     printf("Building FSAI preconditioner...\n");
124 |     FSAI_precond_p FSAI_precond = NULL;
125 |     H2P_build_FSAI_precond(h2mat, fsai_npt, mu, &FSAI_precond);
126 |     int nnz_upper = fsai_npt * (fsai_npt + 1) / 2 + fsai_npt * (npt - fsai_npt);
127 |     DEBUG_PRINTF("FSAI G matrix nnz = %d, nnz upper bound = %d\n", FSAI_precond->G->nnz, nnz_upper);
128 |     printf("\n\n");
129 | 
130 |     // PCG test
131 |     printf("\nTesting FSAI preconditioner...\n");
132 |     test_PCG(
133 |         H2Pack_matvec_scale, (void *) h2mat, 
134 |         (matvec_fptr) FSAI_precond_apply, (void *) FSAI_precond, 
135 |         npt, max_iter, solve_tol
136 |     );
137 | 
138 |     // Clean up
139 |     free(coord);
140 |     H2P_destroy(&h2mat);
141 |     FSAI_precond_destroy(&FSAI_precond);
142 |     return 0;
143 | }
144 | 


--------------------------------------------------------------------------------
/examples/common.make:
--------------------------------------------------------------------------------
 1 | H2PACK_INSTALL_DIR = ..
 2 | 
 3 | DEFS    = 
 4 | INCS    = -I$(H2PACK_INSTALL_DIR)/include
 5 | CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
 6 | LDFLAGS = -g -O3 -fopenmp
 7 | LIBS    = $(H2PACK_INSTALL_DIR)/lib/libH2Pack.a
 8 | 
 9 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1)
10 | CFLAGS  += -fopenmp -xHost
11 | endif
12 | 
13 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1)
14 | CFLAGS  += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
15 | LIBS    += -lgfortran -lm
16 | endif
17 | 
18 | ifeq ($(strip $(USE_MKL)), 1)
19 | DEFS    += -DUSE_MKL
20 | CFLAGS  += -mkl
21 | LDFLAGS += -mkl
22 | endif
23 | 
24 | ifeq ($(strip $(USE_OPENBLAS)), 1)
25 | OPENBLAS_INSTALL_DIR = ../../OpenBLAS-git/install
26 | DEFS    += -DUSE_OPENBLAS
27 | INCS    += -I$(OPENBLAS_INSTALL_DIR)/include
28 | LDFLAGS += -L$(OPENBLAS_INSTALL_DIR)/lib
29 | LIBS    += -lopenblas
30 | endif
31 | 
32 | C_SRCS 	= $(wildcard *.c)
33 | C_OBJS  = $(C_SRCS:.c=.c.o)
34 | EXES    = $(C_SRCS:.c=.exe)
35 | 
36 | # Delete the default old-fashion double-suffix rules
37 | .SUFFIXES:
38 | 
39 | .SECONDARY: $(C_OBJS)
40 | 
41 | all: $(EXES)
42 | 
43 | %.c.o: %.c
44 | 	$(CC) $(CFLAGS) -c $^ -o $@
45 | 
46 | %.exe: %.c.o
47 | 	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
48 | 
49 | clean:
50 | 	rm -f $(EXES) $(C_OBJS)


--------------------------------------------------------------------------------
/examples/direct_nbody.h:
--------------------------------------------------------------------------------
 1 | 
 2 | void direct_nbody(
 3 |     const void *krnl_param, kernel_eval_fptr krnl_eval, const int pt_dim, const int krnl_dim, 
 4 |     const DTYPE *src_coord, const int src_coord_ld, const int n_src_pt, const DTYPE *src_val,
 5 |     const DTYPE *dst_coord, const int dst_coord_ld, const int n_dst_pt, DTYPE *dst_val
 6 | )
 7 | {
 8 |     const int npt_blk  = 256;
 9 |     const int blk_size = npt_blk * krnl_dim;
10 |     const int n_thread = omp_get_max_threads();
11 |     
12 |     memset(dst_val, 0, sizeof(DTYPE) * n_dst_pt * krnl_dim);
13 |     
14 |     DTYPE *krnl_mat_buffs = (DTYPE*) malloc(sizeof(DTYPE) * n_thread * blk_size * blk_size);
15 |     assert(krnl_mat_buffs != NULL);
16 |     
17 |     #pragma omp parallel
18 |     {
19 |         int tid = omp_get_thread_num();
20 |         DTYPE *krnl_mat_buff = krnl_mat_buffs + tid * blk_size * blk_size;
21 |         
22 |         int tid_dst_pt_s, tid_dst_pt_n, tid_dst_pt_e;
23 |         calc_block_spos_len(n_dst_pt, n_thread, tid, &tid_dst_pt_s, &tid_dst_pt_n);
24 |         tid_dst_pt_e = tid_dst_pt_s + tid_dst_pt_n;
25 |         
26 |         for (int dst_pt_idx = tid_dst_pt_s; dst_pt_idx < tid_dst_pt_e; dst_pt_idx += npt_blk)
27 |         {
28 |             int dst_pt_blk = (dst_pt_idx + npt_blk > tid_dst_pt_e) ? (tid_dst_pt_e - dst_pt_idx) : npt_blk;
29 |             int krnl_mat_nrow = dst_pt_blk * krnl_dim;
30 |             const DTYPE *dst_coord_ptr = dst_coord + dst_pt_idx;
31 |             DTYPE *dst_val_ptr = dst_val + dst_pt_idx * krnl_dim;
32 |             for (int src_pt_idx = 0; src_pt_idx < n_src_pt; src_pt_idx += npt_blk)
33 |             {
34 |                 int src_pt_blk = (src_pt_idx + npt_blk > n_src_pt) ? (n_src_pt - src_pt_idx) : npt_blk;
35 |                 int krnl_mat_ncol = src_pt_blk * krnl_dim;
36 |                 const DTYPE *src_coord_ptr = src_coord + src_pt_idx;
37 |                 const DTYPE *src_val_ptr = src_val + src_pt_idx * krnl_dim;
38 |                 
39 |                 krnl_eval(
40 |                     dst_coord_ptr, dst_coord_ld, dst_pt_blk,
41 |                     src_coord_ptr, src_coord_ld, src_pt_blk, 
42 |                     krnl_param, krnl_mat_buff, krnl_mat_ncol
43 |                 );
44 |                 
45 |                 CBLAS_GEMV(
46 |                     CblasRowMajor, CblasNoTrans, krnl_mat_nrow, krnl_mat_ncol, 
47 |                     1.0, krnl_mat_buff, krnl_mat_ncol, src_val_ptr, 1, 1.0, dst_val_ptr, 1
48 |                 );
49 |             }
50 |         }
51 |     }
52 |     //printf("Calculate direct n-body reference results for %d points done\n", n_dst_pt);
53 |     free(krnl_mat_buffs);
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/examples/example_H2.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "H2Pack_kernels.h"
 11 | #include "direct_nbody.h"
 12 | 
 13 | int main(int argc, char **argv)
 14 | {
 15 |     srand48(time(NULL));
 16 |     double st, et;
 17 | 
 18 |     // Point configuration, random generation
 19 |     int pt_dim  = 3;
 20 |     int n_point = 40000;
 21 |     DTYPE* coord = (DTYPE*) malloc_aligned(sizeof(DTYPE) * n_point * pt_dim, 64);
 22 |     assert(coord != NULL);
 23 | 
 24 |     DTYPE prefac = DPOW((DTYPE) n_point, 1.0 / (DTYPE) pt_dim);
 25 |     printf("Generating random coordinates in a scaled cubic box...");
 26 |     for (int i = 0; i < n_point * pt_dim; i++)
 27 |     {
 28 |         coord[i] = (DTYPE) drand48();
 29 |         coord[i] *= prefac;
 30 |     }
 31 |     printf(" done.\n");
 32 |  
 33 |     // Kernel configuration
 34 |     int krnl_dim = 1;
 35 |     DTYPE *krnl_param = NULL;  // Coulomb kernel has no parameter
 36 |     kernel_eval_fptr krnl_eval = Coulomb_3D_eval_intrin_t;
 37 |     kernel_bimv_fptr krnl_bimv = Coulomb_3D_krnl_bimv_intrin_t;
 38 |     int krnl_bimv_flops = Coulomb_3D_krnl_bimv_flop;
 39 | 
 40 |     // H2 construction configuration
 41 |     int krnl_mat_size = krnl_dim * n_point;
 42 |     DTYPE rel_tol = 1e-6;
 43 |     const int BD_JIT = 1;
 44 | 
 45 |     // Initialization of H2Pack
 46 |     H2Pack_p h2pack;
 47 |     H2P_init(&h2pack, pt_dim, krnl_dim, QR_REL_NRM, &rel_tol);
 48 |     
 49 |     // Hierarchical partitioning
 50 |     int max_leaf_points = 0;    // use the default in h2pack for maximum number of points in the leaf node
 51 |     DTYPE max_leaf_size = 0.0;  // use the default in h2pack for maximum edge length of leaf box
 52 |     char *pp_fname = "./PP_Coulomb3D_1e-6.dat"; //  file name for storage and reuse of proxy points, can be set as NULL.
 53 |     H2P_calc_enclosing_box(pt_dim, n_point, coord, pp_fname, &h2pack->root_enbox);
 54 |     H2P_partition_points(h2pack, n_point, coord, max_leaf_points, max_leaf_size);
 55 |     
 56 |     // Select proxy points
 57 |     H2P_dense_mat_p *pp;
 58 |     //  method 1: numerical proxy point selection, works for any kernel but require relatively expensive precomputation
 59 |     //            the computed proxy points will be stored in `pp_fname' (if not NULL) for reuse if needed. 
 60 |     if (1)
 61 |     {
 62 |         st = get_wtime_sec();
 63 |         H2P_generate_proxy_point_ID_file(h2pack, krnl_param, krnl_eval, pp_fname, &pp);
 64 |         et = get_wtime_sec();
 65 |         printf("H2Pack generate numerical proxy points used %.3lf (s)...\n", et - st);
 66 |     }
 67 |     else
 68 |     {
 69 |     //  method 2: proxy surface points, works for kernel from potential theory, has negligible cost.
 70 |         // The edge length of the root box enclosing all the points
 71 |         DTYPE max_L = h2pack->root_enbox[pt_dim];
 72 |         // A heuristic but effective selection of the number of proxy surface points given the expected relative tolerance
 73 |         int num_pp, num_pp_dim = ceil(-log10(rel_tol));
 74 |         if (num_pp_dim < 4 ) num_pp_dim = 4;
 75 |         if (num_pp_dim > 10) num_pp_dim = 10;
 76 |         if (pt_dim == 2) num_pp = 2 * pt_dim * num_pp_dim;
 77 |         if (pt_dim == 3) num_pp = 2 * pt_dim * num_pp_dim * num_pp_dim;
 78 |         st = get_wtime_sec();
 79 |         H2P_generate_proxy_point_surface(
 80 |             pt_dim, pt_dim, num_pp, h2pack->max_level,
 81 |             h2pack->min_adm_level, max_L, &pp
 82 |         );
 83 |         et = get_wtime_sec();
 84 |         printf("H2Pack generate proxy surface points used %.3lf (s)...\n", et - st);
 85 |     }
 86 |     
 87 |     // Construct H2 matrix representation
 88 |     H2P_build(h2pack, pp, BD_JIT, krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops);
 89 |     
 90 |     // Check multiplication error at 20000 entries
 91 |     int n_check_pt = 20000, check_pt_s;
 92 |     if (n_check_pt >= n_point)
 93 |     {
 94 |         n_check_pt = n_point;
 95 |         check_pt_s = 0;
 96 |     } else {
 97 |         srand(time(NULL));
 98 |         check_pt_s = rand() % (n_point - n_check_pt);
 99 |     }
100 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
101 |     
102 |     DTYPE *x, *y0, *y1;
103 |     x  = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size);
104 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_dim * n_check_pt);
105 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size);
106 |     assert(x != NULL && y0 != NULL && y1 != NULL);
107 |     for (int i = 0; i < krnl_mat_size; i++) 
108 |         x[i] = (DTYPE) drand48() - 0.5;
109 | 
110 |  
111 |     // Get reference results
112 |     st = get_wtime_sec();
113 |     direct_nbody(
114 |         krnl_param, krnl_eval, pt_dim, krnl_dim, 
115 |         coord,              n_point, n_point,    x, 
116 |         coord + check_pt_s, n_point, n_check_pt, y0
117 |     );
118 |     et = get_wtime_sec();
119 |     printf("Direct n-body for %d points takes %.3lf (s)\n", n_check_pt, et - st);
120 |     
121 |     // H2 matrix-vector multiplication
122 |     st = get_wtime_sec();
123 |     H2P_matvec(h2pack, x, y1);
124 |     et = get_wtime_sec();
125 |     printf("Full H2 matvec takes %.3lf (s)\n", et - st);
126 |     
127 |     // Print out details of the H2 matrix
128 |     H2P_print_statistic(h2pack);
129 |     
130 |     // Verify H2 matvec results
131 |     DTYPE y0_norm = 0.0, err_norm = 0.0;
132 |     for (int i = 0; i < krnl_dim * n_check_pt; i++)
133 |     {
134 |         DTYPE diff = y1[krnl_dim * check_pt_s + i] - y0[i];
135 |         y0_norm  += y0[i] * y0[i];
136 |         err_norm += diff * diff;
137 |     }
138 |     y0_norm  = DSQRT(y0_norm);
139 |     err_norm = DSQRT(err_norm);
140 |     printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
141 |     printf("The specified relative error threshold is %e\n", rel_tol);
142 |     
143 |     // Store H2 matrix data to file
144 |     int store_to_file = 0;
145 |     printf("Store H2 matrix data to file? 1-yes, 0-no : ");
146 |     scanf("%d", &store_to_file);
147 |     if (store_to_file)
148 |     {
149 |         const char *meta_json_fname = "Coulomb_3D_1e-6_meta.json";
150 |         const char *aux_json_fname  = "Coulomb_3D_1e-6_aux.json";
151 |         const char *binary_fname    = "Coulomb_3D_1e-6.bin";
152 |         printf("Storing H2 matrix data to files %s, %s, and %s...", meta_json_fname, aux_json_fname, binary_fname);
153 |         fflush(stdout);
154 |         H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname);
155 |         printf("done\n");
156 |     }
157 | 
158 |     free(x);
159 |     free(y0);
160 |     free(y1);
161 |     free_aligned(coord);
162 |     H2P_destroy(&h2pack);
163 | }
164 | 


--------------------------------------------------------------------------------
/examples/example_H2_tensor.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "H2Pack_kernels.h"
 11 | #include "direct_nbody.h"
 12 | 
 13 | int main(int argc, char **argv)
 14 | {
 15 |     srand48(time(NULL));
 16 |     double st, et;
 17 | 
 18 |     // Point configuration, random generation
 19 |     int pt_dim  = 3;
 20 |     int n_point = 20000;
 21 |     DTYPE* coord = (DTYPE*) malloc_aligned(sizeof(DTYPE) * n_point * pt_dim, 64);
 22 |     assert(coord != NULL);
 23 | 
 24 |     DTYPE prefac = DPOW((DTYPE) n_point, 1.0 / (DTYPE) pt_dim);
 25 |     printf("Generating random coordinates in a scaled cubic box...");
 26 |     for (int i = 0; i < n_point * pt_dim; i++)
 27 |     {
 28 |         coord[i] = (DTYPE) drand48();
 29 |         coord[i] *= prefac;
 30 |     }
 31 |     printf(" done.\n");
 32 |  
 33 |     // Kernel configuration
 34 |     int krnl_dim = 3;
 35 |     DTYPE krnl_param[2] = {1.0, 0.1};  // Stokes kernel with parameter, eta, a
 36 |     kernel_eval_fptr krnl_eval = Stokes_eval_std;
 37 |     kernel_bimv_fptr krnl_bimv = Stokes_krnl_bimv_intrin_t;
 38 |     int krnl_bimv_flops = Stokes_krnl_bimv_flop;
 39 | 
 40 |     // H2 construction configuration
 41 |     int krnl_mat_size = krnl_dim * n_point;
 42 |     DTYPE rel_tol = 1e-6;
 43 |     const int BD_JIT = 1;
 44 | 
 45 |     // Initialization of H2Pack
 46 |     H2Pack_p h2pack;
 47 |     H2P_init(&h2pack, pt_dim, krnl_dim, QR_REL_NRM, &rel_tol);
 48 |     
 49 |     // Hierarchical partitioning
 50 |     int max_leaf_points = 0;    // use the default in h2pack for maximum number of points in the leaf node
 51 |     DTYPE max_leaf_size = 0.0;  // use the default in h2pack for maximum edge length of leaf box
 52 |     char *pp_fname = "./PP_Stokes3D_1e-6.dat"; //  file name for storage and reuse of proxy points, can be set as NULL.
 53 |     H2P_calc_enclosing_box(pt_dim, n_point, coord, pp_fname, &h2pack->root_enbox);
 54 |     H2P_partition_points(h2pack, n_point, coord, max_leaf_points, max_leaf_size);
 55 |     
 56 |     // Select proxy points
 57 |     H2P_dense_mat_p *pp;
 58 |     //  method 1: numerical proxy point selection, works for any kernel but require relatively expensive precomputation
 59 |     //            the computed proxy points will be stored in `pp_fname' (if not NULL) for reuse if needed. 
 60 |     if (0)
 61 |     {
 62 |         st = get_wtime_sec();
 63 |         H2P_generate_proxy_point_ID_file(h2pack, krnl_param, krnl_eval, pp_fname, &pp);
 64 |         et = get_wtime_sec();
 65 |         printf("H2Pack generate numerical proxy points used %.3lf (s)...\n", et - st);
 66 |     }
 67 |     else
 68 |     {
 69 |     //  method 2: proxy surface points, works for kernel from potential theory, has negligible cost.
 70 |         // The edge length of the root box enclosing all the points
 71 |         DTYPE max_L = h2pack->root_enbox[pt_dim];
 72 |         // A heuristic but effective selection of the number of proxy surface points given the expected relative tolerance
 73 |         int num_pp, num_pp_dim = ceil(-log10(rel_tol));
 74 |         if (num_pp_dim < 4 ) num_pp_dim = 4;
 75 |         if (num_pp_dim > 10) num_pp_dim = 10;
 76 |         if (pt_dim == 2) num_pp = 2 * pt_dim * num_pp_dim;
 77 |         if (pt_dim == 3) num_pp = 2 * pt_dim * num_pp_dim * num_pp_dim;
 78 |         st = get_wtime_sec();
 79 |         H2P_generate_proxy_point_surface(
 80 |             pt_dim, pt_dim, num_pp, h2pack->max_level,
 81 |             h2pack->min_adm_level, max_L, &pp
 82 |         );
 83 |         et = get_wtime_sec();
 84 |         printf("H2Pack generate proxy surface points used %.3lf (s)...\n", et - st);
 85 |     }
 86 |     
 87 |     // Construct H2 matrix representation
 88 |     H2P_build(h2pack, pp, BD_JIT, krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops);
 89 |     
 90 |     // Check multiplication error at 20000 entries
 91 |     int n_check_pt = 20000, check_pt_s;
 92 |     if (n_check_pt >= n_point)
 93 |     {
 94 |         n_check_pt = n_point;
 95 |         check_pt_s = 0;
 96 |     } else {
 97 |         srand(time(NULL));
 98 |         check_pt_s = rand() % (n_point - n_check_pt);
 99 |     }
100 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
101 |     
102 |     DTYPE *x, *y0, *y1;
103 |     x  = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size);
104 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_dim * n_check_pt);
105 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size);
106 |     assert(x != NULL && y0 != NULL && y1 != NULL);
107 |     for (int i = 0; i < krnl_mat_size; i++) 
108 |         x[i] = (DTYPE) drand48() - 0.5;
109 | 
110 |  
111 |     // Get reference results
112 |     st = get_wtime_sec();
113 |     direct_nbody(
114 |         krnl_param, krnl_eval, pt_dim, krnl_dim, 
115 |         coord,              n_point, n_point,    x, 
116 |         coord + check_pt_s, n_point, n_check_pt, y0
117 |     );
118 |     et = get_wtime_sec();
119 |     printf("Direct n-body for %d points takes %.3lf (s)\n", n_check_pt, et - st);
120 |     
121 |     // H2 matrix-vector multiplication
122 |     st = get_wtime_sec();
123 |     H2P_matvec(h2pack, x, y1);
124 |     et = get_wtime_sec();
125 |     printf("Full H2 matvec takes %.3lf (s)\n", et - st);
126 |     
127 |     // Print out details of the H2 matrix
128 |     H2P_print_statistic(h2pack);
129 |     
130 |     // Verify H2 matvec results
131 |     DTYPE y0_norm = 0.0, err_norm = 0.0;
132 |     for (int i = 0; i < krnl_dim * n_check_pt; i++)
133 |     {
134 |         DTYPE diff = y1[krnl_dim * check_pt_s + i] - y0[i];
135 |         y0_norm  += y0[i] * y0[i];
136 |         err_norm += diff * diff;
137 |     }
138 |     y0_norm  = DSQRT(y0_norm);
139 |     err_norm = DSQRT(err_norm);
140 |     printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
141 |     printf("The specified relative error threshold is %e\n", rel_tol);
142 | 
143 |     // Store H2 matrix data to file
144 |     int store_to_file = 0;
145 |     printf("Store H2 matrix data to file? 1-yes, 0-no : ");
146 |     scanf("%d", &store_to_file);
147 |     if (store_to_file)
148 |     {
149 |         const char *meta_json_fname = "Stokes_3D_1e-6_meta.json";
150 |         const char *aux_json_fname  = "Stokes_3D_1e-6_aux.json";
151 |         const char *binary_fname    = "Stokes_3D_1e-6.bin";
152 |         printf("Storing H2 matrix data to files %s, %s, and %s...", meta_json_fname, aux_json_fname, binary_fname);
153 |         fflush(stdout);
154 |         H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname);
155 |         printf("done\n");
156 |     }
157 |     
158 |     free(x);
159 |     free(y0);
160 |     free(y1);
161 |     free_aligned(coord);
162 |     H2P_destroy(&h2pack);
163 | }
164 | 


--------------------------------------------------------------------------------
/examples/example_read_H2_file.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "H2Pack_kernels.h"
 11 | #include "direct_nbody.h"
 12 | 
 13 | int main(int argc, char **argv)
 14 | {
 15 |     srand48(time(NULL));
 16 |     double st, et;
 17 | 
 18 |     // Kernel configuration
 19 |     int krnl_dim = 1;
 20 |     DTYPE *krnl_param = NULL;  // Coulomb kernel has no parameter
 21 |     kernel_eval_fptr krnl_eval = Coulomb_3D_eval_intrin_t;
 22 |     kernel_bimv_fptr krnl_bimv = Coulomb_3D_krnl_bimv_intrin_t;
 23 |     int krnl_bimv_flops = Coulomb_3D_krnl_bimv_flop;
 24 |     /*
 25 |     int krnl_dim = 3;
 26 |     DTYPE krnl_param[2] = {1.0, 0.1};  // Stokes kernel with parameter, eta, a
 27 |     kernel_eval_fptr krnl_eval = Stokes_eval_std;
 28 |     kernel_bimv_fptr krnl_bimv = Stokes_krnl_bimv_intrin_t;
 29 |     int krnl_bimv_flops = Stokes_krnl_bimv_flop;
 30 |     */
 31 | 
 32 |     // Read H2 matrix data from file and construct H2Pack
 33 |     const int BD_JIT = 1;
 34 |     H2Pack_p h2pack;
 35 |     const char *meta_json_fname = "Coulomb_3D_1e-6_meta.json";
 36 |     const char *aux_json_fname  = "Coulomb_3D_1e-6_aux.json";
 37 |     const char *binary_fname    = "Coulomb_3D_1e-6.bin";
 38 |     //const char *meta_json_fname = "Stokes_3D_1e-6_meta.json";
 39 |     //const char *aux_json_fname  = "Stokes_3D_1e-6_aux.json";
 40 |     //const char *binary_fname    = "Stokes_3D_1e-6.bin";
 41 |     printf("Reading H2 matrix data from files %s, %s, and %s\n", meta_json_fname, aux_json_fname, binary_fname);
 42 |     H2P_read_from_file(
 43 |         &h2pack, meta_json_fname, aux_json_fname, binary_fname, BD_JIT, 
 44 |         krnl_param, krnl_eval, krnl_bimv, krnl_bimv_flops
 45 |     );
 46 |     int pt_dim  = h2pack->pt_dim;
 47 |     int n_point = h2pack->n_point;
 48 |     int krnl_mat_size = h2pack->krnl_mat_size;
 49 |     DTYPE rel_tol = h2pack->QR_stop_tol;
 50 |     DTYPE *coord  = h2pack->coord0;  // Input (not sorted) point coordinates
 51 | 
 52 |     // Check multiplication error at 20000 entries
 53 |     int n_check_pt = 20000, check_pt_s;
 54 |     if (n_check_pt >= n_point)
 55 |     {
 56 |         n_check_pt = n_point;
 57 |         check_pt_s = 0;
 58 |     } else {
 59 |         srand(time(NULL));
 60 |         check_pt_s = rand() % (n_point - n_check_pt);
 61 |     }
 62 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
 63 |     
 64 |     DTYPE *x, *y0, *y1;
 65 |     x  = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size);
 66 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_dim * n_check_pt);
 67 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * krnl_mat_size);
 68 |     assert(x != NULL && y0 != NULL && y1 != NULL);
 69 |     for (int i = 0; i < krnl_mat_size; i++) 
 70 |         x[i] = (DTYPE) drand48() - 0.5;
 71 | 
 72 |  
 73 |     // Get reference results
 74 |     st = get_wtime_sec();
 75 |     direct_nbody(
 76 |         krnl_param, krnl_eval, pt_dim, krnl_dim, 
 77 |         coord,              n_point, n_point,    x, 
 78 |         coord + check_pt_s, n_point, n_check_pt, y0
 79 |     );
 80 |     et = get_wtime_sec();
 81 |     printf("Direct n-body for %d points takes %.3lf (s)\n", n_check_pt, et - st);
 82 |     
 83 |     // H2 matrix-vector multiplication
 84 |     st = get_wtime_sec();
 85 |     H2P_matvec(h2pack, x, y1);
 86 |     et = get_wtime_sec();
 87 |     printf("Full H2 matvec takes %.3lf (s)\n", et - st);
 88 |     
 89 |     // Print out details of the H2 matrix
 90 |     H2P_print_statistic(h2pack);
 91 |     
 92 |     // Verify H2 matvec results
 93 |     DTYPE y0_norm = 0.0, err_norm = 0.0;
 94 |     for (int i = 0; i < krnl_dim * n_check_pt; i++)
 95 |     {
 96 |         DTYPE diff = y1[krnl_dim * check_pt_s + i] - y0[i];
 97 |         y0_norm  += y0[i] * y0[i];
 98 |         err_norm += diff * diff;
 99 |     }
100 |     y0_norm  = DSQRT(y0_norm);
101 |     err_norm = DSQRT(err_norm);
102 |     printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
103 |     printf("The specified relative error threshold is %e\n", rel_tol);
104 | 
105 |     free(x);
106 |     free(y0);
107 |     free(y1);
108 |     H2P_destroy(&h2pack);
109 | }


--------------------------------------------------------------------------------
/examples/meta_txt_to_json.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import json
  4 | import sys
  5 | import struct
  6 | 
  7 | class EmptyClass(object):
  8 |     def toJSON(self):
  9 |         return json.dumps(self, default=lambda o: o.__dict__, indent=2)
 10 | 
 11 | def hex_to_double(f):
 12 |     return struct.unpack('!d', bytes.fromhex(f))[0]
 13 | 
 14 | def metadata_txt_to_json(meta_txt_fname, meta_json_fname, aux_json_fname):
 15 |     txt_file = open(meta_txt_fname, 'r')
 16 |     lines = txt_file.readlines()
 17 |     txt_file.close()
 18 | 
 19 |     meta_json = EmptyClass()   # Metadata JSON
 20 |     aux_json  = EmptyClass()   # Auxiliary JSON
 21 | 
 22 |     # 1. Metadata: H2 / HSS common part
 23 |     aux_json.dim_point                  = int(lines[0])     # C.1 dim_point
 24 |     aux_json.dim_kernel                 = int(lines[1])     # C.2 dim_kernel
 25 |     aux_json.num_point                  = int(lines[2])     # C.3 num_point
 26 |     meta_json.nrow_matrix               = int(lines[3])     # A.1 nrow_matrix
 27 |     meta_json.ncol_matrix               = int(lines[4])     # A.2 ncol_matrix
 28 |     meta_json.is_symmetric              = int(lines[5])     # A.3 is_symmetric
 29 |     meta_json.num_node_row              = int(lines[6])     # A.4 num_node_row
 30 |     meta_json.num_node_col              = int(lines[7])     # A.5 num_node_col
 31 |     meta_json.root_node_row             = int(lines[8])     # A.6 root_node_row
 32 |     meta_json.root_node_col             = int(lines[9])     # A.7 root_node_col
 33 |     meta_json.num_level_row             = int(lines[10])    # A.8 num_level_row
 34 |     meta_json.num_level_col             = int(lines[11])    # A.9 num_level_col
 35 |     aux_json.is_HSS                     = int(lines[12])    # C.4 is_HSS
 36 |     aux_json.min_adm_level              = int(lines[13])    # C.5 min_adm_level
 37 |     meta_json.num_inadmissible_blocks   = int(lines[14])    # A.14 num_inadmissible_blocks - n_leaf_node
 38 |     meta_json.num_admissible_blocks     = int(lines[15])    # A.15 num_admissible_blocks
 39 |     meta_json.has_partial_adm_blocks    = int(lines[16])    # A.16 has_partial_adm_blocks
 40 |     curr_row = 17
 41 | 
 42 |     # 2. Metadata: partitioning tree
 43 |     # A.10 nodes_row; A.11 nodes_col == NULL since H2 matrix is symmetric
 44 |     nodes_row = []
 45 |     num_leaf_node = 0
 46 |     for i in range(meta_json.num_node_row):
 47 |         raw_data = [x for x in lines[curr_row + i].split(' ') if x]
 48 |         node_i = EmptyClass()
 49 |         node_i.index        = int(raw_data[0])  # A.10.1 index
 50 |         node_i.level        = int(raw_data[1])  # A.10.2 level
 51 |         node_i.cluster_head = int(raw_data[2])  # A.10.3 cluster_head
 52 |         node_i.cluster_tail = int(raw_data[3])  # A.10.4 cluster_tail
 53 |         node_i.num_children = int(raw_data[4])  # A.10.5 num_children
 54 |         if 0 == node_i.num_children:
 55 |             num_leaf_node += 1
 56 |         # A.10.6 children
 57 |         node_i.children = []
 58 |         for j in range(node_i.num_children):
 59 |             node_i.children.append(int(raw_data[5 + j]))
 60 |         nodes_row.append(node_i)
 61 |     meta_json.nodes_row = nodes_row
 62 |     curr_row += meta_json.num_node_row
 63 |     meta_json.num_inadmissible_blocks += num_leaf_node
 64 | 
 65 |     # 3. Metadata data: U matrices
 66 |     # A.12 basis_matrices_row (A.13 ignored since H2 matrix is symmetric)
 67 |     U_mat = []
 68 |     for i in range(meta_json.num_node_row):
 69 |         raw_data = [x for x in lines[curr_row + i].split(' ') if x]
 70 |         U_i = EmptyClass()
 71 |         U_i.node    = int(raw_data[0])  # A.12.1 node
 72 |         U_i.num_row = int(raw_data[1])  # A.12.2 num_row
 73 |         U_i.num_col = int(raw_data[2])  # A.12.3 num_col
 74 |         U_mat.append(U_i)
 75 |     meta_json.basis_matrices_row = U_mat
 76 |     curr_row += meta_json.num_node_row
 77 | 
 78 |     # 4. Metadata data: B matrices
 79 |     B_mat = []
 80 |     for i in range(meta_json.num_admissible_blocks):
 81 |         raw_data = [x for x in lines[curr_row + i].split(' ') if x]
 82 |         B_i = EmptyClass()
 83 |         B_i.node_row    = int(raw_data[0])  # A.17.1 node_row
 84 |         B_i.node_col    = int(raw_data[1])  # A.17.2 node_col
 85 |         B_i.num_row     = int(raw_data[2])  # A.17.3 num_row
 86 |         B_i.num_col     = int(raw_data[3])  # A.17.4 num_col
 87 |         B_i.is_part_adm = int(raw_data[4])  # A.17.5 is_part_adm
 88 |         B_mat.append(B_i)
 89 |     meta_json.B_matrices = B_mat
 90 |     curr_row += meta_json.num_admissible_blocks
 91 | 
 92 |     # 5. Metadata data: D matrices
 93 |     D_mat = []
 94 |     for i in range(meta_json.num_inadmissible_blocks):
 95 |         raw_data = [x for x in lines[curr_row + i].split(' ') if x]
 96 |         D_i = EmptyClass()
 97 |         D_i.node_row = int(raw_data[0]) # A.18.1 node_row
 98 |         D_i.node_col = int(raw_data[1]) # A.18.2 node_col
 99 |         D_i.num_row  = int(raw_data[2]) # A.18.3 num_row
100 |         D_i.num_col  = int(raw_data[3]) # A.18.4 num_col
101 |         D_mat.append(D_i)
102 |     meta_json.D_matrices = D_mat
103 |     curr_row += meta_json.num_inadmissible_blocks
104 | 
105 |     # 6. Other necessary information for H2Pack
106 |     aux_json.max_leaf_points     = int(lines[curr_row])         # C.6 max_leaf_points
107 |     aux_json.QR_stop_tol         = float(lines[curr_row + 1])   # C.7 QR_stop_tol
108 |     aux_json.has_skeleton_points = int(lines[curr_row + 2])     # C.8 has_skeleton_points
109 |     curr_row += 3
110 |     # C.9 point_coordinate
111 |     # Cast it from uint64_t back to double
112 |     coord = []
113 |     for i in range(aux_json.num_point):
114 |         raw_data = [x for x in lines[curr_row + i].split(' ') if x]
115 |         for j in range(aux_json.dim_point):
116 |             coord.append(hex_to_double(raw_data[j]))
117 |     aux_json.point_coordinate = coord
118 |     curr_row += aux_json.num_point
119 |     # C.10 permutation_array
120 |     perm = []
121 |     for i in range(aux_json.num_point):
122 |         perm.append(int(lines[curr_row + i]))
123 |     aux_json.permutation_array = perm
124 |     curr_row += aux_json.num_point
125 |     # C.11 skeleton_point
126 |     node_skel = []
127 |     for i in range(meta_json.num_node_row):
128 |         raw_data = [x for x in lines[curr_row + i].split(' ') if x]
129 |         skel_i = EmptyClass()
130 |         skel_i.node = int(raw_data[0])
131 |         skel_i.num_skeleton_point = int(raw_data[1])
132 |         pt_idx = []
133 |         for j in range(skel_i.num_skeleton_point):
134 |             pt_idx.append(int(raw_data[2 + j]))
135 |         skel_i.skeleton_point_indices = pt_idx
136 |         node_skel.append(skel_i)
137 |     aux_json.skeleton_points = node_skel
138 | 
139 |     json_file0 = open(meta_json_fname, 'w')
140 |     json_file0.write(meta_json.toJSON())
141 |     json_file0.close()
142 | 
143 |     json_file1 = open(aux_json_fname, 'w')
144 |     json_file1.write(aux_json.toJSON())
145 |     json_file1.close()
146 | 
# Command-line entry point: convert one metadata text file into the metadata
# JSON file and the auxiliary JSON file named on the command line.
if __name__=='__main__':
    if len(sys.argv) < 4:
        print('Usage: %s <metadata txt file> <metadata json file> <auxiliary json file>'%sys.argv[0])
        # sys.exit() instead of the bare exit() helper, which is only
        # injected by the site module and is not guaranteed to exist
        sys.exit(1)
    meta_txt_fname, meta_json_fname, aux_json_fname = sys.argv[1:4]
    metadata_txt_to_json(meta_txt_fname, meta_json_fname, aux_json_fname)


--------------------------------------------------------------------------------
/extra/GCC-OpenBLAS.make:
--------------------------------------------------------------------------------
 1 | CC           = gcc
 2 | USE_MKL      = 0
 3 | USE_OPENBLAS = 1
 4 | 
 5 | include common.make
 6 | 
 7 | # GCC 10 need to manually specify using SVE, -march=native is not enough
 8 | # On A64FX SVE vector bits = 512, on other SVE supported processors this value might be different
 9 | USE_AARCH64_SVE = 0
10 | SVE_VECTOR_BITS = 512
11 | ifeq ($(strip $(USE_AARCH64_SVE)), 1)
12 | CFLAGS := $(subst -march=native, -march=armv8.2-a+sve -msve-vector-bits=$(SVE_VECTOR_BITS), $(CFLAGS))
13 | endif


--------------------------------------------------------------------------------
/extra/ICC-MKL.make:
--------------------------------------------------------------------------------
# Build configuration: Intel C compiler (icc) with MKL enabled.
# These switches are read by common.make, so they are set before the include.
CC = icc
USE_OPENBLAS = 0
USE_MKL = 1

include common.make


--------------------------------------------------------------------------------
/extra/common.make:
--------------------------------------------------------------------------------
 1 | H2PACK_INSTALL_DIR = ..
 2 | 
 3 | DEFS    = 
 4 | INCS    = -I$(H2PACK_INSTALL_DIR)/include
 5 | CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
 6 | LDFLAGS = -g -O3 -fopenmp
 7 | LIBS    = $(H2PACK_INSTALL_DIR)/lib/libH2Pack.a
 8 | 
 9 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "icc"), 1)
10 | CFLAGS  += -fopenmp -xHost
11 | endif
12 | 
13 | ifeq ($(shell $(CC) --version 2>&1 | grep -c "gcc"), 1)
14 | CFLAGS  += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
15 | LIBS    += -lgfortran -lm
16 | endif
17 | 
18 | ifeq ($(strip $(USE_MKL)), 1)
19 | DEFS    += -DUSE_MKL
20 | CFLAGS  += -mkl
21 | LDFLAGS += -mkl
22 | endif
23 | 
24 | ifeq ($(strip $(USE_OPENBLAS)), 1)
25 | OPENBLAS_INSTALL_DIR = ../../OpenBLAS-git/install
26 | DEFS    += -DUSE_OPENBLAS
27 | INCS    += -I$(OPENBLAS_INSTALL_DIR)/include
28 | LDFLAGS += -L$(OPENBLAS_INSTALL_DIR)/lib
29 | LIBS    += -lopenblas
30 | endif
31 | 
32 | C_SRCS 	= $(wildcard *.c)
33 | C_OBJS  = $(C_SRCS:.c=.c.o)
34 | EXES    = $(C_SRCS:.c=.exe)
35 | 
36 | # Delete the default old-fashion double-suffix rules
37 | .SUFFIXES:
38 | 
39 | .SECONDARY: $(C_OBJS)
40 | 
41 | all: $(EXES)
42 | 
43 | %.c.o: %.c
44 | 	$(CC) $(CFLAGS) -c $^ -o $@
45 | 
46 | %.exe: %.c.o $(H2PACK_INSTALL_DIR)/lib/libH2Pack.a
47 | 	$(CC) $(LDFLAGS) -o $@ $^ $(LIBS)
48 | 
49 | clean:
50 | 	rm -f $(EXES) $(C_OBJS)
51 | 


--------------------------------------------------------------------------------
/extra/debug.h:
--------------------------------------------------------------------------------
 1 | 
 2 | #include "H2Pack.h"
 3 | #include "H2Pack_utils.h"
 4 | #include "utils.h"
 5 | 
 6 | void dump_HSS(H2Pack_p h2pack)
 7 | {
 8 |     // Assumption: MATLAB code is using the same point set
 9 |     // and has the same r_adm_pairs
10 |     FILE *ouf0 = fopen("add_C_HSS_mat_metadata.m", "w");
11 |     FILE *ouf1 = fopen("C_HSS_mat.bin", "wb");
12 | 
13 |     H2P_dense_mat_p tmpM;
14 |     H2P_dense_mat_init(&tmpM, 1024, 1024);
15 | 
16 |     fprintf(ouf0, "C_U_sizes = [\n");
17 |     for (int i = 0; i < h2pack->n_node; i++)
18 |     {
19 |         fprintf(ouf0, "%d %d;\n", h2pack->U[i]->nrow, h2pack->U[i]->ncol);
20 |         fwrite(h2pack->U[i]->data, sizeof(DTYPE), h2pack->U[i]->nrow * h2pack->U[i]->ncol, ouf1);
21 |     }
22 |     fprintf(ouf0, "];\n");
23 | 
24 |     fprintf(ouf0, "C_B_sizes = [\n");
25 |     for (int i = 0; i < h2pack->HSS_n_r_adm_pair; i++)
26 |     {
27 |         int node0 = h2pack->HSS_r_adm_pairs[2 * i];
28 |         int node1 = h2pack->HSS_r_adm_pairs[2 * i + 1];
29 |         H2P_get_Bij_block(h2pack, node0, node1, tmpM);
30 |         fprintf(ouf0, "%d %d;\n", h2pack->B_nrow[i], h2pack->B_ncol[i]);
31 |         fwrite(tmpM->data, sizeof(DTYPE), tmpM->nrow * tmpM->ncol, ouf1);
32 |     }
33 |     fprintf(ouf0, "];\n");
34 | 
35 |     fprintf(ouf0, "C_D_sizes = [\n");
36 |     for (int i = 0; i < h2pack->n_leaf_node; i++)
37 |     {
38 |         int node = h2pack->height_nodes[i];  // i-th leaf node
39 |         fprintf(ouf0, "%d %d;\n", h2pack->D_nrow[i], h2pack->D_ncol[i]);
40 |         H2P_get_Dij_block(h2pack, node, node, tmpM);
41 |         fwrite(tmpM->data, sizeof(DTYPE), tmpM->nrow * tmpM->ncol, ouf1);
42 |     }
43 |     fprintf(ouf0, "];\n");
44 | 
45 |     H2P_dense_mat_destroy(&tmpM);
46 | 
47 |     fclose(ouf0);
48 |     fclose(ouf1);
49 | }
50 | 


--------------------------------------------------------------------------------
/extra/direct_nbody.h:
--------------------------------------------------------------------------------
 1 | 
 2 | void direct_nbody(
 3 |     const void *krnl_param, kernel_eval_fptr krnl_eval, const int pt_dim, const int krnl_dim, 
 4 |     const DTYPE *src_coord, const int src_coord_ld, const int n_src_pt, const DTYPE *src_val,
 5 |     const DTYPE *dst_coord, const int dst_coord_ld, const int n_dst_pt, DTYPE *dst_val
 6 | )
 7 | {
 8 |     const int npt_blk  = 256;
 9 |     const int blk_size = npt_blk * krnl_dim;
10 |     const int n_thread = omp_get_max_threads();
11 |     
12 |     memset(dst_val, 0, sizeof(DTYPE) * n_dst_pt * krnl_dim);
13 |     
14 |     DTYPE *krnl_mat_buffs = (DTYPE*) malloc(sizeof(DTYPE) * n_thread * blk_size * blk_size);
15 |     assert(krnl_mat_buffs != NULL);
16 |     
17 |     #pragma omp parallel
18 |     {
19 |         int tid = omp_get_thread_num();
20 |         DTYPE *krnl_mat_buff = krnl_mat_buffs + tid * blk_size * blk_size;
21 |         
22 |         int tid_dst_pt_s, tid_dst_pt_n, tid_dst_pt_e;
23 |         calc_block_spos_len(n_dst_pt, n_thread, tid, &tid_dst_pt_s, &tid_dst_pt_n);
24 |         tid_dst_pt_e = tid_dst_pt_s + tid_dst_pt_n;
25 |         
26 |         for (int dst_pt_idx = tid_dst_pt_s; dst_pt_idx < tid_dst_pt_e; dst_pt_idx += npt_blk)
27 |         {
28 |             int dst_pt_blk = (dst_pt_idx + npt_blk > tid_dst_pt_e) ? (tid_dst_pt_e - dst_pt_idx) : npt_blk;
29 |             int krnl_mat_nrow = dst_pt_blk * krnl_dim;
30 |             const DTYPE *dst_coord_ptr = dst_coord + dst_pt_idx;
31 |             DTYPE *dst_val_ptr = dst_val + dst_pt_idx * krnl_dim;
32 |             for (int src_pt_idx = 0; src_pt_idx < n_src_pt; src_pt_idx += npt_blk)
33 |             {
34 |                 int src_pt_blk = (src_pt_idx + npt_blk > n_src_pt) ? (n_src_pt - src_pt_idx) : npt_blk;
35 |                 int krnl_mat_ncol = src_pt_blk * krnl_dim;
36 |                 const DTYPE *src_coord_ptr = src_coord + src_pt_idx;
37 |                 const DTYPE *src_val_ptr = src_val + src_pt_idx * krnl_dim;
38 |                 
39 |                 krnl_eval(
40 |                     dst_coord_ptr, dst_coord_ld, dst_pt_blk,
41 |                     src_coord_ptr, src_coord_ld, src_pt_blk, 
42 |                     krnl_param, krnl_mat_buff, krnl_mat_ncol
43 |                 );
44 |                 
45 |                 CBLAS_GEMV(
46 |                     CblasRowMajor, CblasNoTrans, krnl_mat_nrow, krnl_mat_ncol, 
47 |                     1.0, krnl_mat_buff, krnl_mat_ncol, src_val_ptr, 1, 1.0, dst_val_ptr, 1
48 |                 );
49 |             }
50 |         }
51 |     }
52 |     //printf("Calculate direct n-body reference results for %d points done\n", n_dst_pt);
53 |     free(krnl_mat_buffs);
54 | }
55 | 
56 | 


--------------------------------------------------------------------------------
/extra/parse_tensor_params.h:
--------------------------------------------------------------------------------
  1 | struct H2P_test_params
  2 | {
  3 |     int   pt_dim;
  4 |     int   xpt_dim;
  5 |     int   krnl_dim;
  6 |     int   n_point;
  7 |     int   krnl_mat_size;
  8 |     int   BD_JIT;
  9 |     int   kernel_id;
 10 |     int   krnl_bimv_flops;
 11 |     void  *krnl_param;
 12 |     DTYPE rel_tol;
 13 |     DTYPE *coord;
 14 |     kernel_eval_fptr krnl_eval;
 15 |     kernel_bimv_fptr krnl_bimv;
 16 | };
 17 | struct H2P_test_params test_params;
 18 | 
 19 | DTYPE Stokes_krnl_param[2] = {1.0, 0.1};
 20 | DTYPE RPY_krnl_param[1]    = {1.0};
 21 | 
 22 | static double pseudo_randn()
 23 | {
 24 |     double res = 0.0;
 25 |     for (int i = 0; i < 12; i++) res += drand48();
 26 |     return (res - 6.0) / 12.0;
 27 | }
 28 | 
 29 | void parse_tensor_params(int argc, char **argv)
 30 | {
 31 |     test_params.pt_dim   = 3;
 32 |     test_params.xpt_dim  = 3;
 33 |     test_params.krnl_dim = 3;
 34 |     
 35 |     if (argc < 2)
 36 |     {
 37 |         printf("Number of points   = ");
 38 |         scanf("%d", &test_params.n_point);
 39 |     } else {
 40 |         test_params.n_point = atoi(argv[1]);
 41 |         printf("Number of points   = %d\n", test_params.n_point);
 42 |     }
 43 |     test_params.krnl_mat_size = test_params.n_point * test_params.krnl_dim;
 44 |     
 45 |     if (argc < 3)
 46 |     {
 47 |         printf("QR relative tol    = ");
 48 |         scanf("%lf", &test_params.rel_tol);
 49 |     } else {
 50 |         test_params.rel_tol = atof(argv[2]);
 51 |         printf("QR relative tol    = %e\n", test_params.rel_tol);
 52 |     }
 53 |     
 54 |     if (argc < 4)
 55 |     {
 56 |         printf("Just-In-Time B & D = ");
 57 |         scanf("%d", &test_params.BD_JIT);
 58 |     } else {
 59 |         test_params.BD_JIT = atoi(argv[3]);
 60 |         printf("Just-In-Time B & D = %d\n", test_params.BD_JIT);
 61 |     }
 62 | 
 63 |     if (argc < 5)
 64 |     {
 65 |         printf("Kernel function ID = ");
 66 |         scanf("%d", &test_params.kernel_id);
 67 |     } else {
 68 |         test_params.kernel_id = atoi(argv[4]);
 69 |         printf("Kernel function ID = %d\n", test_params.kernel_id);
 70 |     }
 71 |     switch (test_params.kernel_id)
 72 |     {
 73 |         case 0: 
 74 |         {
 75 |             printf("Using 3D Stokes kernel, eta = %.2lf, a = %.2lf\n", Stokes_krnl_param[0], Stokes_krnl_param[1]); 
 76 |             break;
 77 |         }
 78 |         case 1: 
 79 |         {
 80 |             printf("Using 3D RPY kernel, eta = %.2lf\n", RPY_krnl_param[0]);
 81 |             break;
 82 |         }
 83 |     }
 84 |     
 85 |     if (test_params.kernel_id == 1) test_params.xpt_dim = 4;
 86 |     test_params.coord = (DTYPE*) malloc_aligned(sizeof(DTYPE) * test_params.n_point * test_params.xpt_dim, 64);
 87 |     assert(test_params.coord != NULL);
 88 |     
 89 |     // Note: coordinates need to be stored in column-major style, i.e. test_params.coord 
 90 |     // is row-major and each column stores the coordinate of a point. 
 91 |     int need_gen = 1;
 92 |     if (argc >= 6)
 93 |     {
 94 |         DTYPE *tmp = (DTYPE*) malloc(sizeof(DTYPE) * test_params.n_point * test_params.xpt_dim);
 95 |         if (strstr(argv[5], ".csv") != NULL)
 96 |         {
 97 |             printf("Reading coordinates from CSV file...");
 98 |             FILE *inf = fopen(argv[5], "r");
 99 |             for (int i = 0; i < test_params.n_point; i++)
100 |             {
101 |                 for (int j = 0; j < test_params.xpt_dim-2; j++) 
102 |                     fscanf(inf, "%lf,", &tmp[i * test_params.xpt_dim + j]);
103 |                 fscanf(inf, "%lf\n", &tmp[i * test_params.xpt_dim + test_params.xpt_dim-2]);
104 |             }
105 |             fclose(inf);
106 |             printf(" done.\n");
107 |             need_gen = 0;
108 |         }
109 |         if (strstr(argv[5], ".bin") != NULL)
110 |         {
111 |             printf("Reading coordinates from binary file...");
112 |             FILE *inf = fopen(argv[5], "rb");
113 |             fread(tmp, sizeof(DTYPE), test_params.n_point * test_params.xpt_dim, inf);
114 |             fclose(inf);
115 |             printf(" done.\n");
116 |             need_gen = 0;
117 |         }
118 |         if (need_gen == 0)
119 |         {
120 |             for (int i = 0; i < test_params.xpt_dim; i++)
121 |                 for (int j = 0; j < test_params.n_point; j++)
122 |                     test_params.coord[i * test_params.n_point + j] = tmp[j * test_params.xpt_dim + i];
123 |         }
124 |         free(tmp);
125 |     }
126 |     if (need_gen == 1)
127 |     {
128 |         DTYPE vol_frac = 0.1;
129 |         DTYPE base = 4.0 / 3.0 * M_PI / vol_frac * (DTYPE) test_params.n_point;
130 |         DTYPE expn = 1.0 / (DTYPE) test_params.pt_dim;
131 |         DTYPE prefac = DPOW(base, expn);
132 |         printf("Binary/CSV coordinate file not provided. Generating random coordinates in unit box...");
133 |         if (test_params.kernel_id == 1)
134 |         {
135 |             DTYPE *x = test_params.coord;
136 |             DTYPE *y = test_params.coord + test_params.n_point;
137 |             DTYPE *z = test_params.coord + test_params.n_point * 2;
138 |             DTYPE *a = test_params.coord + test_params.n_point * 3;
139 |             DTYPE sum_a3 = 0.0;
140 |             for (int i = 0; i < test_params.n_point; i++)
141 |             {
142 |                 a[i] = 0.5 + 5.0 * (DTYPE) drand48();
143 |                 sum_a3 += a[i] * a[i] * a[i];
144 |             }
145 |             base = 4.0 / 3.0 * M_PI * sum_a3 / vol_frac;
146 |             prefac = DPOW(base, expn);
147 |             for (int i = 0; i < test_params.n_point; i++)
148 |             {
149 |                 x[i] = (DTYPE) drand48() * prefac;
150 |                 y[i] = (DTYPE) drand48() * prefac;
151 |                 z[i] = (DTYPE) drand48() * prefac;
152 |             }
153 |         } else {
154 |             for (int i = 0; i < test_params.n_point * test_params.pt_dim; i++)
155 |             {
156 |                 //test_params.coord[i] = (DTYPE) pseudo_randn();
157 |                 test_params.coord[i] = (DTYPE) drand48();
158 |                 test_params.coord[i] *= prefac;
159 |             }
160 |         }
161 |         printf(" done.\n");
162 |     }
163 |     
164 |     switch (test_params.kernel_id)
165 |     {
166 |         case 0: 
167 |         { 
168 |             test_params.krnl_eval       = Stokes_eval_std;
169 |             test_params.krnl_bimv       = Stokes_krnl_bimv_intrin_t;
170 |             test_params.krnl_bimv_flops = Stokes_krnl_bimv_flop;
171 |             test_params.krnl_param      = (void*) &Stokes_krnl_param[0];
172 |             break;
173 |         }
174 |         case 1: 
175 |         {
176 |             test_params.krnl_eval       = RPY_eval_std;
177 |             test_params.krnl_bimv       = RPY_krnl_bimv_intrin_t;
178 |             test_params.krnl_bimv_flops = RPY_krnl_bimv_flop;
179 |             test_params.krnl_param      = (void*) &RPY_krnl_param[0];
180 |             break;
181 |         }
182 |     }
183 | }
184 | 
185 | 


--------------------------------------------------------------------------------
/extra/rand_3D_sphere_points.m:
--------------------------------------------------------------------------------
 1 | function X = rand_3D_sphere_points(n, density)
 2 | % Input parameters:
 3 | %   n       : Number of points
 4 | %   density : Point density on unit surface area, default is 100,
 5 | %             < 0 will generate points on a unit sphere
 6 | % Output parameter:
 7 | %   X : Size n * 3, each row is a point coordinate
 8 | if (nargin < 2), density = 100; end
 9 | X = rand(n, 3) - 0.5;
10 | X = normr(X);
11 | if (density > 0)
12 |     r = sqrt(n / (4 * pi * density));
13 |     X = X .* r;
14 | end
15 | end


--------------------------------------------------------------------------------
/extra/src-obsolete/H2P_generate_proxy_point_ID.c:
--------------------------------------------------------------------------------
 1 | 
 2 | // Generate proxy points for constructing H2 projection and skeleton matrices
 3 | // using ID compress for any kernel function. 
 4 | // This function is isolated because if the enclosing box for all points are fixed,
 5 | // we only need to generate proxy points once and use them repeatedly.
 6 | // Input parameters:
 7 | //   pt_dim     : Dimension of point coordinate
 8 | //   krnl_dim   : Dimension of kernel's return
 9 | //   reltol     : Proxy point selection relative error tolerance
10 | //   max_level  : Maximum level (included) of a H2 tree, (root level == 0)
11 | //   min_level  : Minimum level that needs proxy points
12 | //   max_L      : The size of the root node's enclosing box
13 | //   krnl_eval  : Pointer to kernel matrix evaluation function
14 | //   krnl_param : Pointer to kernel function parameter array
15 | // Output parameter:
16 | //   pp_  : Array of proxy points for each level
17 | void H2P_generate_proxy_point_ID(
18 |     const int pt_dim, const int krnl_dim, const DTYPE reltol, const int max_level, const int min_level,
19 |     DTYPE max_L, const void *krnl_param, kernel_eval_fptr krnl_eval, H2P_dense_mat_p **pp_
20 | )
21 | {
22 |     // 1. Initialize proxy point arrays and parameters
23 |     int n_level = max_level + 1;
24 |     H2P_dense_mat_p *pp = (H2P_dense_mat_p*) malloc(sizeof(H2P_dense_mat_p) * n_level);
25 |     ASSERT_PRINTF(pp != NULL, "Failed to allocate %d arrays for storing proxy points", n_level);
26 |     for (int i = 0; i <= max_level; i++) 
27 |     {
28 |         H2P_dense_mat_init(&pp[i], pt_dim, 0);
29 |         pp[i]->ncol = 0;
30 |     }
31 |     
32 |     GET_ENV_INT_VAR(gen_pp_param.alg,          "H2P_GEN_PP_ALG",       "alg",          2,    0,    2);
33 |     GET_ENV_INT_VAR(gen_pp_param.X0_size,      "H2P_GEN_PP_X0_SIZE",   "X0_size",      2000, 500,  5000);
34 |     GET_ENV_INT_VAR(gen_pp_param.Y0_lsize,     "H2P_GEN_PP_Y0_LSIZE",  "Y0_lsize",     4000, 1000, 20000);
35 |     GET_ENV_INT_VAR(gen_pp_param.L3_nlayer,    "H2P_GEN_PP_L3_NLAYER", "L3_nlayer",    8,    8,    32);
36 |     GET_ENV_INT_VAR(gen_pp_param.max_layer,    "H2P_GEN_PP_MAX_LAYER", "max_layer",    8,    4,    32);
37 |     GET_ENV_INT_VAR(gen_pp_param.print_timers, "H2P_PRINT_TIMERS",     "print_timers", 0,    0,    1);
38 | 
39 |     double timers[4];
40 |     DTYPE L3_nlayer_ = (DTYPE) gen_pp_param.L3_nlayer;
41 | 
42 |     // 2. Construct proxy points on each level
43 |     DTYPE pow_2_level = 0.5;
44 |     for (int level = 0; level < min_level; level++) pow_2_level *= 2.0;
45 |     for (int level = min_level; level <= max_level; level++)
46 |     {
47 |         // Level 0 and level 1 nodes are not admissible, do not need proxy points
48 |         if (level < 2)
49 |         {
50 |             pow_2_level *= 2.0;
51 |             WARNING_PRINTF("Level %d: no proxy points are generated\n", level);
52 |             continue;
53 |         }
54 | 
55 |         // Decide box sizes for domains X and Y
56 |         pow_2_level *= 2.0;
57 |         DTYPE L1   = max_L / pow_2_level;
58 |         DTYPE L2   = (1.0 + 2.0 * ALPHA_H2) * L1;
59 |         DTYPE L3_0 = (1.0 + L3_nlayer_ * ALPHA_H2) * L1;
60 |         DTYPE L3_1 = 2.0 * max_L - L1;
61 |         DTYPE L3   = MIN(L3_0, L3_1);
62 | 
63 |         int Y0_lsize_ = gen_pp_param.Y0_lsize;
64 |         if (gen_pp_param.alg == 0)  // Only one ring, multiple Y0_lsize_ by the number of rings
65 |         {
66 |             int n_layer = DROUND((L3 - L2) / L1);
67 |             if (n_layer > gen_pp_param.max_layer) n_layer = gen_pp_param.max_layer;
68 |             Y0_lsize_ *= n_layer;
69 |         }
70 |         
71 |         // Reset timers
72 |         timers[GEN_PP_KRNL_TIMER_IDX] = 0.0;
73 |         timers[GEN_PP_KRNL_TIMER_IDX] = 0.0;
74 |         timers[GEN_PP_ID_TIMER_IDX]   = 0.0;
75 |         timers[GEN_PP_MISC_TIMER_IDX] = 0.0;
76 | 
77 |         // Generate proxy points
78 |         H2P_generate_proxy_point_nlayer(
79 |             pt_dim, krnl_dim, reltol, 
80 |             krnl_param, krnl_eval, 
81 |             L1, L2, L3, 
82 |             gen_pp_param.alg, gen_pp_param.X0_size, Y0_lsize_, gen_pp_param.max_layer, 
83 |             pp[level], &timers[0]
84 |         );
85 |         
86 |         if (gen_pp_param.print_timers == 1)
87 |         {
88 |             INFO_PRINTF("Level %d: %d proxy points generated\n", level, pp[level]->ncol);
89 |             INFO_PRINTF(
90 |                 "    kernel, SpMM, ID, other time = %.3lf, %.3lf, %.3lf, %.3lf sec\n", 
91 |                 timers[GEN_PP_KRNL_TIMER_IDX], timers[GEN_PP_KRNL_TIMER_IDX], 
92 |                 timers[GEN_PP_ID_TIMER_IDX],   timers[GEN_PP_MISC_TIMER_IDX]
93 |             );
94 |         }
95 |     }  // End of level loop
96 |     
97 |     *pp_ = pp;
98 | }
99 | 


--------------------------------------------------------------------------------
/extra/test_H2_accuracy.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "H2Pack_kernels.h"
 11 | 
 12 | #include "parse_scalar_params.h"
 13 | #include "direct_nbody.h"
 14 | 
 15 | int main(int argc, char **argv)
 16 | {
 17 |     srand48(time(NULL));
 18 |     
 19 |     parse_scalar_params(argc, argv);
 20 |     
 21 |     double st, et;
 22 | 
 23 |     H2Pack_p h2pack;
 24 | 
 25 |     // Test parameters
 26 |     #define n_rel_tol      3
 27 |     #define krnl_param_len 1
 28 |     #define n_krnl_param   5
 29 |     DTYPE rel_tols[n_rel_tol] = {1e-3, 1e-6, 1e-9};
 30 |     DTYPE krnl_params[n_krnl_param * krnl_param_len] = {1e-2, 1e-1, 1e0, 1e1, 1e2};
 31 | 
 32 |     // Loop over rel_tol and krnl_param combinations
 33 |     for (int i_rel_tol = 0; i_rel_tol < n_rel_tol; i_rel_tol++)
 34 |     {
 35 |         test_params.rel_tol = rel_tols[i_rel_tol];
 36 |         for (int i_krnl_param = 0; i_krnl_param < n_krnl_param; i_krnl_param++)
 37 |         {
 38 |             const DTYPE *krnl_param_ = krnl_params + i_krnl_param * krnl_param_len;
 39 |             test_params.krnl_param = (void*) krnl_param_;
 40 | 
 41 |             printf("Current parameters: rel_tol = %.1e, krnl_param[] = ", test_params.rel_tol);
 42 |             for (int i = 0; i < krnl_param_len; i++) printf("%.1e ", krnl_param_[i]);
 43 |             printf("\n");
 44 | 
 45 |             H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol);
 46 | 
 47 |             H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox);
 48 |             
 49 |             int max_leaf_points = 0;
 50 |             DTYPE max_leaf_size = 0.0;    
 51 |             H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size);
 52 | 
 53 |             // Generate proxy points
 54 |             H2P_dense_mat_p *pp = NULL;
 55 |             st = get_wtime_sec();
 56 |             H2P_generate_proxy_point_ID_file(
 57 |                 h2pack, test_params.krnl_param, test_params.krnl_eval,
 58 |                 test_params.pp_fname, &pp
 59 |             );
 60 |             et = get_wtime_sec();
 61 |             printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st);
 62 |             
 63 |             // Build H2 representation
 64 |             st = get_wtime_sec();
 65 |             H2P_build(
 66 |                 h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 
 67 |                 test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops
 68 |             );
 69 |             et = get_wtime_sec();
 70 |             printf("H2Pack H2 construction used %.3lf (s)\n", et - st);
 71 | 
 72 |             // Allocate input & output vectors
 73 |             int n_check_pt = 50000, check_pt_s;
 74 |             if (n_check_pt >= test_params.n_point)
 75 |             {
 76 |                 n_check_pt = test_params.n_point;
 77 |                 check_pt_s = 0;
 78 |             } else {
 79 |                 srand(time(NULL));
 80 |                 check_pt_s = rand() % (test_params.n_point - n_check_pt);
 81 |             }
 82 |             DTYPE *x, *y0, *y1;
 83 |             x  = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 84 |             y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt);
 85 |             y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 86 |             assert(x != NULL && y0 != NULL && y1 != NULL);
 87 |             for (int i = 0; i < test_params.krnl_mat_size; i++) 
 88 |             {
 89 |                 //x[i] = (DTYPE) pseudo_randn();
 90 |                 x[i] = (DTYPE) drand48() - 0.5;
 91 |             }
 92 |             
 93 |             // Get reference results
 94 |             printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
 95 |             direct_nbody(
 96 |                 test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 
 97 |                 test_params.coord,              test_params.n_point, test_params.n_point, x, 
 98 |                 test_params.coord + check_pt_s, test_params.n_point, n_check_pt,          y0
 99 |             );
100 |             
101 |             // Check H2 matvec accuracy
102 |             H2P_matvec(h2pack, x, y1);
103 |             H2P_print_statistic(h2pack);
104 |             DTYPE y0_norm = 0.0, err_norm = 0.0;
105 |             for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++)
106 |             {
107 |                 DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i];
108 |                 y0_norm  += y0[i] * y0[i];
109 |                 err_norm += diff * diff;
110 |             }
111 |             y0_norm  = DSQRT(y0_norm);
112 |             err_norm = DSQRT(err_norm);
113 |             printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
114 | 
115 |             // Destroy H2Pack structure and I/O vectors
116 |             H2P_destroy(&h2pack);
117 |             free(h2pack);
118 |             free(x);
119 |             free(y0);
120 |             free(y1);
121 |             printf("\n\n\n");
122 |         }  // End of i_krnl_param loop
123 |     }  // End of i_rel_tol loop
124 |     
125 |     free_aligned(test_params.coord);
126 | 
127 |     return 0;
128 | }
129 | 


--------------------------------------------------------------------------------
/extra/test_H2_matmul.h:
--------------------------------------------------------------------------------
  1 | 
  2 | void calc_err_2norm_dtype(  // Compute ||x0||_2 and ||x0 - x1||_2 over the first len entries
  3 |     const int len, const DTYPE *x0, const DTYPE *x1, 
  4 |     DTYPE *x0_2norm_, DTYPE *err_2norm_
  5 | )
  6 | {
  7 |     DTYPE ref_sq_sum = 0.0, err_sq_sum = 0.0;
  8 |     for (int k = 0; k < len; k++)  // One pass accumulates both sums of squares
  9 |     {
 10 |         const DTYPE delta = x0[k] - x1[k];
 11 |         ref_sq_sum += x0[k] * x0[k];
 12 |         err_sq_sum += delta * delta;
 13 |     }
 14 |     *x0_2norm_  = DSQRT(ref_sq_sum);  // Output: 2-norm of the reference vector x0
 15 |     *err_2norm_ = DSQRT(err_sq_sum);  // Output: 2-norm of the difference x0 - x1
 16 | }
 17 | 
 18 | void test_H2_matmul(H2Pack_p h2pack, const int n_vec)  // Time H2P_matmul on n_vec vectors and check it against n_vec H2P_matvec calls
 19 | {
 20 |     double st, et;
 21 |     int n_thread = omp_get_max_threads();  // omp_get_num_threads() would return 1 here: we are not inside a parallel region
 22 |     int krnl_mat_size = h2pack->krnl_mat_size;
 23 |     int mat_size = krnl_mat_size * n_vec;
 24 |     DTYPE *x0, *x1, *y0, *y1, *y2;
 25 |     x0 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size);  // Input vectors, one vector per krnl_mat_size chunk
 26 |     x1 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size);  // Transposed copy of x0, input of the row-major matmul
 27 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size);  // Reference results from repeated H2P_matvec
 28 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size);  // Output of both H2P_matmul calls
 29 |     y2 = (DTYPE*) malloc(sizeof(DTYPE) * mat_size);  // Row-major matmul output transposed back
 30 |     ASSERT_PRINTF(
 31 |         x0 != NULL && x1 != NULL && y0 != NULL && y1 != NULL && y2 != NULL,
 32 |         "Failed to allocate 5 arrays of size %d for H2 matmul tests\n", mat_size
 33 |     );
 34 |     for (int i = 0; i < mat_size; i++) 
 35 |     {
 36 |         // Uniform random input in [-0.5, 0.5)
 37 |         x0[i] = (DTYPE) drand48() - 0.5;
 38 |         y0[i] = 0.0;
 39 |         y1[i] = 0.0;
 40 |     }
 41 | 
 42 |     // Test multiple matvec
 43 |     st = get_wtime_sec();
 44 |     for (int i = 0; i < n_vec; i++)
 45 |     {
 46 |         DTYPE *x_ivec = x0 + i * krnl_mat_size;
 47 |         DTYPE *y_ivec = y0 + i * krnl_mat_size;
 48 |         H2P_matvec(h2pack, x_ivec, y_ivec);
 49 |     }
 50 |     et = get_wtime_sec();
 51 |     printf("%3d           matvec used %.3lf sec\n", n_vec, et - st);
 52 | 
 53 |     DTYPE y0_2norm, err_2norm, relerr;
 54 |     
 55 |     // Test column-major matmul performance
 56 |     st = get_wtime_sec();
 57 |     H2P_matmul(h2pack, CblasColMajor, n_vec, x0, krnl_mat_size, y1, krnl_mat_size);
 58 |     et = get_wtime_sec();
 59 |     printf("One col-major matmul used %.3lf sec\n", et - st);
 60 | 
 61 |     // Check H2 column-major matmul results
 62 |     DTYPE cm_max_relerr = 0.0;
 63 |     DTYPE cm_avg_relerr = 0.0; 
 64 |     for (int i = 0; i < n_vec; i++)
 65 |     {
 66 |         DTYPE *y0_ivec = y0 + i * krnl_mat_size;
 67 |         DTYPE *y1_ivec = y1 + i * krnl_mat_size;
 68 |         calc_err_2norm_dtype(krnl_mat_size, y0_ivec, y1_ivec, &y0_2norm, &err_2norm);
 69 |         relerr = err_2norm / y0_2norm;
 70 |         if (relerr > cm_max_relerr) cm_max_relerr = relerr;
 71 |         cm_avg_relerr += relerr;
 72 |     }
 73 |     cm_avg_relerr /= (DTYPE) n_vec;
 74 |     
 75 |     // Test row-major matmul performance
 76 |     // Build the row-major input x1 (leading dimension n_vec) by transposing x0; only the
 77 |     // H2P_matmul call is timed, the transposes before and after it are not
 78 |     H2P_transpose_dmat(
 79 |         n_thread, n_vec, krnl_mat_size, x0, krnl_mat_size, x1, n_vec
 80 |     );
 81 | 
 82 |     st = get_wtime_sec();
 83 |     H2P_matmul(h2pack, CblasRowMajor, n_vec, x1, n_vec, y1, n_vec);
 84 |     et = get_wtime_sec();
 85 | 
 86 |     // Transpose the row-major result y1 back into y2 (leading dimension krnl_mat_size)
 87 |     // so that it can be compared vector by vector with the matvec reference y0
 88 |     H2P_transpose_dmat(
 89 |         n_thread, krnl_mat_size, n_vec, y1, n_vec, y2, krnl_mat_size
 90 |     );
 91 |     // et - st still holds the time measured around H2P_matmul above
 92 |     printf("One row-major matmul used %.3lf sec\n", et - st);
 93 | 
 94 |     // Check H2 row-major matmul results
 95 |     DTYPE rm_max_relerr = 0.0;
 96 |     DTYPE rm_avg_relerr = 0.0; 
 97 |     for (int i = 0; i < n_vec; i++)
 98 |     {
 99 |         DTYPE *y0_ivec = y0 + i * krnl_mat_size;
100 |         DTYPE *y2_ivec = y2 + i * krnl_mat_size;
101 |         calc_err_2norm_dtype(krnl_mat_size, y0_ivec, y2_ivec, &y0_2norm, &err_2norm);
102 |         relerr = err_2norm / y0_2norm;
103 |         if (relerr > rm_max_relerr) rm_max_relerr = relerr;
104 |         rm_avg_relerr += relerr;
105 |     }
106 |     rm_avg_relerr /= (DTYPE) n_vec;
107 | 
108 |     printf("%d vectors col-major matmul max/avg relerr = %e, %e\n", n_vec, cm_max_relerr, cm_avg_relerr);
109 |     printf("%d vectors row-major matmul max/avg relerr = %e, %e\n", n_vec, rm_max_relerr, rm_avg_relerr);
110 |     
111 |     free(x0);
112 |     free(x1);
113 |     free(y0);
114 |     free(y1);
115 |     free(y2);
116 | }
117 | 
118 | 


--------------------------------------------------------------------------------
/extra/test_H2_scalar.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | //#include <ittnotify.h>
 10 | 
 11 | #include "H2Pack.h"
 12 | #include "H2Pack_kernels.h"
 13 | 
 14 | #include "parse_scalar_params.h"
 15 | #include "direct_nbody.h"
 16 | 
 17 | int main(int argc, char **argv)  // Build an H2 matrix for a scalar kernel, time and verify its matvec, optionally store it to file
 18 | {
 19 |     //__itt_pause();
 20 |     srand48(time(NULL));
 21 |     
 22 |     parse_scalar_params(argc, argv);
 23 |     
 24 |     double st, et;
 25 | 
 26 |     H2Pack_p h2pack;
 27 |     
 28 |     H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol);
 29 |     
 30 |     H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox);
 31 |     // Both leaf limits are passed as 0 (presumably "use the library defaults" -- TODO confirm)
 32 |     int max_leaf_points = 0;
 33 |     DTYPE max_leaf_size = 0.0;    
 34 |     H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size);
 35 |     // Load or generate the proxy points (timed); pp is an output of the call below
 36 |     H2P_dense_mat_p *pp = NULL;
 37 |     st = get_wtime_sec();
 38 |     H2P_generate_proxy_point_ID_file(
 39 |         h2pack, test_params.krnl_param, test_params.krnl_eval,
 40 |         test_params.pp_fname, &pp
 41 |     );
 42 |     et = get_wtime_sec();
 43 |     printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st);
 44 |     // Build the H2 representation
 45 |     H2P_build(
 46 |         h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 
 47 |         test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops
 48 |     );
 49 |     // Validate on at most 50000 consecutive points, starting at a random index when n_point is larger
 50 |     int n_check_pt = 50000, check_pt_s;
 51 |     if (n_check_pt >= test_params.n_point)
 52 |     {
 53 |         n_check_pt = test_params.n_point;
 54 |         check_pt_s = 0;
 55 |     } else {
 56 |         srand(time(NULL));
 57 |         check_pt_s = rand() % (test_params.n_point - n_check_pt);
 58 |     }
 59 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
 60 |     // x: random input, y0: direct reference on the checked points only, y1: H2 matvec output
 61 |     DTYPE *x, *y0, *y1;
 62 |     x  = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 63 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt);
 64 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 65 |     assert(x != NULL && y0 != NULL && y1 != NULL);
 66 |     for (int i = 0; i < test_params.krnl_mat_size; i++) 
 67 |     {
 68 |         // Uniform random input in [-0.5, 0.5)
 69 |         x[i] = (DTYPE) drand48() - 0.5;
 70 |     }
 71 | 
 72 |     // Get reference results
 73 |     direct_nbody(
 74 |         test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 
 75 |         test_params.coord,              test_params.n_point, test_params.n_point, x, 
 76 |         test_params.coord + check_pt_s, test_params.n_point, n_check_pt,          y0
 77 |     );
 78 |     
 79 |     // Warm up, reset timers, and test the matvec performance
 80 |     H2P_matvec(h2pack, x, y1);
 81 |     H2P_reset_timers(h2pack);
 82 |     //__itt_resume();
 83 |     for (int i = 0; i < 10; i++) 
 84 |         H2P_matvec(h2pack, x, y1);
 85 |     //__itt_pause();
 86 |     
 87 |     H2P_print_statistic(h2pack);
 88 |     
 89 |     // Verify H2 matvec results
 90 |     DTYPE y0_norm = 0.0, err_norm = 0.0;
 91 |     for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++)
 92 |     {
 93 |         DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i];
 94 |         y0_norm  += y0[i] * y0[i];
 95 |         err_norm += diff * diff;
 96 |     }
 97 |     y0_norm  = DSQRT(y0_norm);
 98 |     err_norm = DSQRT(err_norm);
 99 |     printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
100 |     
101 |     // Store H2 matrix data to file
102 |     int store_to_file = 0;
103 |     printf("Store H2 matrix data to file? 1-yes, 0-no : ");
104 |     scanf("%d", &store_to_file);
105 |     if (store_to_file)
106 |     {
107 |         char meta_json_fname[1024];  // Each name is read with a width of 1023 so that
108 |         char aux_json_fname[1024];   // scanf can never write past the 1024-byte buffer
109 |         char binary_fname[1024];     // (1023 characters plus the terminating '\0')
110 |         printf("Enter meta JSON file name: ");
111 |         scanf("%1023s", meta_json_fname);
112 |         printf("Enter auxiliary JSON file name: ");
113 |         scanf("%1023s", aux_json_fname);
114 |         printf("Enter binary data file name: ");
115 |         scanf("%1023s", binary_fname);
116 |         H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname);
117 |         printf("done\n");
118 |     }
119 | 
120 |     free(x);
121 |     free(y0);
122 |     free(y1);
123 |     free_aligned(test_params.coord);
124 |     H2P_destroy(&h2pack);
125 | 
126 |     return 0;
127 | }
128 | 


--------------------------------------------------------------------------------
/extra/test_H2_scalar_samplept.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | //#include <ittnotify.h>
 10 | 
 11 | #include "H2Pack.h"
 12 | #include "H2Pack_kernels.h"
 13 | 
 14 | #include "parse_scalar_params.h"
 15 | #include "direct_nbody.h"
 16 | 
 17 | // Copy from MATLAB code
 18 | int sample_approx_rank(const DTYPE tau, const DTYPE reltol)
 19 | {
 20 |     // Loose tolerances (reltol not below 2e-4) map to a fixed small rank
 21 |     if (!(reltol < 2e-4))
 22 |     {
 23 |         if (reltol < 2e-3) return 4;
 24 |         if (reltol < 2e-2) return 3;
 25 |         if (reltol < 2e-1) return 2;
 26 |         return 1;
 27 |     }
 28 |     // Tight tolerances: rank = ceil(sqrt(max(2 * floor(log(reltol) / log(tau)) - offset, lower bound)))
 29 |     int rank_sq, rank_sq_min;
 30 |     if (reltol < 7e-9)
 31 |     {
 32 |         rank_sq     = 2.0 * DFLOOR(DLOG(reltol) / DLOG(tau));
 33 |         rank_sq_min = 90;
 34 |     } else if (reltol < 7e-7) {
 35 |         rank_sq     = 2.0 * DFLOOR(DLOG(reltol) / DLOG(tau)) - 10.0;
 36 |         rank_sq_min = 20;
 37 |     } else {
 38 |         rank_sq     = 2.0 * DFLOOR(DLOG(reltol) / DLOG(tau)) - 15.0;
 39 |         rank_sq_min = 20;
 40 |     }
 41 |     if (rank_sq < rank_sq_min) rank_sq = rank_sq_min;
 42 |     return (int) DCEIL(DSQRT(rank_sq));
 43 | }
 44 | 
 45 | int main(int argc, char **argv)  // Build an H2 matrix from selected sample points, time and verify its matvec, optionally store it
 46 | {
 47 |     //__itt_pause();
 48 |     srand48(time(NULL));
 49 |     
 50 |     printf("For this sample point example program, please enter an arbitrary proxy point file name if asked\n\n");
 51 |     parse_scalar_params(argc, argv);
 52 |     
 53 |     double st, et;
 54 | 
 55 |     H2Pack_p h2pack;
 56 |     
 57 |     H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol);
 58 |     
 59 |     H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox);
 60 |     // Both leaf limits are passed as 0 (presumably "use the library defaults" -- TODO confirm)
 61 |     int max_leaf_points = 0;
 62 |     DTYPE max_leaf_size = 0.0;    
 63 |     H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size);
 64 | 
 65 |     DTYPE tau = 0.7;  // Separation threshold
 66 |     #if 0
 67 |     int approx_rank, approx_rank0;
 68 |     approx_rank0 = sample_approx_rank(tau, test_params.rel_tol);
 69 |     if (argc >= 9) approx_rank = atoi(argv[8]);
 70 |     else 
 71 |     {
 72 |         printf("Sample approx rank (suggested %d): ", approx_rank0);
 73 |         scanf("%d", &approx_rank);
 74 |     }
 75 |     #endif
 76 |     // Select the sample points (timed); sample_pt is an output of the call below
 77 |     H2P_dense_mat_p *sample_pt = NULL;
 78 |     st = get_wtime_sec();
 79 |     H2P_select_sample_point(
 80 |         h2pack, test_params.krnl_param, test_params.krnl_eval, 
 81 |         tau, &sample_pt
 82 |     );
 83 |     et = get_wtime_sec();
 84 |     printf("H2Pack select sample points used %.3lf (s)\n", et - st);
 85 |     // Build the H2 representation from the selected sample points
 86 |     H2P_build_with_sample_point(
 87 |         h2pack, sample_pt, test_params.BD_JIT, test_params.krnl_param, 
 88 |         test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops
 89 |     );
 90 |     // Validate on at most 50000 consecutive points, starting at a random index when n_point is larger
 91 |     int n_check_pt = 50000, check_pt_s;
 92 |     if (n_check_pt >= test_params.n_point)
 93 |     {
 94 |         n_check_pt = test_params.n_point;
 95 |         check_pt_s = 0;
 96 |     } else {
 97 |         srand(time(NULL));
 98 |         check_pt_s = rand() % (test_params.n_point - n_check_pt);
 99 |     }
100 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
101 |     // x: random input, y0: direct reference on the checked points only, y1: H2 matvec output
102 |     DTYPE *x, *y0, *y1;
103 |     x  = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
104 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt);
105 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
106 |     assert(x != NULL && y0 != NULL && y1 != NULL);
107 |     for (int i = 0; i < test_params.krnl_mat_size; i++) 
108 |     {
109 |         // Uniform random input in [-0.5, 0.5)
110 |         x[i] = (DTYPE) drand48() - 0.5;
111 |     }
112 | 
113 |     // Get reference results
114 |     direct_nbody(
115 |         test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 
116 |         test_params.coord,              test_params.n_point, test_params.n_point, x, 
117 |         test_params.coord + check_pt_s, test_params.n_point, n_check_pt,          y0
118 |     );
119 |     
120 |     // Warm up, reset timers, and test the matvec performance
121 |     H2P_matvec(h2pack, x, y1);
122 |     H2P_reset_timers(h2pack);
123 |     //__itt_resume();
124 |     for (int i = 0; i < 10; i++) 
125 |         H2P_matvec(h2pack, x, y1);
126 |     //__itt_pause();
127 |     
128 |     H2P_print_statistic(h2pack);
129 |     
130 |     // Verify H2 matvec results
131 |     DTYPE y0_norm = 0.0, err_norm = 0.0;
132 |     for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++)
133 |     {
134 |         DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i];
135 |         y0_norm  += y0[i] * y0[i];
136 |         err_norm += diff * diff;
137 |     }
138 |     y0_norm  = DSQRT(y0_norm);
139 |     err_norm = DSQRT(err_norm);
140 |     printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
141 |     
142 |     // Store H2 matrix data to file
143 |     int store_to_file = 0;
144 |     printf("Store H2 matrix data to file? 1-yes, 0-no : ");
145 |     scanf("%d", &store_to_file);
146 |     if (store_to_file)
147 |     {
148 |         char meta_json_fname[1024];  // Each name is read with a width of 1023 so that
149 |         char aux_json_fname[1024];   // scanf can never write past the 1024-byte buffer
150 |         char binary_fname[1024];     // (1023 characters plus the terminating '\0')
151 |         printf("Enter meta JSON file name: ");
152 |         scanf("%1023s", meta_json_fname);
153 |         printf("Enter auxiliary JSON file name: ");
154 |         scanf("%1023s", aux_json_fname);
155 |         printf("Enter binary data file name: ");
156 |         scanf("%1023s", binary_fname);
157 |         H2P_store_to_file(h2pack, meta_json_fname, aux_json_fname, binary_fname);
158 |         printf("done\n");
159 |     }
160 | 
161 |     free(x);
162 |     free(y0);
163 |     free(y1);
164 |     free_aligned(test_params.coord);
165 |     H2P_destroy(&h2pack);
166 | 
167 |     return 0;
168 | }
169 | 


--------------------------------------------------------------------------------
/extra/test_HSS_scalar.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | //#include <ittnotify.h>
 10 | 
 11 | #include "H2Pack.h"
 12 | #include "H2Pack_kernels.h"
 13 | 
 14 | #include "parse_scalar_params.h"
 15 | #include "direct_nbody.h"
 16 | 
 17 | #include "debug.h"
 18 | 
 19 | int main(int argc, char **argv)  // Build an HSS matrix for a scalar kernel, verify its matvec, then test the ULV LU factorize/solve
 20 | {
 21 |     //__itt_pause();
 22 |     srand48(time(NULL));
 23 |     
 24 |     parse_scalar_params(argc, argv);
 25 | 
 26 |     double st, et;
 27 | 
 28 |     H2Pack_p h2pack;
 29 |     // Create the H2Pack handle, then switch it to HSS mode before any construction step
 30 |     H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol);
 31 |     H2P_run_HSS(h2pack);
 32 |     
 33 |     H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox);
 34 |     // Both leaf limits are passed as 0 (presumably "use the library defaults" -- TODO confirm)
 35 |     int max_leaf_points = 0;
 36 |     DTYPE max_leaf_size = 0.0;    
 37 |     H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size);
 38 |     // Load or generate the proxy points (timed); pp is an output of the call below
 39 |     H2P_dense_mat_p *pp;
 40 |     st = get_wtime_sec();
 41 |     H2P_generate_proxy_point_ID_file(
 42 |         h2pack, test_params.krnl_param, test_params.krnl_eval,
 43 |         test_params.pp_fname, &pp
 44 |     );
 45 |     et = get_wtime_sec();
 46 |     printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st);
 47 |     // Build the hierarchical representation
 48 |     H2P_build(
 49 |         h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 
 50 |         test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops
 51 |     );
 52 |     // Validate on at most 50000 consecutive points, starting at a random index when n_point is larger
 53 |     int n_check_pt = 50000, check_pt_s;
 54 |     if (n_check_pt >= test_params.n_point)
 55 |     {
 56 |         n_check_pt = test_params.n_point;
 57 |         check_pt_s = 0;
 58 |     } else {
 59 |         srand(time(NULL));
 60 |         check_pt_s = rand() % (test_params.n_point - n_check_pt);
 61 |     }
 62 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
 63 |     // x0: random input, x1: ULV solve output, y0: direct reference on the checked points, y1: matvec output
 64 |     DTYPE *x0, *x1, *y0, *y1;
 65 |     x0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 66 |     x1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 67 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt);
 68 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
 69 |     assert(x0 != NULL && x1 != NULL && y0 != NULL && y1 != NULL);
 70 |     for (int i = 0; i < test_params.krnl_mat_size; i++) 
 71 |     {
 72 |         //x0[i] = (DTYPE) pseudo_randn();
 73 |         x0[i] = (DTYPE) drand48() - 0.5;  // Uniform random input in [-0.5, 0.5)
 74 |     }
 75 | 
 76 |     // Get reference results
 77 |     direct_nbody(
 78 |         test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 
 79 |         test_params.coord,              test_params.n_point, test_params.n_point, x0, 
 80 |         test_params.coord + check_pt_s, test_params.n_point, n_check_pt,          y0
 81 |     );
 82 |     
 83 |     // Warm up, reset timers, and test the matvec performance
 84 |     H2P_matvec(h2pack, x0, y1);
 85 |     H2P_reset_timers(h2pack);
 86 |     //__itt_resume();
 87 |     for (int i = 0; i < 10; i++) 
 88 |         H2P_matvec(h2pack, x0, y1);
 89 |     //__itt_pause();
 90 |     
 91 |     
 92 |     // Verify HSS matvec results
 93 |     DTYPE ref_norm = 0.0, err_norm = 0.0;
 94 |     for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++)
 95 |     {
 96 |         DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i];  // y1 covers all points, y0 only the checked range
 97 |         ref_norm += y0[i] * y0[i];
 98 |         err_norm += diff * diff;
 99 |     }
100 |     ref_norm = DSQRT(ref_norm);
101 |     err_norm = DSQRT(err_norm);
102 |     printf("For %d validation points: ||y_{HSS} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / ref_norm);
103 |     // The Cholesky variant of the ULV test below is compiled out; only the LU variant runs
104 |     #if 0
105 |     // Test ULV Cholesky factorization
106 |     const DTYPE shift = 0;
107 |     H2P_HSS_ULV_Cholesky_factorize(h2pack, shift);
108 | 
109 |     for (int i = 0; i < test_params.krnl_mat_size; i++) y1[i] += shift * x0[i];
110 |     // Warm up, reset timers, and test the ULV solve performance
111 |     H2P_HSS_ULV_Cholesky_solve(h2pack, 3, y1, x1);
112 |     h2pack->n_ULV_solve = 0;
113 |     h2pack->timers[ULV_SLV_TIMER_IDX] = 0.0;
114 |     for (int i = 0; i < 10; i++) 
115 |         H2P_HSS_ULV_Cholesky_solve(h2pack, 3, y1, x1);
116 |     ref_norm = 0.0; 
117 |     err_norm = 0.0;
118 |     for (int i = 0; i < test_params.krnl_mat_size; i++)
119 |     {
120 |         DTYPE diff = x1[i] - x0[i];
121 |         ref_norm += x0[i] * x0[i];
122 |         err_norm += diff * diff;
123 |     }
124 |     ref_norm = DSQRT(ref_norm);
125 |     err_norm = DSQRT(err_norm);
126 |     printf("H2P_HSS_ULV_Cholesky_solve relerr = %e\n",  err_norm / ref_norm);
127 |     #endif
128 | 
129 |     //dump_HSS(h2pack);
130 | 
131 |     // Test ULV LU factorization
132 |     const DTYPE shift = 0;
133 |     H2P_HSS_ULV_LU_factorize(h2pack, shift);
134 |     // Add shift * x0 to the matvec result so that y1 is the right-hand side of the shifted system
135 |     for (int i = 0; i < test_params.krnl_mat_size; i++) y1[i] += shift * x0[i];
136 |     // Warm up, reset timers, and test the ULV solve performance
137 |     H2P_HSS_ULV_LU_solve(h2pack, 3, y1, x1);  // NOTE(review): meaning of the constant 3 is not visible here -- confirm against the H2Pack API
138 |     h2pack->n_ULV_solve = 0;
139 |     h2pack->timers[ULV_SLV_TIMER_IDX] = 0.0;
140 |     for (int i = 0; i < 10; i++) 
141 |         H2P_HSS_ULV_LU_solve(h2pack, 3, y1, x1);
142 |     ref_norm = 0.0;  // Reuse the accumulators: now compare the solve output x1 with the original input x0
143 |     err_norm = 0.0;
144 |     for (int i = 0; i < test_params.krnl_mat_size; i++)
145 |     {
146 |         DTYPE diff = x1[i] - x0[i];
147 |         ref_norm += x0[i] * x0[i];
148 |         err_norm += diff * diff;
149 |     }
150 |     ref_norm = DSQRT(ref_norm);
151 |     err_norm = DSQRT(err_norm);
152 |     printf("H2P_HSS_ULV_LU_solve relerr = %e\n",  err_norm / ref_norm);
153 | 
154 |     H2P_print_statistic(h2pack);
155 | 
156 |     free(x0);
157 |     free(x1);
158 |     free(y0);
159 |     free(y1);
160 |     free_aligned(test_params.coord);
161 |     H2P_destroy(&h2pack);
162 | 
163 |     return 0;
164 | }
165 | 


--------------------------------------------------------------------------------
/extra/test_ID_compress.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | #include <math.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "utils.h"
 11 | 
 12 | int main()  // Time H2P_ID_compress on a random 2D 1/d kernel matrix and report the relative ID residual
 13 | {
 14 |     int nrow, ncol;
 15 |     printf("matrix size: ");
 16 |     scanf("%d %d", &nrow, &ncol);
 17 |     H2P_dense_mat_p A, A0, U;
 18 |     H2P_dense_mat_init(&A, nrow, ncol);
 19 |     H2P_dense_mat_init(&A0, nrow, ncol);
 20 |     // A is the working copy handed to H2P_ID_compress; A0 keeps the original entries
 21 |     DTYPE A0_fnorm = 0.0;
 22 |     srand48(time(NULL));
 23 |     DTYPE *x1 = (DTYPE*) malloc(sizeof(DTYPE) * nrow);
 24 |     DTYPE *y1 = (DTYPE*) malloc(sizeof(DTYPE) * nrow);
 25 |     DTYPE *x2 = (DTYPE*) malloc(sizeof(DTYPE) * ncol);
 26 |     DTYPE *y2 = (DTYPE*) malloc(sizeof(DTYPE) * ncol);
 27 |     assert(x1 != NULL && x2 != NULL && y1 != NULL && y2 != NULL);
 28 |     for (int i = 0; i < nrow; i++)  // Row points: both coordinates in [0, 1)
 29 |     {
 30 |         x1[i] = (DTYPE) drand48();
 31 |         y1[i] = (DTYPE) drand48();
 32 |     }
 33 |     for (int i = 0; i < ncol; i++)  // Column points: the same square shifted by (0.6, 0.4)
 34 |     {
 35 |         x2[i] = (DTYPE) (drand48() + 0.6);
 36 |         y2[i] = (DTYPE) (drand48() + 0.4);
 37 |     }
 38 |     for (int irow = 0; irow < nrow; irow++)
 39 |     {
 40 |         DTYPE *A_irow  = A->data + irow * ncol;
 41 |         DTYPE *A0_irow = A0->data + irow * ncol;
 42 |         for (int icol = 0; icol < ncol; icol++)
 43 |         {
 44 |             DTYPE dx = x1[irow] - x2[icol];
 45 |             DTYPE dy = y1[irow] - y2[icol];
 46 |             DTYPE d  = DSQRT(dx * dx + dy * dy);
 47 |             A_irow[icol]  = 1.0 / d;  // Entry is the reciprocal distance between row and column point
 48 |             A0_irow[icol] = A_irow[icol];
 49 |             A0_fnorm += A_irow[icol] * A_irow[icol];
 50 |         }
 51 |     }
 52 |     A0_fnorm = DSQRT(A0_fnorm);
 53 |     
 54 |     /*
 55 |     FILE *ouf = fopen("A.csv", "w");
 56 |     for (int irow = 0; irow < nrow; irow++)
 57 |     {
 58 |         DTYPE *A_irow = A->data + irow * ncol;
 59 |         for (int icol = 0; icol < ncol - 1; icol++)
 60 |         {
 61 |             fprintf(ouf, "%.15lf, ", A_irow[icol]);
 62 |             //printf("%e ", A_irow[icol]);
 63 |         }
 64 |         fprintf(ouf, "%.15lf\n", A_irow[ncol - 1]);
 65 |         //printf("%e\n", A_irow[ncol - 1]);
 66 |     }
 67 |     fclose(ouf);
 68 |     */
 69 | 
 70 |     H2P_int_vec_p J;
 71 |     H2P_int_vec_init(&J, nrow);
 72 |     DTYPE tol_norm;
 73 |     printf("norm_rel_tol: ");
 74 |     scanf(DTYPE_FMTSTR, &tol_norm);
 75 |     int n_thread = omp_get_max_threads();
 76 |     int   *ID_buff = (int*)   malloc(sizeof(int)   * A->nrow * 4);
 77 |     DTYPE *QR_buff = (DTYPE*) malloc(sizeof(DTYPE) * A->nrow);
 78 |     assert(ID_buff != NULL && QR_buff != NULL);
 79 |     H2P_ID_compress(A, QR_REL_NRM, &tol_norm, &U, J, n_thread, QR_buff, ID_buff, 1);  // Warm up
 80 |     double ut = 0.0;
 81 |     for (int i = 0; i < 10; i++)
 82 |     {
 83 |         memcpy(A->data, A0->data, sizeof(DTYPE) * nrow * ncol);  // Restore A and its shape from A0 before each timed run
 84 |         A->nrow = nrow;
 85 |         A->ncol = ncol;
 86 |         A->ld = ncol;
 87 |         double st = get_wtime_sec();
 88 |         H2P_ID_compress(A, QR_REL_NRM, &tol_norm, &U, J, n_thread, QR_buff, ID_buff, 1);
 89 |         double et = get_wtime_sec();
 90 |         ut += et - st;
 91 |     }
 92 |     printf("U rank = %d, average used time = %.8lf (s)\n", U->ncol, ut / 10.0);
 93 |     fflush(stdout);
 94 |     
 95 |     /*
 96 |     ouf = fopen("U.csv", "w");
 97 |     for (int irow = 0; irow < U->nrow; irow++)
 98 |     {
 99 |         DTYPE *U_irow = U->data + irow * U->ncol;
100 |         for (int icol = 0; icol < U->ncol - 1; icol++) 
101 |         {
102 |             fprintf(ouf, "%.15lf, ", U_irow[icol]);
103 |             //printf("% .4lf  ", U_irow[icol]);
104 |         }
105 |         fprintf(ouf, "%.15lf\n", U_irow[U->ncol - 1]);
106 |         //printf("% .4lf  \n", U_irow[U->ncol - 1]);
107 |     }
108 |     fclose(ouf);
109 |     */
110 |     
111 |     // Verify the ID: gather the skeleton rows A0(J, :) into AJ, then overwrite A0 with
112 |     // U * AJ - A0 so that the Frobenius norm of A0 becomes the approximation residual
113 |     
114 |     DTYPE *AJ = (DTYPE*) malloc(sizeof(DTYPE) * ncol * U->ncol);
115 |     assert(AJ != NULL);
116 |     for (int i = 0; i < U->ncol; i++)
117 |         memcpy(AJ + i * ncol, A0->data + J->data[i] * ncol, sizeof(DTYPE) * ncol);
118 |     CBLAS_GEMM(
119 |         CblasRowMajor, CblasNoTrans, CblasNoTrans, nrow, ncol, U->ncol,
120 |         1.0, U->data, U->ncol, AJ, ncol, -1.0, A0->data, A0->ncol
121 |     );
122 |     DTYPE res_fnorm = 0.0;
123 |     for (int i = 0; i < nrow * ncol; i++) 
124 |         res_fnorm += A0->data[i] * A0->data[i];
125 |     res_fnorm = DSQRT(res_fnorm);
126 |     printf("||A - A_{ID}||_fro / ||A||_fro = %e\n", res_fnorm / A0_fnorm);
127 |     free(AJ);  // Was previously leaked
128 |     free(ID_buff);
129 |     free(QR_buff);
130 |     free(x1);
131 |     free(y1);
132 |     free(x2);
133 |     free(y2);
134 |     H2P_int_vec_destroy(&J);
135 |     H2P_dense_mat_destroy(&U);
136 |     H2P_dense_mat_destroy(&A);
137 |     H2P_dense_mat_destroy(&A0);
138 |     return 0;
139 | }


--------------------------------------------------------------------------------
/extra/test_ID_compress_dim.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | #include <math.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack_aux_structs.h"
 10 | #include "H2Pack_ID_compress.h"
 11 | #include "utils.h"
 12 | 
 13 | void RPY_kernel_3d(
 14 |     const DTYPE *coord0, const int ld0, const int n0,
 15 |     const DTYPE *coord1, const int ld1, const int n1,
 16 |     const int dim, DTYPE *mat, const int ldm
 17 | )
 18 | {
 19 |     const DTYPE a = 1.0, eta = 1.0;
 20 |     const DTYPE C   = 1.0 / (6.0 * M_PI * a * eta);
 21 |     const DTYPE aa  = a * a;
 22 |     const DTYPE a2  = 2.0 * a;
 23 |     const DTYPE aa2 = aa * 2.0;
 24 |     const DTYPE aa_2o3   = aa2 / 3.0;
 25 |     const DTYPE C_075    = C * 0.75;
 26 |     const DTYPE C_9o32oa = C * 9.0 / 32.0 / a;
 27 |     const DTYPE C_3o32oa = C * 3.0 / 32.0 / a;
 28 |     for (int i = 0; i < n0; i++)
 29 |     {
 30 |         DTYPE x0 = coord0[i];
 31 |         DTYPE y0 = coord0[i + ld0];
 32 |         DTYPE z0 = coord0[i + ld0 * 2];
 33 |         for (int j = 0; j < n1; j++)
 34 |         {
 35 |             DTYPE r0 = x0 - coord1[j];
 36 |             DTYPE r1 = y0 - coord1[j + ld1];
 37 |             DTYPE r2 = z0 - coord1[j + ld1 * 2];
 38 |             DTYPE s2 = r0 * r0 + r1 * r1 + r2 * r2;
 39 |             DTYPE s  = DSQRT(s2);
 40 |             DTYPE inv_s = 1.0 / s;
 41 |             r0 *= inv_s;
 42 |             r1 *= inv_s;
 43 |             r2 *= inv_s;
 44 |             DTYPE t1, t2;
 45 |             if (s < a2)
 46 |             {
 47 |                 t1 = C - C_9o32oa * s;
 48 |                 t2 = C_3o32oa * s;
 49 |             } else {
 50 |                 t1 = C_075 / s * (1 + aa_2o3 / s2);
 51 |                 t2 = C_075 / s * (1 - aa2 / s2); 
 52 |             }
 53 |             int base = 3 * i * ldm + 3 * j;
 54 |             #define krnl(k, l) mat[base + k * ldm + l]
 55 |             krnl(0, 0) = t2 * r0 * r0 + t1;
 56 |             krnl(0, 1) = t2 * r0 * r1;
 57 |             krnl(0, 2) = t2 * r0 * r2;
 58 |             krnl(1, 0) = t2 * r1 * r0;
 59 |             krnl(1, 1) = t2 * r1 * r1 + t1;
 60 |             krnl(1, 2) = t2 * r1 * r2;
 61 |             krnl(2, 0) = t2 * r2 * r0;
 62 |             krnl(2, 1) = t2 * r2 * r1;
 63 |             krnl(2, 2) = t2 * r2 * r2 + t1;
 64 |         }
 65 |     }
 66 | }
 67 | 
 68 | 
 69 | int main()
 70 | {
 71 |     int nrow, ncol, kdim = 3;
 72 |     printf("matrix size: ");
 73 |     scanf("%d%d", &nrow, &ncol);
 74 |     int A_nrow = nrow * kdim;
 75 |     int A_ncol = ncol * kdim;
 76 |     DTYPE tol_norm;
 77 |     printf("norm_rel_tol: ");
 78 |     scanf(DTYPE_FMTSTR, &tol_norm);
 79 |     
 80 |     H2P_dense_mat_p A, A0, U;
 81 |     H2P_int_vec_p J;
 82 |     H2P_dense_mat_init(&A, A_nrow, A_ncol);
 83 |     H2P_dense_mat_init(&A0, A_nrow, A_ncol);
 84 |     H2P_int_vec_init(&J, A_nrow);
 85 |     
 86 |     DTYPE *coord0 = (DTYPE*) malloc(sizeof(DTYPE) * A_nrow);
 87 |     DTYPE *coord1 = (DTYPE*) malloc(sizeof(DTYPE) * A_ncol);
 88 |     assert(coord0 != NULL && coord1 != NULL);
 89 |     DTYPE *x0 = coord0, *x1 = coord1;
 90 |     DTYPE *y0 = coord0 + nrow, *y1 = coord1 + ncol;
 91 |     DTYPE *z0 = coord0 + nrow * 2, *z1 = coord1 + ncol * 2;
 92 |     for (int i = 0; i < nrow; i++) 
 93 |     {
 94 |         x0[i] = (DTYPE) drand48();
 95 |         y0[i] = (DTYPE) drand48();
 96 |         z0[i] = (DTYPE) drand48();
 97 |     }
 98 |     for (int i = 0; i < ncol; i++) 
 99 |     {
100 |         x1[i] = (DTYPE) (drand48() + 1.9);
101 |         y1[i] = (DTYPE) (drand48() + 0.8);
102 |         z1[i] = (DTYPE) (drand48() + 0.9);
103 |     }
104 |     
105 |     RPY_kernel_3d(
106 |         coord0, nrow, nrow, 
107 |         coord1, ncol, ncol, 
108 |         1, A->data, A_ncol
109 |     );
110 |     memcpy(A0->data, A->data, sizeof(DTYPE) * A_nrow * A_ncol);
111 |     DTYPE A0_fnorm = 0.0;
112 |     for (int i = 0; i < A_nrow * A_ncol; i++)
113 |         A0_fnorm += A->data[i] * A->data[i];
114 |     
115 |     int n_thread = omp_get_max_threads();
116 |     int QR_buff_size = (2 * kdim + 2) * A->ncol + (kdim + 1) * A->nrow;
117 |     int   *ID_buff = (int *)   malloc(sizeof(int)   * A->nrow * 4);
118 |     DTYPE *QR_buff = (DTYPE *) malloc(sizeof(DTYPE) * QR_buff_size);
119 |     double st = get_wtime_sec();
120 |     H2P_ID_compress(
121 |         A, QR_REL_NRM, &tol_norm, &U, J, 
122 |         n_thread, QR_buff, ID_buff, kdim
123 |     );
124 |     double ut = get_wtime_sec() - st;
125 |     printf("H2P_ID_compress used %.3lf s\n", ut);
126 |     
127 |     DTYPE *AJ = (DTYPE*) malloc(sizeof(DTYPE) * U->ncol * A_ncol);
128 |     for (int i = 0; i < J->length; i++)
129 |     {
130 |         int i30 = i * 3 + 0;
131 |         int i31 = i * 3 + 1;
132 |         int i32 = i * 3 + 2;
133 |         int j30 = J->data[i] * 3 + 0;
134 |         int j31 = J->data[i] * 3 + 1;
135 |         int j32 = J->data[i] * 3 + 2;
136 |         memcpy(AJ + i30*A_ncol, A0->data + j30*A_ncol, sizeof(DTYPE) * A_ncol);
137 |         memcpy(AJ + i31*A_ncol, A0->data + j31*A_ncol, sizeof(DTYPE) * A_ncol);
138 |         memcpy(AJ + i32*A_ncol, A0->data + j32*A_ncol, sizeof(DTYPE) * A_ncol);
139 |     }
140 |     CBLAS_GEMM(
141 |         CblasRowMajor, CblasNoTrans, CblasNoTrans, A_nrow, A_ncol, U->ncol,
142 |         1.0, U->data, U->ncol, AJ, A_ncol, -1.0, A0->data, A_ncol
143 |     );
144 |     DTYPE res_fnorm = 0.0;
145 |     for (int i = 0; i < A_nrow * A_ncol; i++)
146 |         res_fnorm += A0->data[i] * A0->data[i];
147 |     res_fnorm = DSQRT(res_fnorm);
148 |     printf("U rank = %d (%d column blocks)\n", U->ncol, J->length);
149 |     printf("||A - A_{ID}||_fro / ||A||_fro = %e\n", res_fnorm / A0_fnorm);
150 |     
151 |     free(QR_buff);
152 |     free(ID_buff);
153 |     free(coord0);
154 |     free(coord1);
155 |     H2P_int_vec_destroy(&J);
156 |     H2P_dense_mat_destroy(&U);
157 |     H2P_dense_mat_destroy(&A);
158 |     H2P_dense_mat_destroy(&A0);
159 |     return 0;
160 | }


--------------------------------------------------------------------------------
/extra/test_kernel_SIMD.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <assert.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack.h"
 10 | #include "H2Pack_kernels.h"
 11 | 
 12 | #include "parse_scalar_params.h"
 13 | #include "direct_nbody.h"
 14 | 
 15 | static void Gaussian_3D_eval_std_d(KRNL_EVAL_PARAM)
 16 | {
 17 |     EXTRACT_3D_COORD();
 18 |     const DTYPE *param_ = (DTYPE*) param;
 19 |     const DTYPE l = param_[0];
 20 |     for (int i = 0; i < n0; i++)
 21 |     {
 22 |         DTYPE *mat_irow = mat + i * ldm;
 23 |         const DTYPE x0_i = x0[i];
 24 |         const DTYPE y0_i = y0[i];
 25 |         const DTYPE z0_i = z0[i];
 26 |         //#pragma novector
 27 |         #pragma omp simd
 28 |         for (int j = 0; j < n1; j++)
 29 |         {
 30 |             DTYPE dx = x0_i - x1[j];
 31 |             DTYPE dy = y0_i - y1[j];
 32 |             DTYPE dz = z0_i - z1[j];
 33 |             DTYPE r2 = dx * dx + dy * dy + dz * dz;
 34 |             mat_irow[j] = exp(-l * r2);
 35 |         }
 36 |     }
 37 | }
 38 | 
 39 | static void Gaussian_3D_bimv_std_d(KRNL_BIMV_PARAM)
 40 | {
 41 |     EXTRACT_3D_COORD();
 42 |     const DTYPE *param_ = (DTYPE*) param;
 43 |     const DTYPE l = param_[0];
 44 |     for (int i = 0; i < n0; i += 2)
 45 |     {
 46 |         const DTYPE x0_i0 = x0[i];
 47 |         const DTYPE y0_i0 = y0[i];
 48 |         const DTYPE z0_i0 = z0[i];
 49 |         const DTYPE x0_i1 = x0[i + 1];
 50 |         const DTYPE y0_i1 = y0[i + 1];
 51 |         const DTYPE z0_i1 = z0[i + 1];
 52 |         const DTYPE xin1_i0 = x_in_1[i];
 53 |         const DTYPE xin1_i1 = x_in_1[i + 1];
 54 |         DTYPE sum_i0 = 0.0, sum_i1 = 0.0;
 55 |         //#pragma novector
 56 |         #pragma omp simd
 57 |         for (int j = 0; j < n1; j++)
 58 |         {
 59 |             DTYPE d0, d1, r20, r21;
 60 | 
 61 |             d0 = x0_i0 - x1[j];
 62 |             d1 = x0_i1 - x1[j];
 63 |             r20 = d0 * d0;
 64 |             r21 = d1 * d1;
 65 | 
 66 |             d0 = y0_i0 - y1[j];
 67 |             d1 = y0_i1 - y1[j];
 68 |             r20 += d0 * d0;
 69 |             r21 += d1 * d1;
 70 | 
 71 |             d0 = z0_i0 - z1[j];
 72 |             d1 = z0_i1 - z1[j];
 73 |             r20 += d0 * d0;
 74 |             r21 += d1 * d1;
 75 | 
 76 |             r20 = exp(-l * r20);
 77 |             r21 = exp(-l * r21);
 78 | 
 79 |             sum_i0 += r20 * x_in_0[j];
 80 |             sum_i1 += r21 * x_in_0[j];
 81 |             x_out_1[j] += (r20 * xin1_i0 + r21 * xin1_i1);
 82 |         }
 83 |         x_out_0[i]   += sum_i0;
 84 |         x_out_0[i+1] += sum_i1;
 85 |     }
 86 | }
 87 | 
 88 | int main(int argc, char **argv)
 89 | {
 90 |     //__itt_pause();
 91 |     srand48(time(NULL));
 92 |     
 93 |     parse_scalar_params(argc, argv);
 94 |     test_params.krnl_eval = Gaussian_3D_eval_std_d;
 95 |     test_params.krnl_bimv = Gaussian_3D_bimv_std_d;
 96 | 
 97 |     double st, et;
 98 | 
 99 |     H2Pack_p h2pack;
100 |     
101 |     H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol);
102 |     
103 |     H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox);
104 | 
105 |     int max_leaf_points = 0;
106 |     DTYPE max_leaf_size = 0.0;    
107 |     H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size);
108 | 
109 |     H2P_dense_mat_p *pp;
110 |     st = get_wtime_sec();
111 |     H2P_generate_proxy_point_ID_file(
112 |         h2pack, test_params.krnl_param, test_params.krnl_eval,
113 |         test_params.pp_fname, &pp
114 |     );
115 |     et = get_wtime_sec();
116 |     printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st);
117 |     
118 |     H2P_build(
119 |         h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 
120 |         test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops
121 |     );
122 |     
123 |     int n_check_pt = 50000, check_pt_s;
124 |     if (n_check_pt >= test_params.n_point)
125 |     {
126 |         n_check_pt = test_params.n_point;
127 |         check_pt_s = 0;
128 |     } else {
129 |         srand(time(NULL));
130 |         check_pt_s = rand() % (test_params.n_point - n_check_pt);
131 |     }
132 |     printf("Calculating direct n-body reference result for points %d -> %d\n", check_pt_s, check_pt_s + n_check_pt - 1);
133 |     
134 |     DTYPE *x, *y0, *y1;
135 |     x  = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
136 |     y0 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_dim * n_check_pt);
137 |     y1 = (DTYPE*) malloc(sizeof(DTYPE) * test_params.krnl_mat_size);
138 |     assert(x != NULL && y0 != NULL && y1 != NULL);
139 |     for (int i = 0; i < test_params.krnl_mat_size; i++) 
140 |     {
141 |         //x[i] = (DTYPE) pseudo_randn();
142 |         x[i] = (DTYPE) drand48() - 0.5;
143 |     }
144 | 
145 |     // Get reference results
146 |     direct_nbody(
147 |         test_params.krnl_param, test_params.krnl_eval, test_params.pt_dim, test_params.krnl_dim, 
148 |         test_params.coord,              test_params.n_point, test_params.n_point, x, 
149 |         test_params.coord + check_pt_s, test_params.n_point, n_check_pt,          y0
150 |     );
151 |     
152 |     // Warm up, reset timers, and test the matvec performance
153 |     H2P_matvec(h2pack, x, y1);
154 |     H2P_reset_timers(h2pack);
155 |     for (int i = 0; i < 10; i++) 
156 |         H2P_matvec(h2pack, x, y1);
157 |     
158 |     H2P_print_statistic(h2pack);
159 |     
160 |     // Verify H2 matvec results
161 |     DTYPE y0_norm = 0.0, err_norm = 0.0;
162 |     for (int i = 0; i < test_params.krnl_dim * n_check_pt; i++)
163 |     {
164 |         DTYPE diff = y1[test_params.krnl_dim * check_pt_s + i] - y0[i];
165 |         y0_norm  += y0[i] * y0[i];
166 |         err_norm += diff * diff;
167 |     }
168 |     y0_norm  = DSQRT(y0_norm);
169 |     err_norm = DSQRT(err_norm);
170 |     printf("For %d validation points: ||y_{H2} - y||_2 / ||y||_2 = %e\n", n_check_pt, err_norm / y0_norm);
171 |     
172 |     free(x);
173 |     free(y0);
174 |     free(y1);
175 |     free_aligned(test_params.coord);
176 |     H2P_destroy(&h2pack);
177 | 
178 |     return 0;
179 | }
180 | 


--------------------------------------------------------------------------------
/extra/test_scalar_matmul.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <math.h>
 5 | #include <assert.h>
 6 | #include <time.h>
 7 | #include <omp.h>
 8 | 
 9 | #include "H2Pack.h"
10 | #include "H2Pack_kernels.h"
11 | #include "H2Pack_utils.h"
12 | 
13 | #include "parse_scalar_params.h"
14 | #include "direct_nbody.h"
15 | #include "test_H2_matmul.h"
16 | 
17 | int main(int argc, char **argv)
18 | {
19 |     srand48(time(NULL));
20 |     
21 |     parse_scalar_params(argc, argv);
22 | 
23 |     H2Pack_p h2pack;
24 |     double st, et;
25 |     
26 |     H2P_init(&h2pack, test_params.pt_dim, test_params.krnl_dim, QR_REL_NRM, &test_params.rel_tol);
27 |     
28 |     H2P_calc_enclosing_box(test_params.pt_dim, test_params.n_point, test_params.coord, test_params.pp_fname, &h2pack->root_enbox);
29 | 
30 |     int max_leaf_points = 0;
31 |     DTYPE max_leaf_size = 0.0;    
32 |     H2P_partition_points(h2pack, test_params.n_point, test_params.coord, max_leaf_points, max_leaf_size);
33 | 
34 |     H2P_dense_mat_p *pp;
35 |     st = get_wtime_sec();
36 |     H2P_generate_proxy_point_ID_file(
37 |         h2pack, test_params.krnl_param, test_params.krnl_eval,
38 |         test_params.pp_fname, &pp
39 |     );
40 |     et = get_wtime_sec();
41 |     printf("H2Pack load/generate proxy points used %.3lf (s)\n", et - st);
42 |     
43 |     H2P_build(
44 |         h2pack, pp, test_params.BD_JIT, test_params.krnl_param, 
45 |         test_params.krnl_eval, test_params.krnl_bimv, test_params.krnl_bimv_flops
46 |     );
47 |     
48 |     int n_vecs[10] = {2, 2, 4, 8, 12, 16, 20, 24, 28, 32};
49 |     for (int i = 0; i < 10; i++)
50 |         test_H2_matmul(h2pack, n_vecs[i]);
51 | 
52 |     h2pack->n_matvec = 0;  // Skip printing matvec timings
53 |     H2P_print_statistic(h2pack);
54 | 
55 |     free_aligned(test_params.coord);
56 |     H2P_destroy(&h2pack);
57 | 
58 |     return 0;
59 | }
60 | 


--------------------------------------------------------------------------------
/pyh2pack/example.py:
--------------------------------------------------------------------------------
  1 | import pyh2pack
  2 | import numpy as np
  3 | 
  4 | '''
  5 |    NOTE:
  6 |    In Jupyter notebook, the outputs of `print_statistics/print_setting' might be redirected to terminals and will not be properly shown.
  7 |    Solution to this problem is to use package 'wurlitzer'
  8 |    Run `%load_ext wurlitzer` in Jupyeter.
  9 | '''
 10 | 
 11 | N = 80000
 12 | krnl_dim = 1
 13 | pt_dim = 3
 14 | coord = np.random.uniform(0, 1, size=(pt_dim, N))
 15 | x = np.random.normal(size=(krnl_dim*N))
 16 | 
 17 | 
 18 | '''
 19 |    Test without precomputed proxy points
 20 | '''
 21 | #   build
 22 | krnl_param = np.array([1, -0.5])
 23 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param)
 24 | #   matvec
 25 | y = A.matvec(x)
 26 | #   partial direct matvec
 27 | start_pt = 8000
 28 | end_pt = 9999
 29 | z = A.direct_matvec(x, start_pt, end_pt)
 30 | #   print the matvec error in the partial results
 31 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z))
 32 | #   statistic info of pyh2pack performance
 33 | A.print_statistic()
 34 | A.print_setting()
 35 | A.clean()
 36 | 
 37 | 
 38 | 
 39 | '''
 40 |    Test with precomputed proxy points
 41 | '''
 42 | #   path to the file of storing proxy points
 43 | pp_fname = "./pp_tmp.dat"
 44 | #   build
 45 | krnl_param = np.array([1,-0.5])
 46 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param, pp_filename=pp_fname)
 47 | #   matvec
 48 | y = A.matvec(x)
 49 | #   partial direct matvec
 50 | start_pt = 8000
 51 | end_pt = 9999
 52 | z = A.direct_matvec(x, start_pt, end_pt)
 53 | #   print the matvec error in the partial results
 54 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z))
 55 | #   statistic info of pyh2pack performance
 56 | A.print_statistic()
 57 | A.clean()
 58 | 
 59 | 
 60 | '''
 61 |    Test with matmul
 62 | '''
 63 | #   build
 64 | krnl_param = np.array([1,-0.5])
 65 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param)
 66 | #   matmul
 67 | nvec = 10
 68 | xs = np.random.normal(size=(krnl_dim*N, nvec))
 69 | ys = A.matmul(xs)
 70 | #  partial direct sum
 71 | zs = []
 72 | start_pt = 0
 73 | end_pt = 999
 74 | for i in range(nvec):
 75 |    zs.append(A.direct_matvec(xs[:,i], start_pt, end_pt))
 76 | zs = np.hstack([z[:,np.newaxis] for z in zs])
 77 | print(np.linalg.norm(ys[start_pt*krnl_dim:(end_pt+1)*krnl_dim, :] - zs, ord='fro') / np.linalg.norm(zs,  ord='fro'))
 78 | A.print_statistic()
 79 | A.clean()
 80 | 
 81 | 
 82 | 
 83 | '''
 84 |    Test with direct matrix vector multiplication in pyh2pack
 85 | '''
 86 | A = pyh2pack.H2Mat(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param)
 87 | y = A.matvec(x)
 88 | 
 89 | #   partial direct matvec by class h2 variable.
 90 | start_pt = 0
 91 | end_pt = 999
 92 | z = A.direct_matvec(x, start_pt, end_pt)
 93 | 
 94 | #   direct matvec via package method: kernel_matvec
 95 | target_coord = coord[:, start_pt:(end_pt+1)]
 96 | z0 = pyh2pack.kernel_matvec(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_dim=pt_dim, krnl_param=krnl_param, source=coord, target=target_coord, x_in=x)
 97 | 
 98 | A_blk = pyh2pack.kernel_block(kernel="Quadratic_3D", krnl_dim=krnl_dim, pt_dim=pt_dim, krnl_param=krnl_param, source=coord, target=target_coord)
 99 | z1 = np.matmul(A_blk, x)
100 | 
101 | #  check error
102 | print(np.linalg.norm(z - z0))
103 | print(np.linalg.norm(z - z1))
104 | A.print_statistic()
105 | A.clean()
106 | 


--------------------------------------------------------------------------------
/pyh2pack/example_hss.py:
--------------------------------------------------------------------------------
 1 | import pyh2pack
 2 | import numpy as np
 3 | 
 4 | '''
 5 |    NOTE: 
 6 |    In Jupyter notebook, the outputs of `print_statistics/print_setting' might be redirected to terminals and will not be properly shown. 
 7 |    Solution to this problem is to use package 'wurlitzer'   
 8 |    Run `%load_ext wurlitzer` in Jupyeter. 
 9 | '''
10 | 
11 | N = 40000
12 | krnl_dim = 1
13 | pt_dim = 2
14 | coord = np.random.uniform(0, N**(1.0/pt_dim), size=(pt_dim, N))
15 | x = np.random.normal(size=(krnl_dim*N))
16 | 
17 | '''
18 | Standard HSS 
19 | '''
20 | 
21 | ##   build
22 | krnl_param = np.array([1, -0.5])
23 | A = pyh2pack.HSSMat(kernel="Quadratic_2D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, rel_tol=1e-3, krnl_param=krnl_param)
24 | # A = pyh2pack.HSSMat(kernel="Quadratic_2D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, rank=100, krnl_param=krnl_param)
25 | 
26 | 
27 | ##   matvec  
28 | y = A.matvec(x)
29 | #   partial direct matvec
30 | start_pt = 6000
31 | end_pt = 9999
32 | z = A.direct_matvec(x, start_pt, end_pt)
33 | #   print the matvec error in the partial results
34 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z))
35 | 
36 | ##  ULV factorization
37 | diag_shift = 0.1
38 | A.factorize(is_cholesky=1, shift=diag_shift)
39 | 
40 | ##   solve based on ULV decomposition
41 | b = y + diag_shift * x
42 | x0 = A.solve(b)
43 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x0) / np.linalg.norm(x)))
44 | 
45 | ##   partial solve, A = LU, apply inv(L) first and then apply inv(U)
46 | z = A.solve(b, op="L")
47 | x1 = A.solve(z, op="U")
48 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x1) / np.linalg.norm(x)))
49 | 
50 | ##   statistic info of pyh2pack performance
51 | A.print_statistic()
52 | A.print_setting()
53 | A.clean()
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | '''
62 | SPD HSS 
63 | '''
64 | 
65 | ##   build
66 | krnl_param = np.array([1, -0.5])
67 | A = pyh2pack.HSSMat("Quadratic_2D", krnl_dim, coord, pt_dim, rel_tol=1e-6, krnl_param=krnl_param, spdhss=1, spdhss_shift=0.0, rank=100)
68 | # A = pyh2pack.HSSMat(kernel="Quadratic_2D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, rank=100, krnl_param=krnl_param)
69 | 
70 | 
71 | ##   matvec  
72 | y = A.matvec(x)
73 | #   partial direct matvec
74 | start_pt = 6000
75 | end_pt = 9999
76 | z = A.direct_matvec(x, start_pt, end_pt)
77 | #   print the matvec error in the partial results
78 | print(np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z))
79 | 
80 | ##  ULV factorization
81 | diag_shift = 0.0
82 | A.factorize(is_cholesky=1, shift=diag_shift)
83 | 
84 | ##   solve based on ULV decomposition
85 | b = y + diag_shift * x
86 | x0 = A.solve(b)
87 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x0) / np.linalg.norm(x)))
88 | 
89 | ##   partial solve, A = LU, apply inv(L) first and then apply inv(U)
90 | z = A.solve(b, op="L")
91 | x1 = A.solve(z, op="U")
92 | print("HSS solve error (compared to HSS matvec) %.3e" % (np.linalg.norm(x - x1) / np.linalg.norm(x)))
93 | 
94 | ##   statistic info of pyh2pack performance
95 | A.print_statistic()
96 | A.print_setting()
97 | A.clean()
98 | 
99 | 


--------------------------------------------------------------------------------
/pyh2pack/example_samplept.py:
--------------------------------------------------------------------------------
 1 | import pyh2pack
 2 | import numpy as np
 3 | 
 4 | N = 80000
 5 | krnl_dim = 1
 6 | pt_dim = 3
 7 | coord = np.random.uniform(0, 1, size=(pt_dim, N))
 8 | x = np.random.normal(size=(krnl_dim*N))
 9 | 
10 | # build
11 | krnl_param = np.array([0.5])
12 | A = pyh2pack.H2Mat(kernel="Gaussian_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-3, krnl_param=krnl_param, sample_pt=1)
13 | # Coulomb kernel does not have krnl_param
14 | #A = pyh2pack.H2Mat(kernel="Coulomb_3D", krnl_dim=krnl_dim, pt_coord=coord, pt_dim=pt_dim, JIT_mode=1, rel_tol=1e-6, sample_pt=1)
15 | 
16 | # show build settings
17 | A.print_setting()
18 | 
19 | # matvec
20 | y = A.matvec(x)
21 | 
22 | # partial direct matvec
23 | start_pt = 8000
24 | end_pt = 9999
25 | z = A.direct_matvec(x, start_pt, end_pt)
26 | 
27 | # print the matvec relative error in the partial results
28 | relerr = np.linalg.norm(y[start_pt*krnl_dim:(end_pt+1)*krnl_dim] - z) / np.linalg.norm(z)
29 | print("H2 matvec relative error = %e\n" % relerr)
30 | 
31 | # statistic info of pyh2pack performance
32 | A.print_statistic()
33 | 
34 | # clean out
35 | A.clean()
36 | 


--------------------------------------------------------------------------------
/pyh2pack/readme.md:
--------------------------------------------------------------------------------
 1 | ## Building and Installing PyH2Pack
 2 | 
 3 | ### Intel compiler (ICC) + Intel MKL
 4 | Use this command to compile:
 5 | ```shell
 6 | LDSHARED="icc -shared" CC=icc python3 setup_icc.py install
 7 | ```
 8 | Before running the python code, you need to manually preload the following MKL file: 
 9 | ```shell
10 | # Check if $MKLROOT is set correctly
11 | # ls $MKLROOT/lib/intel64/libmkl_rt.so
12 | export LD_PRELOAD=$MKLROOT/lib/intel64/libmkl_rt.so
13 | ```
14 | 
15 | ### GNU compiler (GCC) + OpenBLAS
16 | 
 17 | Install or compile OpenBLAS first, then edit `setup.py` and set the variable `OPENBLAS_INSTALL_DIR` to the directory where OpenBLAS is installed. Use this command to compile:
18 | 
19 | ```shell
20 | CC=gcc python3 setup.py install
21 | ```
22 | 
23 | If you see an error message like:
24 | 
25 | ```text
26 | copying build/lib.linux-x86_64-3.8/pyh2pack.cpython-38-x86_64-linux-gnu.so -> /usr/local/lib/python3.8/dist-packages
27 | error: could not delete '/usr/local/lib/python3.8/dist-packages/pyh2pack.cpython-38-x86_64-linux-gnu.so': Permission denied
28 | ```
29 | 
30 | Then manually run:
31 | 
32 | ```shell
33 | sudo cp build/lib.linux-x86_64-3.8/pyh2pack.cpython-38-x86_64-linux-gnu.so /usr/local/lib/python3.8/dist-packages
34 | ```
35 | 
36 | 
37 | 
38 | ## Using PyH2Pack
39 | 
40 | See `example.py`. 
41 | 
42 | If you want to try the data-driven sample point method instead of the default proxy point / proxy surface method, see `example_samplept.py`. 
43 | 


--------------------------------------------------------------------------------
/pyh2pack/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup, Extension
 2 | import os
 3 | import numpy
 4 | 
 5 | H2PACK_DIR = ".."
 6 | OPENBLAS_INSTALL_DIR = "/usr/local/opt/openblas"
 7 | #C_DIR = "/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include"
 8 | 
 9 | extra_cflags  = ["-I"+H2PACK_DIR+"/include"]
10 | extra_cflags += ["-I"+OPENBLAS_INSTALL_DIR+"/include"]
11 | extra_cflags += ["-g", "-std=gnu99", "-O3"]
12 | extra_cflags += ["-DUSE_OPENBLAS", "-fopenmp", "-march=native"]
13 | extra_cflags += ["-Wno-unused-result", "-Wno-unused-function"]
14 | 
15 | LIB = [H2PACK_DIR+"/lib/libH2Pack.a", OPENBLAS_INSTALL_DIR+"/lib/libopenblas.a"]
16 | extra_lflags = LIB + ["-g", "-O3", "-fopenmp", "-lm", "-lgfortran"]
17 | 
def main():
    """Configure and run the distutils build of the pyh2pack C extension."""
    # The single extension module, built from pyh2pack.c with the
    # module-level compile and link flags
    pyh2pack_ext = Extension(
        name="pyh2pack",
        sources=["pyh2pack.c"],
        include_dirs=[H2PACK_DIR+"/include", numpy.get_include()],
        extra_compile_args=extra_cflags,
        extra_link_args=extra_lflags,
    )
    setup(
        name="pyh2pack",
        version="1.0.0",
        description="Python interface for H2Pack",
        author="Hua Huang, Xin Xing, and Edmond Chow",
        author_email="xxing02@gmail.com",
        ext_modules=[pyh2pack_ext],
    )


if __name__ == "__main__":
    main()
36 | 


--------------------------------------------------------------------------------
/pyh2pack/setup_icc.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup, Extension
 2 | import os
 3 | import numpy
 4 | 
 5 | H2PACK_DIR = ".."
 6 | 
 7 | extra_cflags  = ["-I"+H2PACK_DIR+"/include"]
 8 | extra_cflags += ["-g", "-std=gnu99", "-O3"]
 9 | extra_cflags += ["-DUSE_MKL", "-qopenmp", "-xHost", "-mkl"]
10 | 
11 | LIB = [H2PACK_DIR+"/lib/libH2Pack.a"]
12 | extra_lflags = LIB + ["-g", "-O3", "-qopenmp", "-L${MKLROOT}/lib/intel64", "-mkl_rt", "-lpthread"]
13 | 
def main():
    """Configure and run the distutils build of the pyh2pack C extension."""
    # The single extension module, built from pyh2pack.c with the
    # module-level compile and link flags
    pyh2pack_ext = Extension(
        name="pyh2pack",
        sources=["pyh2pack.c"],
        include_dirs=[H2PACK_DIR+"/include", numpy.get_include()],
        extra_compile_args=extra_cflags,
        extra_link_args=extra_lflags,
    )
    setup(
        name="pyh2pack",
        version="1.0.0",
        description="Python interface for H2Pack",
        author="Hua Huang, Xin Xing, and Edmond Chow",
        author_email="xxing02@gmail.com",
        ext_modules=[pyh2pack_ext],
    )


if __name__ == "__main__":
    main()
32 | 


--------------------------------------------------------------------------------
/src/AFN_precond.h:
--------------------------------------------------------------------------------
 1 | #ifndef __AFN_PRECOND_H__
 2 | #define __AFN_PRECOND_H__
 3 | 
 4 | // Adaptive Factorized Nystrom preconditioner, ref: https://arxiv.org/pdf/2304.05460.pdf
 5 | 
 6 | #include "H2Pack_typedef.h"
 7 | 
// State of a Nystrom / Adaptive Factorized Nystrom (AFN) preconditioner.
// The field comments below give the size and layout of each array.
struct AFN_precond
{
    int   is_nys, is_afn;   // Whether to use Nystrom or AFN
    int   n;                // Size of the kernel matrix, == number of points (does not support krnl_dim > 1 yet)
    int   n1;               // Size of K11 block (== global low-rank approximation rank)
    int   n2;               // == n - n1
    int   est_rank;         // Estimated rank
    int   *perm;            // Permutation array, size n
    DTYPE *px, *py;         // Size n, permuted x and y in AFN_precond_apply
    DTYPE *t1, *t2;         // Size n, intermediate vectors in AFN_precond_apply
    DTYPE *nys_U;           // Size n * n1, row major, Nystrom basis
    DTYPE *nys_M;           // Size n1, Nystrom eigenvalues + diagonal shift, then scaled
    int   *afn_G_rowptr;    // Size n2 + 1, AFN G matrix CSR row_ptr array
    int   *afn_GT_rowptr;   // Size n2 + 1, AFN G^T matrix CSR row_ptr array
    int   *afn_G_colidx;    // Size nnz, AFN G matrix CSR col_idx array
    int   *afn_GT_colidx;   // Size nnz, AFN G^T matrix CSR col_idx array
    DTYPE *afn_G_val;       // Size nnz, AFN G matrix CSR values array
    DTYPE *afn_GT_val;      // Size nnz, AFN G^T matrix CSR values array
    DTYPE *afn_invK11;      // Size n1 * n1, row major, AFN K11^{-1} matrix
    DTYPE *afn_K12;         // Size n1 * n2, row major, AFN K12 matrix

    // Timers for profiling
    // NOTE(review): n_apply looks like a counter of AFN_precond_apply calls
    // and the t_* fields like accumulated timings per build stage -- confirm
    // against the implementation
    int n_apply;
    double t_build, t_apply, t_rankest, t_fps, t_K11K12, t_nys;
    double t_afn, t_afn_mat, t_afn_knn, t_afn_fsai, t_afn_csr;
};
typedef struct AFN_precond  AFN_precond_s;   // Struct type
typedef struct AFN_precond *AFN_precond_p;   // Pointer-to-struct handle type
36 | 
37 | #ifdef __cplusplus
38 | extern "C" {
39 | #endif
40 | 
// Build an AFN preconditioner for a kernel matrix
// Input parameters:
//   krnl_eval  : Pointer to kernel matrix evaluation function
//   krnl_param : Pointer to kernel function parameter array
//   npt        : Number of points in coord
//   pt_dim     : Dimension of each point
//   coord      : Matrix, size pt_dim-by-npt, coordinates of points
//   mu         : Scalar, diagonal shift of the kernel matrix
//   max_k      : Maximum global low-rank approximation rank 
//   ss_npt     : Number of points in the sampling set
//   fsai_npt   : Maximum number of nonzeros in each row of the FSAI matrix
//   h2mat      : Optional, pointer to an initialized H2Pack struct, used for FSAI KNN search
// Output parameter:
//   AFN_precond_ : Pointer to an initialized AFN_precond struct
void AFN_precond_build(
    kernel_eval_fptr krnl_eval, void *krnl_param, const int npt, const int pt_dim, 
    const DTYPE *coord, const DTYPE mu, const int max_k, const int ss_npt,
    const int fsai_npt, void *h2mat, AFN_precond_p *AFN_precond_
);

// Destroy an initialized AFN_precond struct
// Input parameter:
//   AFN_precond_ : Address of the AFN_precond_p handle to destroy
void AFN_precond_destroy(AFN_precond_p *AFN_precond_);

// Apply an AFN preconditioner to a vector
// Input parameters:
//   AFN_precond : Pointer to an initialized AFN_precond struct
//   x           : Input vector, size n
// Output parameter:
//   y : Output vector, size n
void AFN_precond_apply(AFN_precond_p AFN_precond, const DTYPE *x, DTYPE *y);

// Print statistics of an AFN_precond struct
// Input parameter:
//   AFN_precond : Pointer to an initialized AFN_precond struct
void AFN_precond_print_stat(AFN_precond_p AFN_precond);
74 | 
75 | #ifdef __cplusplus
76 | }
77 | #endif
78 | 
79 | #endif  // End of "#ifndef __AFN_PRECOND_H__"
80 | 


--------------------------------------------------------------------------------
/src/DAG_task_queue.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <assert.h>
  5 | #include <unistd.h>
  6 | 
  7 | #include "DAG_task_queue.h"
  8 | 
  9 | // Initialize a DAG_task_queue structure with a DAG stored in CSR format. 
 10 | // DAG(i, j) is nonzero means that task j relies on task i. If DAG(i, i) is 
 11 | // nonzero, task i will be skipped. On inconsistent input *tq_ is set to NULL.
 12 | void DAG_task_queue_init(
 13 |     const int max_task_id, const int num_dep, const int *DAG_src_ptr, 
 14 |     const int *DAG_dst_idx, DAG_task_queue_p *tq_
 15 | )
 16 | {
 17 |     // Validate the CSR input first so the error path neither leaks nor leaves *tq_ unset
 18 |     *tq_ = NULL;
 19 |     if (num_dep != DAG_src_ptr[max_task_id])
 20 |     {
 21 |         fprintf(stderr, "ERROR: num_dep != DAG_src_ptr[max_task_id] \n");
 22 |         return;
 23 |     }
 24 |     DAG_task_queue_p tq = (DAG_task_queue_p) malloc(sizeof(DAG_task_queue_s));
 25 |     assert(tq != NULL);
 26 |     
 27 |     // Allocate arrays in DAG_task_queue
 28 |     tq->DAG_src_ptr = (int*) malloc(sizeof(int) * (max_task_id + 1));
 29 |     tq->DAG_dst_idx = (int*) malloc(sizeof(int) * num_dep);
 30 |     tq->indeg       = (int*) malloc(sizeof(int) * max_task_id);
 31 |     tq->curr_indeg  = (int*) malloc(sizeof(int) * max_task_id);
 32 |     tq->task_queue  = (int*) malloc(sizeof(int) * max_task_id);
 33 |     assert(tq->DAG_src_ptr != NULL);
 34 |     assert(tq->DAG_dst_idx != NULL);
 35 |     assert(tq->indeg       != NULL);
 36 |     assert(tq->curr_indeg  != NULL);
 37 |     assert(tq->task_queue  != NULL);
 38 |     
 39 |     // Copy DAG CSR matrix, count DAG vertex indegree and number of actual tasks
 40 |     tq->max_task_id = max_task_id;
 41 |     tq->num_task    = max_task_id;
 42 |     memcpy(tq->DAG_src_ptr, DAG_src_ptr, sizeof(int) * (max_task_id + 1));
 43 |     memcpy(tq->DAG_dst_idx, DAG_dst_idx, sizeof(int) * num_dep);
 44 |     memset(tq->indeg, 0, sizeof(int) * max_task_id);
 45 |     for (int i = 0; i < max_task_id; i++)
 46 |     {
 47 |         for (int j = tq->DAG_src_ptr[i]; j < tq->DAG_src_ptr[i + 1]; j++)
 48 |         {
 49 |             int dst = tq->DAG_dst_idx[j];
 50 |             if (dst == i) tq->num_task--;  // Task i relies on task i, it is never run
 51 |             tq->indeg[dst]++;
 52 |         }
 53 |     }
 54 |     DAG_task_queue_reset(tq);  // Fill the queue with the tasks that have no dependency
 55 |     *tq_ = tq;
 56 | }
 57 | 
 58 | // Release a DAG_task_queue structure and the arrays it owns, then clear *tq_
 59 | void DAG_task_queue_destroy(DAG_task_queue_p *tq_)
 60 | {
 61 |     if (*tq_ == NULL) return;
 62 |     DAG_task_queue_p queue = *tq_;
 63 |     free(queue->task_queue);
 64 |     free(queue->curr_indeg);
 65 |     free(queue->indeg);
 66 |     free(queue->DAG_dst_idx);
 67 |     free(queue->DAG_src_ptr);
 68 |     free(queue);
 69 |     *tq_ = NULL;
 70 | }
 71 | 
 72 | // Hand out the next task of a DAG_task_queue structure, -1 if tq is NULL or no task is left.
 73 | // This function can be called by multiple threads at the same time.
 74 | int  DAG_task_queue_get_task(DAG_task_queue_p tq)
 75 | {
 76 |     if (tq == NULL) return -1;
 77 |     
 78 |     // Atomically reserve one slot of the task queue; a reserved slot index 
 79 |     // at or beyond num_task means there is nothing left to hand out
 80 |     const int slot_idx = __atomic_fetch_add(&tq->task_head, 1, __ATOMIC_SEQ_CST);
 81 |     if (slot_idx >= tq->num_task) return -1;
 82 |     
 83 |     // The reserved slot holds -1 until DAG_task_queue_reset() or 
 84 |     // DAG_task_queue_finish_task() stores a task id into it, so keep 
 85 |     // polling it with atomic loads until a valid task id shows up
 86 |     int *slot = tq->task_queue + slot_idx;
 87 |     int task_id;
 88 |     do
 89 |     {
 90 |         task_id = __atomic_load_n(slot, __ATOMIC_SEQ_CST);
 91 |     } while (task_id == -1);
 92 |     return task_id;
 93 | }
 94 | 
 95 | // Mark a task as finished and enqueue every dependent task that became available.
 96 | // This function can be called by multiple threads at the same time.
 97 | void DAG_task_queue_finish_task(DAG_task_queue_p tq, const int task_id)
 98 | {
 99 |     if (tq == NULL) return;
100 |     const int row_start = tq->DAG_src_ptr[task_id];
101 |     const int row_end   = tq->DAG_src_ptr[task_id + 1];
102 |     for (int k = row_start; k < row_end; k++)
103 |     {
104 |         // Atomically remove one pending dependency of the destination vertex;
105 |         // only the decrement that reaches exactly 0 makes it available
106 |         int succ = tq->DAG_dst_idx[k];
107 |         int remaining = __atomic_sub_fetch(&tq->curr_indeg[succ], 1, __ATOMIC_SEQ_CST);
108 |         if (remaining != 0) continue;
109 |         
110 |         // Reserve a slot at the queue tail, then store the task id into it
111 |         // so that DAG_task_queue_get_task() can pick it up
112 |         int slot_idx = __atomic_fetch_add(&tq->task_tail, 1, __ATOMIC_SEQ_CST);
113 |         __atomic_store_n(tq->task_queue + slot_idx, succ, __ATOMIC_SEQ_CST);
114 |     }
115 | }
116 | 
117 | // Put the task queue of a DAG_task_queue structure back to its initial state.
118 | void DAG_task_queue_reset(DAG_task_queue_p tq)
119 | {
120 |     if (tq == NULL) return;
121 |     // Restore the run-time indegrees and mark every queue slot as unavailable
122 |     for (int i = 0; i < tq->max_task_id; i++)
123 |     {
124 |         tq->curr_indeg[i] = tq->indeg[i];
125 |         tq->task_queue[i] = -1;
126 |     }
127 |     // Tasks with zero indegree are available at once, queue them in index order
128 |     int n_ready = 0;
129 |     for (int i = 0; i < tq->max_task_id; i++)
130 |         if (tq->indeg[i] == 0) tq->task_queue[n_ready++] = i;
131 |     tq->task_head = 0;
132 |     tq->task_tail = n_ready;
133 | }
134 | 


--------------------------------------------------------------------------------
/src/DAG_task_queue.h:
--------------------------------------------------------------------------------
 1 | #ifndef __DAG_TASK_QUEUE_H__
 2 | #define __DAG_TASK_QUEUE_H__
 3 | 
 4 | struct DAG_task_queue
 5 | {
 6 |     int max_task_id;    // Max task id + 1
 7 |     int num_task;       // Number of actual tasks (tasks relying on themselves are excluded)
 8 |     int task_head;      // Index of the next queue slot to hand out, atomically incremented by get_task
 9 |     int task_tail;      // Index of the next free queue slot, atomically incremented by finish_task
10 |     int *DAG_src_ptr;   // Size max_task_id+1, DAG CSR matrix row_ptr array
11 |     int *DAG_dst_idx;   // Size DAG_src_ptr[max_task_id] (number of dependencies), DAG CSR matrix col_idx array
12 |     int *indeg;         // Size max_task_id, indegree of DAG vertexes 
13 |     int *curr_indeg;    // Size max_task_id, indegree of DAG vertexes in running
14 |     int *task_queue;    // Size max_task_id, task queue, -1 marks a slot whose task is not available yet
15 | };
16 | typedef struct DAG_task_queue  DAG_task_queue_s;
17 | typedef struct DAG_task_queue* DAG_task_queue_p;
18 | 
19 | #ifdef __cplusplus
20 | extern "C" {
21 | #endif
22 | 
23 | // Initialize a DAG_task_queue structure with a DAG stored in CSR matrix. 
24 | // DAG(i, j) is nonzero means that task j relies on task i. If DAG(i, i) is 
25 | // nonzero, task i will be skipped. 
26 | // Input parameters:
27 | //   max_task_id : Max task id + 1, Tasks are indexed from 0 to max_task_id-1
28 | //   num_dep     : Number of dependencies (nonzeros in DAG matrix)
29 | //   DAG_src_ptr : Size max_task_id+1, CSR matrix row_ptr array
30 | //   DAG_dst_idx : Size num_dep, CSR matrix col_idx array
31 | // Output parameter:
32 | //   *tq_ : Pointer to an initialized DAG_task_queue structure
33 | void DAG_task_queue_init(
34 |     const int max_task_id, const int num_dep, const int *DAG_src_ptr, 
35 |     const int *DAG_dst_idx, DAG_task_queue_p *tq_
36 | );
37 | 
38 | // Destroy a DAG_task_queue structure.
39 | // Input parameter:
40 | //   tq_ : Pointer to the DAG_task_queue structure pointer to be destroyed, *tq_ is set to NULL
41 | void DAG_task_queue_destroy(DAG_task_queue_p *tq_);
42 | 
43 | // Get a new task from a DAG_task_queue structure and update its task queue.  
44 | // This function can be called by multiple threads at the same time.
45 | // Input parameter:
46 | //   tq : Target DAG_task_queue structure
47 | // Output parameters:
48 | //   tq       : Target DAG_task_queue structure with updated task queue info
49 | //   <return> : Index of the new task. -1 means all tasks are finished.
50 | int  DAG_task_queue_get_task(DAG_task_queue_p tq);
51 | 
52 | // Finish a task and push new available tasks to a DAG_task_queue task queue.
53 | // This function can be called by multiple threads at the same time.
54 | // Input parameters:
55 | //   tq      : Target DAG_task_queue structure
56 | //   task_id : Index of the finished task
57 | // Output parameter:
58 | //   tq : Target DAG_task_queue structure with updated task queue info
59 | void DAG_task_queue_finish_task(DAG_task_queue_p tq, const int task_id);
60 | 
61 | // Reset the task queue in a DAG_task_queue structure. 
62 | // Input parameter:
63 | //   tq : Target DAG_task_queue structure
64 | // Output parameter:
65 | //   tq : Target DAG_task_queue structure with updated task queue info
66 | void DAG_task_queue_reset(DAG_task_queue_p tq);
67 | 
68 | #ifdef __cplusplus
69 | }
70 | #endif
71 | 
72 | #endif  // End of "#ifndef __DAG_TASK_QUEUE_H__"
73 | 
74 | 


--------------------------------------------------------------------------------
/src/GCC-OpenBLAS.make:
--------------------------------------------------------------------------------
 1 | CC           = gcc
 2 | USE_MKL      = 0
 3 | USE_OPENBLAS = 1
 4 | 
 5 | include common.make
 6 | 
 7 | # GCC 10 needs SVE to be enabled manually, -march=native is not enough. When USE_AARCH64_SVE is 1, -march=native in CFLAGS is replaced below.
 8 | # On A64FX the SVE vector length is 512 bits, on other SVE supported processors this value might be different
 9 | USE_AARCH64_SVE = 0
10 | SVE_VECTOR_BITS = 512
11 | ifeq ($(strip $(USE_AARCH64_SVE)), 1)
12 | CFLAGS := $(subst -march=native, -march=armv8.2-a+sve -msve-vector-bits=$(SVE_VECTOR_BITS), $(CFLAGS))
13 | endif


--------------------------------------------------------------------------------
/src/H2Pack.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_H__
 2 | #define __H2PACK_H__
 3 | 
 4 | // H2Pack configurations
 5 | #include "H2Pack_config.h"
 6 | 
 7 | // H2Pack data structure
 8 | #include "H2Pack_typedef.h"
 9 | 
10 | // H2Pack auxiliary data structures
11 | #include "H2Pack_aux_structs.h"
12 | 
13 | // H2Pack hierarchical point partitioning
14 | #include "H2Pack_partition.h"
15 | 
16 | // H2Pack hierarchical point partitioning for periodic system
17 | #include "H2Pack_partition_periodic.h"
18 | 
19 | // H2Pack interpolative decomposition compression
20 | #include "H2Pack_ID_compress.h"
21 | 
22 | // H2Pack generate proxy points
23 | #include "H2Pack_gen_proxy_point.h"
24 | 
25 | // H2Pack build H2/HSS representation
26 | #include "H2Pack_build.h"
27 | 
28 | // H2Pack build H2 representation for periodic system
29 | #include "H2Pack_build_periodic.h"
30 | 
31 | // H2Pack H2/HSS fast matrix-vector multiplication
32 | #include "H2Pack_matvec.h"
33 | 
34 | // H2Pack H2 fast matrix-vector multiplication for periodic system
35 | #include "H2Pack_matvec_periodic.h"
36 | 
37 | // H2Pack H2/HSS fast matrix-matrix multiplication
38 | #include "H2Pack_matmul.h"
39 | 
40 | // H2Pack H2 fast matrix-matrix multiplication for periodic system
41 | #include "H2Pack_matmul_periodic.h"
42 | 
43 | // H2Pack HSS ULV decomposition and solve
44 | #include "H2Pack_HSS_ULV.h"
45 | 
46 | // H2Pack SPDHSS H2 build
47 | #include "H2Pack_SPDHSS_H2.h"
48 | 
49 | // H2Pack file IO
50 | #include "H2Pack_file_IO.h"
51 | 
52 | // H2Pack build H2/HSS representation with sample points
53 | #include "H2Pack_build_with_sample_point.h"
54 | 
55 | // Linear algebra library (BLAS, LAPACK) wrapper header
56 | #include "linalg_lib_wrapper.h"
57 | 
58 | // Vector wrapper function wrapper
59 | #include "ASTER/include/aster.h"
60 | 
61 | // Helper functions
62 | #include "utils.h"
63 | 
64 | #endif
65 | 


--------------------------------------------------------------------------------
/src/H2Pack_HSS_ULV.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_HSS_ULV_H__
 2 | #define __H2PACK_HSS_ULV_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // Construct the ULV LU factorization for a HSS matrix
12 | // Input parameters:
13 | //   h2pack : H2Pack structure with constructed HSS representation
14 | //   shift  : Shift coefficient k to make (A + k * I) non-singular
15 | // Output parameter:
16 | //   h2pack : H2Pack structure with ULV LU factorization
17 | void H2P_HSS_ULV_LU_factorize(H2Pack_p h2pack, const DTYPE shift);
18 | 
19 | // Solve the linear system A_{HSS} * x = b using the HSS ULV LU factorization,
20 | // where A_{HSS} = L_{HSS} * U_{HSS}.
21 | // Input parameters:
22 | //   h2pack : H2Pack structure with ULV LU factorization
23 | //   op     : Operation type, 1, 2, or 3
24 | //   b      : Size >= h2pack->krnl_mat_size, right-hand side vector
25 | // Output parameter:
26 | //   x : Size >= h2pack->krnl_mat_size, solution vector. 
27 | //       If op == 1, x satisfies L_{HSS} * x = b.
28 | //       If op == 2, x satisfies U_{HSS} * x = b.
29 | //       If op == 3, x satisfies A_{HSS} * x = b.
30 | void H2P_HSS_ULV_LU_solve(H2Pack_p h2pack, const int op, const DTYPE *b, DTYPE *x);
31 | 
32 | // Construct the ULV Cholesky factorization for a HSS matrix
33 | // Input parameters:
34 | //   h2pack : H2Pack structure with constructed HSS representation
35 | //   shift  : Shift coefficient k to make (A + k * I) S.P.D.
36 | // Output parameter:
37 | //   h2pack : H2Pack structure with ULV Cholesky factorization
38 | void H2P_HSS_ULV_Cholesky_factorize(H2Pack_p h2pack, const DTYPE shift);
39 | 
40 | // Solve the linear system A_{HSS} * x = b using the HSS ULV Cholesky factorization,
41 | // where A_{HSS} = L_{HSS} * L_{HSS}^T.
42 | // Input parameters:
43 | //   h2pack : H2Pack structure with ULV Cholesky factorization
44 | //   op     : Operation type, 1, 2, or 3
45 | //   b      : Size >= h2pack->krnl_mat_size, right-hand side vector
46 | // Output parameter:
47 | //   x : Size >= h2pack->krnl_mat_size, solution vector. 
48 | //       If op == 1, x satisfies L_{HSS}^T * x = b.
49 | //       If op == 2, x satisfies L_{HSS}   * x = b.
50 | //       If op == 3, x satisfies A_{HSS}   * x = b.
51 | void H2P_HSS_ULV_Cholesky_solve(H2Pack_p h2pack, const int op, const DTYPE *b, DTYPE *x);
52 | 
53 | #ifdef __cplusplus
54 | }
55 | #endif
56 | 
57 | #endif
58 | 


--------------------------------------------------------------------------------
/src/H2Pack_ID_compress.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_ID_COMPRESS_H__
 2 | #define __H2PACK_ID_COMPRESS_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_aux_structs.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // Interpolative Decomposition (ID) using partial QR over rows of a target 
12 | // matrix. Partial pivoting QR may need to be upgraded to SRRQR later. 
13 | // Given an m*n matrix A, a rank-k ID approximation of A is of form
14 | //         A = U * A(J, :)
15 | // where J is a row index subset of size k, and U is a m*k matrix (if 
16 | // SRRQR is used, entries of U are bounded by a parameter 'f'). A(J,:) 
17 | // and U are usually called the skeleton and projection matrix. 
18 | // Input parameters:
19 | //   A          : Target matrix, stored in row major
20 | //   stop_type  : Partial QR stop criteria: QR_RANK, QR_REL_NRM, or QR_ABS_NRM
21 | //   stop_param : Pointer to partial QR stop parameter
22 | //   n_thread   : Number of threads used in this function
23 | //   QR_buff    : Working buffer for partial pivoting QR. If kdim == 1, size A->nrow.
24 | //                If kdim > 1, size (2*kdim+2)*A->ncol + (kdim+1)*A->nrow.
25 | //   ID_buff    : Size 4 * A->nrow, working buffer for ID compression
26 | //   kdim       : Dimension of tensor kernel's return (column block size)
27 | // Output parameters:
28 | //   U_ : Projection matrix, will be initialized in this function. If U_ == NULL,
29 | //        the projection matrix will not be calculated.
30 | //   J  : Row indices of the skeleton A
31 | void H2P_ID_compress(
32 |     H2P_dense_mat_p A, const int stop_type, void *stop_param, H2P_dense_mat_p *U_, 
33 |     H2P_int_vec_p J, const int n_thread, DTYPE *QR_buff, int *ID_buff, const int kdim
34 | );
35 | 
36 | #ifdef __cplusplus
37 | }
38 | #endif
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/src/H2Pack_SPDHSS_H2.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_SPDHSS_H2_H__
 2 | #define __H2PACK_SPDHSS_H2_H__
 3 | 
 4 | #include "H2Pack_typedef.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | // Construct an SPD HSS matrix from a H2 matrix
11 | // Input parameters:
12 | //   max_rank : Maximum rank of the HSS matrix
13 | //   reltol   : Relative tolerance in column-pivoted QR
14 | //   shift    : Diagonal shifting
15 | //   h2mat    : Constructed H2 matrix
16 | // Output parameter:
17 | //   *hssmat_ : The constructed SPD HSS matrix, A_{HSS} ~= A_{H2} + shift * I
18 | void H2P_SPDHSS_H2_build(
19 |     const int max_rank, const DTYPE reltol, const DTYPE shift, 
20 |     H2Pack_p h2mat, H2Pack_p *hssmat_
21 | );
22 | 
23 | #ifdef __cplusplus
24 | }
25 | #endif
26 | 
27 | #endif
28 | 
29 | 


--------------------------------------------------------------------------------
/src/H2Pack_build.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_BUILD_H__
 2 | #define __H2PACK_BUILD_H__
 3 | 
 4 | #include "H2Pack_typedef.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | // Build H2 representation with a kernel function
11 | // Input parameters:
12 | //   h2pack          : H2Pack structure with point partitioning info
13 | //   pp              : Array of proxy points for each level
14 | //   BD_JIT          : 0 or 1, if B and D matrices are computed just-in-time in matvec
15 | //   krnl_param      : Pointer to kernel function parameter array
16 | //   krnl_eval       : Pointer to kernel matrix evaluation function
17 | //   krnl_bimv       : Pointer to kernel matrix bi-matvec function
18 | //   krnl_bimv_flops : FLOPs needed in kernel bi-matvec
19 | // Output parameter:
20 | //   h2pack : H2Pack structure with H2 representation matrices
21 | void H2P_build(
22 |     H2Pack_p h2pack, H2P_dense_mat_p *pp, const int BD_JIT, void *krnl_param, 
23 |     kernel_eval_fptr krnl_eval, kernel_bimv_fptr krnl_bimv, const int krnl_bimv_flops
24 | );
25 | 
26 | #ifdef __cplusplus
27 | }
28 | #endif
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/src/H2Pack_build_periodic.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <string.h>
  3 | #include <stdlib.h>
  4 | #include <assert.h>
  5 | #include <math.h>
  6 | #include <time.h>
  7 | #include <omp.h>
  8 | 
  9 | #include "H2Pack_config.h"
 10 | #include "H2Pack_typedef.h"
 11 | #include "H2Pack_aux_structs.h"
 12 | #include "H2Pack_build_periodic.h"
 13 | #include "H2Pack_utils.h"
 14 | #include "utils.h"
 15 | 
 16 | // Build periodic block for root node and store it in h2pack->per_blk
 17 | void H2P_build_periodic_block(H2Pack_p h2pack)
 18 | {
 19 |     int pt_dim    = h2pack->pt_dim;
 20 |     int xpt_dim   = h2pack->xpt_dim;
 21 |     int krnl_dim  = h2pack->krnl_dim;
 22 |     int root_idx  = h2pack->root_idx;
 23 |     int n_lattice = h2pack->n_lattice;
 24 |     void  *krnl_param   = h2pack->krnl_param;
 25 |     void  *pkrnl_param  = h2pack->pkrnl_param;
 26 |     DTYPE *enbox0_width = h2pack->enbox + (root_idx * (2 * pt_dim) + pt_dim);
 27 |     DTYPE *per_lattices = h2pack->per_lattices;
 28 |     H2P_dense_mat_p  root_J_coord   = h2pack->J_coord[root_idx];
 29 |     H2P_dense_mat_p  root_J_coord_s = h2pack->tb[0]->mat0;
 30 |     H2P_dense_mat_p  krnl_mat_blk   = h2pack->tb[0]->mat1;
 31 |     kernel_eval_fptr krnl_eval  = h2pack->krnl_eval;
 32 |     kernel_eval_fptr pkrnl_eval = h2pack->pkrnl_eval;
 33 |     // shift[] below has 8 entries: pt_dim of them are written here, H2P_shift_coord() presumably reads xpt_dim of them
 34 |     ASSERT_PRINTF(pt_dim <= 8 && xpt_dim <= 8, "pt_dim = %d or xpt_dim = %d is larger than 8\n", pt_dim, xpt_dim);
 35 | 
 36 |     int n_point_root = root_J_coord->ncol;
 37 |     int per_blk_size = n_point_root * krnl_dim;
 38 |     DTYPE *per_blk = (DTYPE*) malloc_aligned(sizeof(DTYPE) * per_blk_size * per_blk_size, 64);
 39 |     ASSERT_PRINTF(per_blk != NULL, "Failed to allocate periodic block of size %d^2\n", per_blk_size);
 40 | 
 41 |     // O = pkernel({root_J_coord, root_J_coord});
 42 |     pkrnl_eval(
 43 |         root_J_coord->data, root_J_coord->ld, root_J_coord->ncol,
 44 |         root_J_coord->data, root_J_coord->ld, root_J_coord->ncol,
 45 |         pkrnl_param, per_blk, per_blk_size
 46 |     );
 47 |     DTYPE shift[8] = {0, 0, 0, 0, 0, 0, 0, 0};
 48 |     H2P_dense_mat_resize(krnl_mat_blk, per_blk_size, per_blk_size);
 49 |     H2P_dense_mat_resize(root_J_coord_s, xpt_dim, n_point_root);
 50 |     copy_matrix(
 51 |         sizeof(DTYPE), xpt_dim, n_point_root, root_J_coord->data, root_J_coord->ld, 
 52 |         root_J_coord_s->data, root_J_coord_s->ld, 0
 53 |     );
 54 |     for (int l = 0; l < n_lattice; l++)
 55 |     {
 56 |         // shift = [lattice(l, 1 : pt_dim) .* root_box(pt_dim+1 : 2 * pt_dim), zeros(1, xpt_dim - pt_dim)];
 57 |         DTYPE *lattice_l = per_lattices + l * pt_dim;
 58 |         for (int j = 0; j < pt_dim; j++) shift[j] = enbox0_width[j] * lattice_l[j];
 59 |         // root_J_coord_s = coord_shift(root_J_coord, shift, 1);
 60 |         H2P_shift_coord(root_J_coord_s, shift,  1.0);
 61 |         // O = O - kernel({root_J_coord, root_J_coord_s});
 62 |         krnl_eval(
 63 |             root_J_coord->data,   root_J_coord->ld,   root_J_coord->ncol,
 64 |             root_J_coord_s->data, root_J_coord_s->ld, root_J_coord->ncol,
 65 |             krnl_param, krnl_mat_blk->data, krnl_mat_blk->ld
 66 |         );
 67 |         #pragma omp simd
 68 |         for (int i = 0; i < per_blk_size * per_blk_size; i++)
 69 |             per_blk[i] -= krnl_mat_blk->data[i];
 70 |         // Reset root_J_coord_s = root_J_coord
 71 |         H2P_shift_coord(root_J_coord_s, shift, -1.0);
 72 |     }
 73 |     h2pack->per_blk = per_blk;
 74 | }
 75 | 
 76 | // Build H2 representation with a regular kernel function and
 77 | // a periodic system kernel (Ewald summation) function
 78 | void H2P_build_periodic(
 79 |     H2Pack_p h2pack, H2P_dense_mat_p *pp, const int BD_JIT, 
 80 |     void *krnl_param,  kernel_eval_fptr krnl_eval, 
 81 |     void *pkrnl_param, kernel_eval_fptr pkrnl_eval, 
 82 |     kernel_mv_fptr krnl_mv, const int krnl_mv_flops
 83 | )
 84 | {
 85 |     double st, et;
 86 |     double *timers = h2pack->timers;
 87 | 
 88 |     if (pp == NULL)
 89 |     {
 90 |         ERROR_PRINTF("You need to provide a set of proxy points.\n");
 91 |         return;
 92 |     }
 93 |     
 94 |     if (krnl_eval == NULL)
 95 |     {
 96 |         ERROR_PRINTF("You need to provide a valid krnl_eval().\n");
 97 |         return;
 98 |     }
 99 | 
100 |     if (pkrnl_eval == NULL)  // pkrnl_eval() is called unconditionally in step 4
101 |     {
102 |         ERROR_PRINTF("You need to provide a valid pkrnl_eval().\n");
103 |         return;
104 |     }
105 | 
106 |     if (BD_JIT != 1)
107 |     {
108 |         ERROR_PRINTF("Only support BD_JIT=1 in this function for the moment.\n");
109 |         return;
110 |     }
111 | 
112 |     h2pack->pp = pp;
113 |     h2pack->BD_JIT = BD_JIT;
114 |     h2pack->krnl_param  = krnl_param;
115 |     h2pack->krnl_eval   = krnl_eval;
116 |     h2pack->pkrnl_param = pkrnl_param;
117 |     h2pack->pkrnl_eval  = pkrnl_eval;
118 |     h2pack->krnl_mv     = krnl_mv;
119 |     h2pack->krnl_bimv_flops = krnl_mv_flops - 2;  // NOTE(review): the reason for "- 2" is not visible here, confirm
120 |     if (BD_JIT == 1 && krnl_mv == NULL) 
121 |         WARNING_PRINTF("krnl_eval() will be used in BD_JIT matvec. For better performance, consider using a krnl_mv().\n");
122 | 
123 |     // 1. Build projection matrices and skeleton row sets
124 |     st = get_wtime_sec();
125 |     H2P_build_H2_UJ_proxy(h2pack);
126 |     et = get_wtime_sec();
127 |     timers[U_BUILD_TIMER_IDX] = et - st;
128 |     // 2. Generate H2 generator matrices metadata
129 |     st = get_wtime_sec();
130 |     H2P_generate_B_metadata(h2pack);
131 |     et = get_wtime_sec();
132 |     timers[B_BUILD_TIMER_IDX] = et - st;
133 |     // 3. Generate H2 dense blocks metadata
134 |     st = get_wtime_sec();
135 |     H2P_generate_D_metadata(h2pack);
136 |     et = get_wtime_sec();
137 |     timers[D_BUILD_TIMER_IDX] = et - st;
138 |     // 4. Build periodic block for root node, add its timing to B build timing
139 |     st = get_wtime_sec();
140 |     H2P_build_periodic_block(h2pack);
141 |     et = get_wtime_sec();
142 |     timers[B_BUILD_TIMER_IDX] += et - st;  // Accumulate, do not overwrite the timing of step 2
143 |     // 5. Set up forward and backward permutation indices
144 |     int n_point    = h2pack->n_point;
145 |     int krnl_dim   = h2pack->krnl_dim;
146 |     int *coord_idx = h2pack->coord_idx;
147 |     int *fwd_pmt_idx = (int*) malloc(sizeof(int) * n_point * krnl_dim);
148 |     int *bwd_pmt_idx = (int*) malloc(sizeof(int) * n_point * krnl_dim);
149 |     for (int i = 0; i < n_point; i++)
150 |         for (int j = 0; j < krnl_dim; j++)
151 |         {
152 |             fwd_pmt_idx[i * krnl_dim + j] = coord_idx[i] * krnl_dim + j;
153 |             bwd_pmt_idx[coord_idx[i] * krnl_dim + j] = i * krnl_dim + j;
154 |         }
155 |     h2pack->fwd_pmt_idx = fwd_pmt_idx;
156 |     h2pack->bwd_pmt_idx = bwd_pmt_idx;
157 | }
158 | 


--------------------------------------------------------------------------------
/src/H2Pack_build_periodic.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_BUILD_PERIODIC_H__
 2 | #define __H2PACK_BUILD_PERIODIC_H__
 3 | 
 4 | #include "H2Pack_typedef.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | // Build H2 representation with a regular kernel function and
11 | // a periodic system kernel (Ewald summation) function
12 | // Input parameters:
13 | //   h2pack        : H2Pack structure with point partitioning info
14 | //   pp            : Array of proxy points for each level
15 | //   BD_JIT        : 0 or 1, if B and D matrices are computed just-in-time in matvec
16 | //   krnl_param    : Pointer to kernel function parameter array
17 | //   krnl_eval     : Pointer to kernel matrix evaluation function
18 | //   pkrnl_param   : Pointer to periodic system kernel (Ewald summation) function parameter array
19 | //   pkrnl_eval    : Pointer to periodic system kernel (Ewald summation) matrix evaluation function
20 | //   krnl_mv       : Pointer to kernel matvec function
21 | //   krnl_mv_flops : FLOPs needed in kernel matvec
22 | // Output parameter:
23 | //   h2pack : H2Pack structure with H2 representation matrices
24 | void H2P_build_periodic(
25 |     H2Pack_p h2pack, H2P_dense_mat_p *pp, const int BD_JIT, 
26 |     void *krnl_param,  kernel_eval_fptr krnl_eval, 
27 |     void *pkrnl_param, kernel_eval_fptr pkrnl_eval, 
28 |     kernel_mv_fptr krnl_mv, const int krnl_mv_flops
29 | );
30 | 
31 | #ifdef __cplusplus
32 | }
33 | #endif
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/src/H2Pack_build_with_sample_point.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_BUILD_WITH_SAMPLE_POINT_H__
 2 | #define __H2PACK_BUILD_WITH_SAMPLE_POINT_H__
 3 | 
 4 | #include "H2Pack_typedef.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | // Select sample points for constructing H2 projection and skeleton matrices 
11 | // This algorithm is based on the MATLAB code provided by the author of the paper
12 | // doi/10.1109/IPDPS47924.2020.00082, but the algorithm is not discussed in the paper
13 | // Input parameters:
14 | //   h2pack      : Initialized H2Pack structure
15 | //   krnl_param  : Pointer to kernel function parameter array
16 | //   krnl_eval   : Pointer to kernel matrix evaluation function
17 | //   tau         : Separation threshold, usually is 0.7
18 | // Output parameter:
19 | //   *sample_points_  : Array of sample points for each node
20 | void H2P_select_sample_point(
21 |     H2Pack_p h2pack, const void *krnl_param, kernel_eval_fptr krnl_eval, 
22 |     const DTYPE tau, H2P_dense_mat_p **sample_points_
23 | );
24 | 
25 | // Build H2 representation with a kernel function and sample points
26 | // Input parameters:
27 | //   h2pack          : H2Pack structure with point partitioning info
28 | //   sample_pt       : Array of sample points for each node
29 | //   BD_JIT          : 0 or 1, if B and D matrices are computed just-in-time in matvec
30 | //   krnl_param      : Pointer to kernel function parameter array
31 | //   krnl_eval       : Pointer to kernel matrix evaluation function
32 | //   krnl_bimv       : Pointer to kernel matrix bi-matvec function
33 | //   krnl_bimv_flops : FLOPs needed in kernel bi-matvec
34 | // Output parameter:
35 | //   h2pack : H2Pack structure with H2 representation matrices
36 | void H2P_build_with_sample_point(
37 |     H2Pack_p h2pack, H2P_dense_mat_p *sample_pt, const int BD_JIT, void *krnl_param, 
38 |     kernel_eval_fptr krnl_eval, kernel_bimv_fptr krnl_bimv, const int krnl_bimv_flops
39 | );
40 | 
41 | #ifdef __cplusplus
42 | }
43 | #endif
44 | 
45 | #endif
46 | 


--------------------------------------------------------------------------------
/src/H2Pack_config.h:
--------------------------------------------------------------------------------
  1 | #ifndef __H2PACK_CONFIG_H__
  2 | #define __H2PACK_CONFIG_H__
  3 | 
  4 | // Parameters used in H2Pack
  5 | 
  6 | #define DOUBLE_SIZE     8
  7 | #define FLOAT_SIZE      4
  8 | 
  9 | #ifndef DTYPE_SIZE
 10 | #define DTYPE_SIZE      DOUBLE_SIZE     // Matrix data type: double or float
 11 | #endif
 12 | 
 13 | #if DTYPE_SIZE == DOUBLE_SIZE           // Marcos for double data type
 14 | #define DTYPE           double          // Data type
 15 | #define DTYPE_FMTSTR    "%lf"           // Data type format string
 16 | #define DABS            fabs            // Abs function
 17 | #define DLOG            log             // Natural logarithm function
 18 | #define DLOG2           log2            // Base-2 logarithm function
 19 | #define DEXP            exp             // Exponential function
 20 | #define DPOW            pow             // Power function
 21 | #define DSQRT           sqrt            // Sqrt function
 22 | #define DSIN            sin             // Sine function
 23 | #define DCOS            cos             // Cosine function
 24 | #define DERF            erf             // Erf function
 25 | #define DERFC           erfc            // Erfc function
 26 | #define DFLOOR          floor           // Floor function
 27 | #define DROUND          round           // Rounding function
 28 | #define DCEIL           ceil            // Ceiling function
 29 | #define DFMOD           fmod            // Floating point remainder function
 30 | #define CBLAS_NRM2      cblas_dnrm2     // CBLAS vector 2-norm 
 31 | #define CBLAS_DOT       cblas_ddot      // CBLAS vector dot product
 32 | #define CBLAS_GEMV      cblas_dgemv     // CBLAS matrix-vector multiplication
 33 | #define CBLAS_GEMM      cblas_dgemm     // CBLAS matrix-matrix multiplication
 34 | #define CBLAS_SYRK      cblas_dsyrk     // CBLAS symmetric rank-k update
 35 | #define CBLAS_TRSM      cblas_dtrsm     // CBLAS triangle solve
 36 | #define CBLAS_TRMM      cblas_dtrmm     // CBLAS triangle matrix multiplication
 37 | #define LAPACK_GETRF    LAPACKE_dgetrf  // LAPACK LU factorization
 38 | #define LAPACK_GETRS    LAPACKE_dgetrs  // LAPACK linear system solve using LU factorization
 39 | #define LAPACK_GETRI    LAPACKE_dgetri  // LAPACK LU inverse matrix
 40 | #define LAPACK_POTRF    LAPACKE_dpotrf  // LAPACK Cholesky factorization
 41 | #define LAPACK_POTRS    LAPACKE_dpotrs  // LAPACK linear system solve using Cholesky factorization
 42 | #define LAPACK_POTRI    LAPACKE_dpotri  // LAPACK Cholesky inverse matrix
 43 | #define LAPACK_GEQRF    LAPACKE_dgeqrf  // LAPACK QR factorization
 44 | #define LAPACK_GEQPF    LAPACKE_dgeqpf  // LAPACK QR factorization with column pivoting
 45 | #define LAPACK_ORGQR    LAPACKE_dorgqr  // LAPACK QR Q matrix explicit construction
 46 | #define LAPACK_ORMQR    LAPACKE_dormqr  // LAPACK QR Q matrix multiplies another matrix
 47 | #define LAPACK_SYEVD    LAPACKE_dsyevd  // LAPACK eigenvalue decomposition
 48 | #define LAPACK_GESVD    LAPACKE_dgesvd  // LAPACK singular value decomposition
 49 | #define N_DTYPE_64B     8               // 8 double == 64 bytes, for alignment
 50 | #define SIMD_LEN        SIMD_LEN_D      // SIMD vector length
 51 | #define D_EPS           DBL_EPSILON     // Double precision machine epsilon
 52 | #define ASTER_DTYPE_DOUBLE
 53 | #endif
 54 | 
 55 | 
 56 | #if DTYPE_SIZE == FLOAT_SIZE            // Macros for float data type
 57 | #define DTYPE           float
 58 | #define DTYPE_FMTSTR    "%f"
 59 | #define DABS            fabsf
 60 | #define DLOG            logf
 61 | #define DLOG2           log2f
 62 | #define DEXP            expf
 63 | #define DPOW            powf
 64 | #define DSQRT           sqrtf
 65 | #define DSIN            sinf
 66 | #define DCOS            cosf
 67 | #define DERF            erff
 68 | #define DERFC           erfcf
 69 | #define DFLOOR          floorf
 70 | #define DROUND          roundf
 71 | #define DFMOD           fmodf
 72 | #define DCEIL           ceilf
 73 | #define CBLAS_NRM2      cblas_snrm2
 74 | #define CBLAS_DOT       cblas_sdot
 75 | #define CBLAS_GEMV      cblas_sgemv
 76 | #define CBLAS_GEMM      cblas_sgemm
 77 | #define CBLAS_SYRK      cblas_ssyrk
 78 | #define CBLAS_TRSM      cblas_strsm
 79 | #define CBLAS_TRMM      cblas_strmm
 80 | #define LAPACK_GETRF    LAPACKE_sgetrf
 81 | #define LAPACK_GETRS    LAPACKE_sgetrs
 82 | #define LAPACK_GETRI    LAPACKE_sgetri
 83 | #define LAPACK_POTRF    LAPACKE_spotrf
 84 | #define LAPACK_POTRS    LAPACKE_spotrs
 85 | #define LAPACK_POTRI    LAPACKE_spotri
 86 | #define LAPACK_GEQRF    LAPACKE_sgeqrf
 87 | #define LAPACK_GEQPF    LAPACKE_sgeqpf
 88 | #define LAPACK_ORGQR    LAPACKE_sorgqr
 89 | #define LAPACK_ORMQR    LAPACKE_sormqr
 90 | #define LAPACK_SYEVD    LAPACKE_ssyevd
 91 | #define LAPACK_GESVD    LAPACKE_sgesvd
 92 | #define N_DTYPE_64B     16
 93 | #define SIMD_LEN        SIMD_LEN_S
 94 | #define D_EPS           FLT_EPSILON
 95 | #define ASTER_DTYPE_FLOAT
 96 | #endif
 97 | 
 98 | #define QR_RANK         0               // Partial QR stop criteria: maximum rank
 99 | #define QR_REL_NRM      1               // Partial QR stop criteria: maximum relative column 2-norm
100 | #define QR_ABS_NRM      2               // Partial QR stop criteria: maximum absolute column 2-norm
101 | 
102 | #define ALIGN_SIZE      64              // Memory allocation alignment
103 | #define ALPHA_H2        0.999999        // Admissible coefficient for H2,  == 1 here
104 | #define ALPHA_HSS       -0.000001       // Admissible coefficient for HSS, == 0 here
105 | 
106 | #define BD_NTASK_THREAD 10              // Average number of tasks each thread has in B & D build
107 | 
108 | #include "linalg_lib_wrapper.h"
109 | #include "ASTER/include/aster.h"
110 | 
111 | #endif
112 | 


--------------------------------------------------------------------------------
/src/H2Pack_file_IO.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_FILE_IO_H__
 2 | #define __H2PACK_FILE_IO_H__
 3 | 
 4 | #include "H2Pack_typedef.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | // Store a constructed H2 representation to a set of files 
11 | // Input parameters:
12 | //   h2pack          : H2Pack structure after calling H2P_build()
13 | //   meta_json_fname : Metadata JSON file name
14 | //   aux_json_fname  : Auxiliary JSON file name
15 | //   binary_fname    : Binary data file name
16 | void H2P_store_to_file(
17 |     H2Pack_p h2pack, const char *meta_json_fname, 
18 |     const char *aux_json_fname, const char *binary_fname
19 | );
20 | 
21 | // Load a constructed H2 representation from a set of files
22 | // Input parameters:
23 | //   meta_json_fname : Metadata JSON file name
24 | //   aux_json_fname  : Auxiliary JSON file name, can be NULL
25 | //   binary_fname    : Binary data file name
26 | //   BD_JIT          : If H2Pack should use just-in-time matvec mode, 0 or 1
27 | //   krnl_param      : Pointer to the krnl_eval parameter buffer
28 | //   krnl_eval       : Pointer to the kernel matrix evaluation function, can be NULL
29 | //   krnl_bimv       : Pointer to the kernel matrix bi-matvec function, can be NULL
30 | //   krnl_bimv_flops : Number of flops required for each bi-matvec operation, for performance statistics only
31 | // Output parameter:
32 | //   *h2pack_ : H2Pack structure constructed from given files
33 | // Notes:
34 | //   If only meta_json_fname and binary_fname are valid non-empty values, the constructed
35 | //   H2Pack matrix can only be used to perform H2P_matvec(). Performing other operations
36 | //   may crash the program. 
37 | void H2P_read_from_file(
38 |     H2Pack_p *h2pack_, const char *meta_json_fname, const char *aux_json_fname, 
39 |     const char *binary_fname, const int BD_JIT, void *krnl_param, 
40 |     kernel_eval_fptr krnl_eval, kernel_bimv_fptr krnl_bimv, const int krnl_bimv_flops
41 | );
42 | 
43 | #ifdef __cplusplus
44 | }
45 | #endif
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/src/H2Pack_gen_proxy_point.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_GEN_PROXY_POINT_H__
 2 | #define __H2PACK_GEN_PROXY_POINT_H__
 3 | 
 4 | #include "H2Pack_typedef.h"
 5 | 
 6 | #ifdef __cplusplus
 7 | extern "C" {
 8 | #endif
 9 | 
10 | void H2P_generate_proxy_point_ID(
11 |     const int pt_dim, const int krnl_dim, const DTYPE reltol, const int max_level, const int min_level,
12 |     DTYPE max_L, const void *krnl_param, kernel_eval_fptr krnl_eval, H2P_dense_mat_p **pp_
13 | );
14 | 
15 | // Calculate the enclosing box of a given set of points and adjust it if the proxy point file is provided
16 | // Input parameters:
17 | //   pt_dim  : Point dimension
18 | //   n_point : Number of points
19 | //   coord   : Size pt_dim-by-n_point, each column is a point coordinate
20 | //   fname   : Proxy point file name, can be NULL
21 | // Output parameter:
22 | //   enbox_ : Box that encloses all points in this node.
23 | //            enbox[0 : pt_dim-1] are the corner with the smallest
24 | //            x/y/z/... coordinates. enbox[pt_dim : 2*pt_dim-1] are 
25 | //            the sizes of this box.
26 | void H2P_calc_enclosing_box(const int pt_dim, const int n_point, const DTYPE *coord, const char *fname, DTYPE **enbox_);
27 | 
28 | // Write a set of proxy points to a text file
29 | // Input parameters:
30 | //   fname     : File name
31 | //   pt_dim    : Point dimension
32 | //   reltol    : Proxy point selection relative error tolerance
33 | //   L3_nlayer : Y box exterior boundary size factor
34 | //   minL      : Radius of the minimal proxy point set (pp[0])
35 | //   num_pp    : Number of proxy point sets
36 | //   pp        : Proxy point sets. Radius of pp[i] should == 2 * radius of pp[i-1]
37 | void H2P_write_proxy_point_file(
38 |     const char *fname, const int pt_dim, const DTYPE reltol, const int L3_nlayer, 
39 |     const DTYPE minL, const int num_pp, H2P_dense_mat_p *pp
40 | );
41 | 
42 | // Generate proxy points for constructing H2 projection and skeleton matrices using 
43 | // ID compress, also try to load proxy points from a file and update this file
44 | // Input parameters:
45 | //   h2pack     : Initialized H2Pack structure
46 | //   krnl_param : Pointer to kernel function parameter array
47 | //   krnl_eval  : Pointer to kernel matrix evaluation function
48 | //   fname      : Proxy point file name, if == NULL or cannot find that file, compute all proxy points
49 | // Output parameter:
50 | //   pp_  : Array of proxy points for each level
51 | void H2P_generate_proxy_point_ID_file(
52 |     H2Pack_p h2pack, const void *krnl_param, kernel_eval_fptr krnl_eval, 
53 |     const char *fname, H2P_dense_mat_p **pp_
54 | );
55 | 
56 | // Generate uniformly distributed proxy points on a box surface for constructing
57 | // H2 projection and skeleton matrices for SOME kernel function.
58 | // This function is isolated because if the enclosing box for all points is fixed,
59 | // we only need to generate proxy points once and use them repeatedly.
60 | // Input parameters:
61 | //   pt_dim     : Dimension of point coordinate
62 | //   xpt_dim    : Dimension of extended point coordinate (for RPY xpt_dim == pt_dim+1, otherwise set xpt_dim == pt_dim)
63 | //   min_npt    : Minimum number of proxy points on the box surface
64 | //   max_level  : Maximum level (included) of a H2 tree, (root level == 0)
65 | //   min_level  : Minimum level that needs proxy points
66 | //   max_L      : The size of the root node's enclosing box
67 | // Output parameter:
68 | //   pp_  : Array of proxy points for each level
69 | void H2P_generate_proxy_point_surface(
70 |     const int pt_dim, const int xpt_dim, const int min_npt, const int max_level, 
71 |     const int min_level, DTYPE max_L, H2P_dense_mat_p **pp_
72 | );
73 | 
74 | #ifdef __cplusplus
75 | }
76 | #endif
77 | 
78 | #endif
79 | 


--------------------------------------------------------------------------------
/src/H2Pack_kernels.h:
--------------------------------------------------------------------------------
1 | #ifndef __H2PACK_KERNELS_H__
2 | #define __H2PACK_KERNELS_H__
3 | 
4 | #include "H2Pack_2D_kernels.h"
5 | 
6 | #include "H2Pack_3D_kernels.h"
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/src/H2Pack_matmul.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_MATMUL_H__
 2 | #define __H2PACK_MATMUL_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // H2 representation multiplies a dense general matrix
12 | // Input parameters:
13 | //   h2pack : H2Pack structure with H2 representation matrices
14 | //   layout : CblasRowMajor/CblasColMajor if x & y are stored in row/column-major style
15 | //   n_vec  : Number of column vectors in mat_x
16 | //   mat_x  : Size >= h2pack->krnl_mat_size * ldx if layout == CblasRowMajor, 
17 | //            size >=                 n_vec * ldx if layout == CblasColMajor, 
18 | //            input dense matrix, the leading h2pack->krnl_mat_size-by-n_vec part of 
19 | //            mat_x will be used
20 | //   ldx    : Leading dimension of mat_x, must >= n_vec if layout == CblasRowMajor,
21 | //            must >= h2pack->krnl_mat_size if layout == CblasColMajor
22 | //   ldy    : Leading dimension of mat_y, the same requirement of ldx
23 | // Output parameter:
24 | //   mat_y  : Size is the same as mat_x, output dense matrix, mat_y := A_{H2} * mat_x
25 | void H2P_matmul(
26 |     H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 
27 |     const DTYPE *mat_x, const int ldx, DTYPE *mat_y, const int ldy
28 | );
29 | 
30 | // Permute rows of the multiplicand matrix from the original point ordering to 
31 | // the sorted point ordering inside H2Pack (forward), or vice versa (backward)
32 | // for the output matrix. 
33 | // These two functions will be called automatically in H2P_matmul(), you 
34 | // don't need to manually call them. We just provide the interface here.
35 | //   h2pack : H2Pack structure with H2 representation matrices
36 | //   layout : CblasRowMajor/CblasColMajor if x & y are stored in row/column-major style
37 | //   n_vec  : Number of column vectors in mat_x
38 | //   mat_x  : Size >= h2pack->krnl_mat_size * ldx if layout == CblasRowMajor, 
39 | //            size >=                 n_vec * ldx if layout == CblasColMajor, 
40 | //            dense matrix to be permuted, the leading h2pack->krnl_mat_size-by-n_vec 
41 | //            part of mat_x will be used
42 | //   ldx    : Leading dimension of mat_x, must >= n_vec if layout == CblasRowMajor,
43 | //            must >= h2pack->krnl_mat_size if layout == CblasColMajor
44 | //   ldp    : Leading dimension of pmt_mat_x, the same requirement of ldx
45 | // Output parameter:
46 | //   pmt_mat_x : Size is the same as mat_x, permuted dense matrix
47 | void H2P_permute_matrix_row_forward(
48 |     H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 
49 |     const DTYPE *mat_x, const int ldx, DTYPE *pmt_mat_x, const int ldp
50 | );
51 | void H2P_permute_matrix_row_backward(
52 |     H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 
53 |     const DTYPE *mat_x, const int ldx, DTYPE *pmt_mat_x, const int ldp
54 | );
55 | 
56 | #ifdef __cplusplus
57 | }
58 | #endif
59 | 
60 | #endif
61 | 


--------------------------------------------------------------------------------
/src/H2Pack_matmul_periodic.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_MATMUL_PERIODIC_H__
 2 | #define __H2PACK_MATMUL_PERIODIC_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // H2 representation multiplies a dense general matrix, for periodic system
12 | // Input parameters:
13 | //   h2pack : H2Pack structure with H2 representation matrices
14 | //   layout : CblasRowMajor/CblasColMajor if x & y are stored in row/column-major style
15 | //   n_vec  : Number of column vectors in mat_x
16 | //   mat_x  : Size >= h2pack->krnl_mat_size * ldx, input dense matrix, the leading 
17 | //            h2pack->krnl_mat_size-by-n_vec part of mat_x will be used
18 | //   ldx    : Leading dimension of mat_x, should >= n_vec if layout == CblasRowMajor,
19 | //            should >= h2pack->krnl_mat_size if layout == CblasColMajor
20 | //   ldy    : Leading dimension of mat_y, the same requirement of ldx
21 | // Output parameter:
22 | //   mat_y  : Size is the same as mat_x, output dense matrix, mat_y := A_{H2} * mat_x
23 | void H2P_matmul_periodic(
24 |     H2Pack_p h2pack, const CBLAS_LAYOUT layout, const int n_vec, 
25 |     const DTYPE *mat_x, const int ldx, DTYPE *mat_y, const int ldy
26 | );
27 | 
28 | #ifdef __cplusplus
29 | }
30 | #endif
31 | 
32 | #endif
33 | 


--------------------------------------------------------------------------------
/src/H2Pack_matvec.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_MATVEC_H__
 2 | #define __H2PACK_MATVEC_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // H2 representation multiplies a column vector
12 | // Input parameters:
13 | //   h2pack : H2Pack structure with H2 representation matrices
14 | //   x      : Input dense vector
15 | // Output parameter:
16 | //   y : Output dense vector
17 | void H2P_matvec(H2Pack_p h2pack, const DTYPE *x, DTYPE *y);
18 | 
19 | // Permute the multiplicand vector from the original point ordering to the 
20 | // sorted point ordering inside H2Pack (forward), or vice versa (backward)
21 | // for the output vector. 
22 | // These two functions will be called automatically in H2P_matvec(), you 
23 | // don't need to manually call them. We just provide the interface here.
24 | // Input parameters:
25 | //   h2pack : H2Pack structure with H2 representation matrices
26 | //   x      : Vector to be permuted
27 | // Output parameter:
28 | //   pmt_x  : Permuted vector
29 | void H2P_permute_vector_forward (H2Pack_p h2pack, const DTYPE *x, DTYPE *pmt_x);
30 | void H2P_permute_vector_backward(H2Pack_p h2pack, const DTYPE *x, DTYPE *pmt_x);
31 | 
32 | #ifdef __cplusplus
33 | }
34 | #endif
35 | 
36 | #endif
37 | 


--------------------------------------------------------------------------------
/src/H2Pack_matvec_periodic.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_MATVEC_PERIODIC_H__
 2 | #define __H2PACK_MATVEC_PERIODIC_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // H2 representation multiplies a column vector, for periodic system
12 | // Input parameters:
13 | //   h2pack : H2Pack structure with H2 representation matrices
14 | //   x      : Input dense vector
15 | // Output parameter:
16 | //   y : Output dense vector
17 | void H2P_matvec_periodic(H2Pack_p h2pack, const DTYPE *x, DTYPE *y);
18 | 
19 | #ifdef __cplusplus
20 | }
21 | #endif
22 | 
23 | #endif
24 | 


--------------------------------------------------------------------------------
/src/H2Pack_partition.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_PARTITION_H__
 2 | #define __H2PACK_PARTITION_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // Hierarchical point partitioning for H2 / HSS construction
12 | // Input parameters:
13 | //   h2pack          : H2Pack structure initialized using H2P_init()
14 | //   n_point         : Number of points for the kernel matrix
15 | //   coord           : Matrix, size h2pack->pt_dim * n_point, each column is a point coordinate
16 | //   max_leaf_points : Maximum number of points in a leaf node's box. If <= 0, will use 200 for
17 | //                     2D points and 400 for other dimensions
18 | //   max_leaf_size   : Maximum size of a leaf node's box. If == 0, max_leaf_points
19 | //                     will be the only restriction.
20 | // Output parameter:
21 | //   h2pack : H2Pack structure with point partitioning info
22 | void H2P_partition_points(
23 |     H2Pack_p h2pack, const int n_point, const DTYPE *coord, 
24 |     int max_leaf_points, DTYPE max_leaf_size
25 | );
26 | 
27 | // Calculate reduced (in)admissible pairs for HSS
28 | // Input parameter:
29 | //   h2pack : H2Pack structure after calling H2P_partition_points()
30 | // Output parameter:
31 | //   h2pack : H2Pack structure with reduced (in)admissible pairs for HSS
32 | void H2P_HSS_calc_adm_inadm_pairs(H2Pack_p h2pack);
33 | 
34 | #ifdef __cplusplus
35 | }
36 | #endif
37 | 
38 | #endif
39 | 


--------------------------------------------------------------------------------
/src/H2Pack_partition_periodic.h:
--------------------------------------------------------------------------------
 1 | #ifndef __H2PACK_PARTITION_PERIODIC_H__
 2 | #define __H2PACK_PARTITION_PERIODIC_H__
 3 | 
 4 | #include "H2Pack_config.h"
 5 | #include "H2Pack_typedef.h"
 6 | 
 7 | #ifdef __cplusplus
 8 | extern "C" {
 9 | #endif
10 | 
11 | // Hierarchical point partitioning for periodic system H2 construction 
12 | // Input parameters:
13 | //   h2pack          : H2Pack structure initialized using H2P_init()
14 | //   n_point         : Number of points for the kernel matrix
15 | //   coord           : Matrix, size h2pack->pt_dim * n_point, each column is a point coordinate
16 | //   max_leaf_points : Maximum number of points in a leaf node's box. If <= 0, will use 200 for
17 | //                     2D points and 400 for other dimensions
18 | //   max_leaf_size   : Maximum size of a leaf node's box. If == 0, max_leaf_points
19 | //                     will be the only restriction.
20 | //   unit_cell       : Array, size 2 * h2pack->pt_dim, unit cell of the periodic system, 
21 | //                     == the largest enclosing box for all points
22 | // Output parameter:
23 | //   h2pack : H2Pack structure with point partitioning info
24 | void H2P_partition_points_periodic(
25 |     H2Pack_p h2pack, const int n_point, const DTYPE *coord, int max_leaf_points, 
26 |     DTYPE max_leaf_size, DTYPE *unit_cell
27 | );
28 | 
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 | 
33 | #endif
34 | 


--------------------------------------------------------------------------------
/src/ICC-MKL.make:
--------------------------------------------------------------------------------
1 | CC           = icc
2 | USE_MKL      = 1
3 | USE_OPENBLAS = 0
4 | # The settings above are consumed by the shared rules in common.make
5 | include common.make


--------------------------------------------------------------------------------
/src/common.make:
--------------------------------------------------------------------------------
 1 | # Shared build rules for libH2Pack. Wrapper makefiles such as ICC-MKL.make set
 2 | # CC, USE_MKL and USE_OPENBLAS and then include this file.
 3 | 
 4 | LIB_A   = libH2Pack.a
 5 | LIB_SO  = libH2Pack.so
 6 | 
 7 | # Every .c file in this directory is compiled; foo.c becomes foo.c.o.
 8 | # Simple (:=) assignment so the directory is globbed only once.
 9 | C_SRCS  := $(wildcard *.c)
10 | C_OBJS  := $(C_SRCS:.c=.c.o)
11 | 
12 | AR      = ar rcs
13 | DEFS    = 
14 | INCS    = 
15 | # Recursive (=) on purpose: DEFS and INCS are extended further down and must
16 | # still be picked up when CFLAGS is expanded in the compile recipe.
17 | CFLAGS  = $(INCS) -Wall -g -std=gnu11 -O3 -fPIC $(DEFS)
18 | 
19 | # Query the compiler banner once and choose the OpenMP / architecture flags from it.
20 | # A substring test is used instead of counting matching lines, so additional
21 | # output lines that also mention the compiler name do not disable the flags.
22 | CC_VERSION := $(shell $(CC) --version 2>&1)
23 | 
24 | ifneq ($(findstring icc,$(CC_VERSION)),)
25 | CFLAGS += -qopenmp -xHost
26 | endif
27 | 
28 | ifneq ($(findstring gcc,$(CC_VERSION)),)
29 | CFLAGS += -fopenmp -march=native -Wno-unused-result -Wno-unused-function
30 | endif
31 | 
32 | ifeq ($(strip $(USE_MKL)), 1)
33 | DEFS   += -DUSE_MKL
34 | CFLAGS += -mkl
35 | endif
36 | 
37 | # If you use OpenBLAS, modify OPENBLAS_INSTALL_DIR here
38 | OPENBLAS_INSTALL_DIR = ../../OpenBLAS-git/install
39 | ifeq ($(strip $(USE_OPENBLAS)), 1)
40 | DEFS   += -DUSE_OPENBLAS
41 | INCS   += -I$(OPENBLAS_INSTALL_DIR)/include
42 | endif
43 | 
44 | # Delete the default old-fashioned double-suffix rules
45 | .SUFFIXES:
46 | 
47 | # Remove a target whose recipe failed so a partial file is never taken as up to date
48 | .DELETE_ON_ERROR:
49 | 
50 | .SECONDARY: $(C_OBJS)
51 | 
52 | # These targets name actions, not files; a stray file called "clean" must not disable them
53 | .PHONY: all install clean
54 | 
55 | all: install
56 | 
57 | # Build both libraries, then copy them and the headers into ../lib and ../include
58 | install: $(LIB_A) $(LIB_SO)
59 | 	mkdir -p ../lib
60 | 	mkdir -p ../include
61 | 	mkdir -p ../include/ASTER/include
62 | 	cp -u $(LIB_A)  ../lib/$(LIB_A)
63 | 	cp -u $(LIB_SO) ../lib/$(LIB_SO)
64 | 	cp -u *.h ../include/
65 | 	cp -u ASTER/include/*.h ../include/ASTER/include
66 | 
67 | $(LIB_A): $(C_OBJS) 
68 | 	$(AR) $@ $^
69 | 
70 | $(LIB_SO): $(C_OBJS) 
71 | 	$(CC) -shared -o $@ $^
72 | 
73 | # Compile only the source file ($<), not every prerequisite
74 | %.c.o: %.c
75 | 	$(CC) $(CFLAGS) -c $< -o $@
76 | 
77 | clean:
78 | 	rm -f $(C_OBJS) $(LIB_A) $(LIB_SO)
79 | 


--------------------------------------------------------------------------------
/src/linalg_lib_wrapper.h:
--------------------------------------------------------------------------------
 1 | #ifndef __LINALG_LIB_WRAPPER_H__
 2 | #define __LINALG_LIB_WRAPPER_H__
 3 | 
 4 | // Wrapper for linear algebra library (BLAS, LAPACK)
 5 | 
 6 | #if !defined(USE_MKL) && !defined(USE_OPENBLAS)
 7 | #define USE_OPENBLAS
 8 | #endif
 9 | 
10 | #ifdef USE_MKL
11 | #include <mkl.h>
12 | #define BLAS_SET_NUM_THREADS mkl_set_num_threads
13 | #endif
14 | 
15 | #ifdef USE_OPENBLAS
16 | #include <cblas.h>
17 | #include <lapacke.h>
18 | #define BLAS_SET_NUM_THREADS openblas_set_num_threads
19 | #endif
20 | 
21 | #endif
22 | 
23 | 


--------------------------------------------------------------------------------