├── LICENSE ├── Makefile.unix ├── Makefile.win_intel ├── README.ja.md ├── README.md ├── bicgstab_csr_cusparse.c ├── bicgstab_csr_magma.c ├── bicgstab_csr_mkl.c ├── bicgstab_mkl.c ├── blas1.c ├── blas1_norm.c ├── blas2.c ├── blas3.c ├── complex_first.c ├── complex_first_cpp.cc ├── complex_matvec_mul.c ├── complex_row_column_major.c ├── cublas_dgemv.c ├── dgemv_bench_mt.c ├── first.c ├── get_sec.c ├── get_secv.h ├── integral_eq ├── Makefile.unix ├── Makefile.win_intel ├── ex1.c ├── gauss_integral.c ├── gauss_integral.h ├── iteration.c ├── prob1.c └── prob2.c ├── invpower_eig.c ├── jacobi_iteration.c ├── jacobi_iteration_csr_mkl.c ├── jacobi_iteration_mkl.c ├── jacobi_iteration_spblas.c ├── lapack_complex_row_column_major.c ├── lapack_complex_row_column_major_cpp.cc ├── lapack_dgecon.c ├── lapack_dgeev.c ├── lapack_dgeev_magma.c ├── lapack_dsyev.c ├── lapack_gcc.inc ├── lapack_icc.inc ├── lapack_icx.inc ├── lapack_lamch.c ├── lapack_ssyev.c ├── lapack_win_intel.inc ├── linear_eq.c ├── linear_eq_dgbsv.c ├── linear_eq_dgetrf.c ├── linear_eq_dsgesv.c ├── linear_eq_dsposv.c ├── linear_eq_dsysv.c ├── linear_eq_magma.c ├── matvec_mul.c ├── matvec_mul_cublas.c ├── matvec_mul_magma.c ├── matvec_mul_magma_pure.c ├── mm ├── diagtest.mtx ├── matrix_market_io.c ├── matrix_market_io.h ├── sptest.mtx ├── sptest_array.mtx ├── sptest_big.mtx └── sptest_pattern.mtx ├── my_linear_eq.c ├── my_linear_eq_omp.c ├── my_matvec_mul.c ├── my_matvec_mul_pt.c ├── mycuda.c ├── mycuda.h ├── mycuda_daxpy.cu ├── power_eig.c ├── readme.txt ├── row_column_major.c ├── tkaux.c └── tkaux.h /Makefile.unix: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Makefile for Linux gcc or icc environment # 4 | # Last Update: 2016-12-02 (Fri) T.Kouya # 5 | #*************************************************# 6 | # ------------------------ start of configuration -------------------------- 7 | # Intel C compiler (Old) 8 | #include lapack_icc.inc 9 | 10 | # Intel OneAPI (New) 11 | include lapack_icx.inc 12 | 13 | # GNU Compiler Collection 14 | #include lapack_gcc.inc 15 | 16 | # CUDA environment 17 | #NVCC=nvcc 18 | NVCC=nvcc -arch=sm_30 19 | CUDA_INC = -I/usr/local/cuda/include 20 | #CUDA_LIB = -L/usr/local/cuda/lib64 -lcublas -lcudart -lcusparse 21 | CUDA_LIB = -L/usr/local/cuda/lib64 -lcusparse -lcublas -lcudart -lpthread -ldl 22 | 23 | # MAGMA 1.5.0 to 1.6.2 24 | MAGMA_INC = -I/usr/local/magma/include -I/usr/local/magma/include/sparse-iter/include -O3 -fPIC -DADD_ -Wall -openmp -DMAGMA_SETAFFINITY -DMAGMA_WITH_MKL 25 | MAGMA_LIB = -L/usr/local/magma/lib -lmagma_sparse -lmagma -lstdc++ -lgomp 26 | 27 | # ------------------------ end of configuration -------------------------- 28 | 29 | all: first matvec_mul linear_eq blas row_column_major eig dgecon openmp pthread 30 | #all: first matvec_mul linear_eq blas row_column_major gpu eig dgecon openmp pthread spblas_mkl 31 | 32 | get_sec: get_secv.h get_sec.c 33 | $(CC) -c get_sec.c -o get_sec.o 34 | 35 | tkaux: tkaux.h tkaux.c 36 | $(CC) -c tkaux.c -o tkaux.o 37 | 38 | first: first.c complex_first.c complex_first_cpp.cc 39 | $(CC) first.c -o first $(LIB) 40 | $(CC) complex_first.c -o complex_first $(LIB) 41 | $(CPP) complex_first_cpp.cc -o complex_first_cpp $(LIB) 42 | 43 | matvec_mul: matvec_mul.c my_matvec_mul.c 44 | $(CC) $(LAPACKE_INC) matvec_mul.c -o matvec_mul $(LAPACKE_LIB) 45 | $(CC) my_matvec_mul.c -o my_matvec_mul 46 | $(CC) $(LAPACKE_INC) complex_matvec_mul.c -o complex_matvec_mul $(LAPACKE_LIB) 47 | 48 | linear_eq: linear_eq.c my_linear_eq.c linear_eq_dgetrf.c linear_eq_dsgesv.c linear_eq_dsposv.c linear_eq_dsysv.c linear_eq_dgbsv.c invpower_eig.c 49 | $(CC) $(LAPACKE_INC) linear_eq.c -o linear_eq $(LAPACKE_LIB) 50 | $(CC) $(LAPACKE_INC) linear_eq_dgetrf.c -o linear_eq_dgetrf $(LAPACKE_LIB) 51 | $(CC) $(LAPACKE_INC) linear_eq_dsgesv.c -o linear_eq_dsgesv $(LAPACKE_LIB) 52 | $(CC) $(LAPACKE_INC) linear_eq_dsposv.c -o linear_eq_dsposv $(LAPACKE_LIB) 53 | $(CC) $(LAPACKE_INC) linear_eq_dsysv.c -o linear_eq_dsysv $(LAPACKE_LIB) 54 | $(CC) $(LAPACKE_INC) linear_eq_dgbsv.c -o linear_eq_dgbsv $(LAPACKE_LIB) 55 | $(CC) my_linear_eq.c -o my_linear_eq 56 | $(CC) $(LAPACKE_INC) invpower_eig.c -o invpower_eig $(LAPACKE_LIB) 57 | 58 | blas: blas1.c blas2.c blas3.c blas1_norm.c jacobi_iteration.c power_eig.c 59 | $(CC) $(CBLAS_INC) blas1.c -o blas1 $(CBLAS_LIB) 60 | $(CC) $(CBLAS_INC) blas1_norm.c -o blas1_norm $(CBLAS_LIB) 61 | $(CC) $(CBLAS_INC) blas2.c -o blas2 $(CBLAS_LIB) 62 | $(CC) $(CBLAS_INC) blas3.c -o blas3 $(CBLAS_LIB) 63 | $(CC) $(CBLAS_INC) jacobi_iteration.c -o jacobi_iteration $(CBLAS_LIB) 64 | $(CC) $(CBLAS_INC) power_eig.c -o power_eig $(CBLAS_LIB) 65 | 66 | mycuda: mycuda.h mycuda.c 67 | $(CC) $(CUDA_INC) -c mycuda.c -o mycuda.o 68 | 69 | gpu: matvec_mul_cublas.c matvec_mul_magma.c linear_eq_magma.c lapack_dgeev_magma.c bicgstab_csr_cusparse.c mycuda get_sec tkaux 70 | $(NVCC) $(CBLAS_INC) -c mycuda_daxpy.cu 71 | $(CC) mycuda_daxpy.o mycuda.o tkaux.o -o mycuda_daxpy $(CBLAS_LIB) $(CUDA_LIB) 72 | $(CC) $(LAPACKE_INC) $(CUDA_INC) matvec_mul_cublas.c -o matvec_mul_cublas mycuda.o $(LAPACKE_LIB) $(CUDA_LIB) 73 | $(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) matvec_mul_magma.c -o matvec_mul_magma mycuda.o $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB) 74 | $(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) matvec_mul_magma_pure.c -o matvec_mul_magma_pure $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB) 75 | $(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) linear_eq_magma.c -o linear_eq_magma $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB) 76 | $(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) -DUSE_MAGMA lapack_dgeev_magma.c tkaux.o -o lapack_dgeev_magma $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB) 77 | $(CC) $(CUDA_INC) $(CBLAS_INC) bicgstab_csr_cusparse.c mm/matrix_market_io.c tkaux.o get_sec.o mycuda.o -o bicgstab_csr_cusparse $(CUDA_LIB) $(CBLAS_LIB) 78 | 79 | row_column_major: row_column_major.c complex_row_column_major.c lapack_complex_row_column_major.c 80 | $(CC) $(INC) row_column_major.c -o row_column_major $(LAPACKE_LIB) 81 | $(CC) $(INC) complex_row_column_major.c -o complex_row_column_major $(LAPACKE_LIB) 82 | $(CC) $(INC) $(LAPACKE_INC) lapack_complex_row_column_major.c -o lapack_complex_row_column_major $(LAPACKE_LIB) 83 | 84 | dgecon: lapack_dgecon.c lapack_lamch.c 85 | $(CC) $(INC) $(LAPACKE_INC) lapack_dgecon.c -o lapack_dgecon $(LAPACKE_LIB) $(CBLAS_LIB) 86 | $(CC) $(INC) $(LAPACKE_INC) lapack_lamch.c -o lapack_lamch $(LAPACKE_LIB) $(CBLAS_LIB) 87 | 88 | eig: lapack_dgeev.c lapack_dsyev.c lapack_ssyev.c tkaux 89 | $(CC) $(LAPACKE_INC) lapack_dgeev.c tkaux.o -o lapack_dgeev $(LAPACKE_LIB) 90 | $(CC) $(LAPACKE_INC) lapack_dsyev.c -o lapack_dsyev $(LAPACKE_LIB) 91 | $(CC) $(LAPACKE_INC) lapack_ssyev.c -o lapack_ssyev $(LAPACKE_LIB) 92 | 93 | # need IMKL! 94 | spblas_mkl: jacobi_iteration_mkl.c jacobi_iteration_csr_mkl.c bicgstab_mkl.c bicgstab_csr_mkl.c tkaux 95 | $(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_mkl.c mm/matrix_market_io.c -o jacobi_iteration_mkl tkaux.o $(IMKL_LIB) 96 | $(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_csr_mkl.c mm/matrix_market_io.c -o jacobi_iteration_csr_mkl tkaux.o $(IMKL_LIB) 97 | $(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_mkl.c mm/matrix_market_io.c -o bicgstab_mkl tkaux.o $(IMKL_LIB) 98 | $(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_csr_mkl.c mm/matrix_market_io.c -o bicgstab_csr_mkl tkaux.o $(IMKL_LIB) 99 | 100 | openmp: my_matvec_mul.c my_linear_eq_omp.c 101 | $(CC) $(OPENMP) $(CBLAS_INC) my_matvec_mul.c -o my_matvec_mul_omp $(CBLAS_LIB) 102 | $(CC) $(OPENMP) $(CBLAS_INC) my_linear_eq_omp.c -o my_linear_eq_omp $(CBLAS_LIB) 103 | 104 | pthread: my_matvec_mul_pt.c 105 | $(CC) $(CBLAS_INC) my_matvec_mul_pt.c -o my_matvec_mul_pt $(CBLAS_LIB) -lpthread 106 | 107 | clean: 108 | -rm *.o 109 | -rm first 110 | -rm complex_first 111 | -rm complex_first_cpp 112 | -rm matvec_mul 113 | -rm my_matvec_mul 114 | -rm complex_matvec_mul 115 | -rm linear_eq 116 | -rm linear_eq_dgetrf 117 | -rm linear_eq_dgbsv 118 | -rm linear_eq_dsysv 119 | -rm linear_eq_dsgesv 120 | -rm linear_eq_dsposv 121 | -rm my_linear_eq 122 | -rm invpower_eig 123 | -rm blas1 124 | -rm blas1_norm 125 | -rm blas2 126 | -rm blas3 127 | -rm jacobi_iteration 128 | -rm power_eig 129 | -rm mycuda_daxpy 130 | -rm matvec_mul_cublas 131 | -rm matvec_mul_magma 132 | -rm matvec_mul_magma_pure 133 | -rm linear_eq_magma 134 | -rm lapack_dgeev_magma 135 | -rm bicgstab_csr_cusparse 136 | -rm row_column_major 137 | -rm complex_row_column_major 138 | -rm lapack_complex_row_column_major 139 | -rm lapack_dgecon 140 | -rm lapack_lamch 141 | -rm lapack_dgeev 142 | -rm lapack_dsyev 143 | -rm lapack_ssyev 144 | -rm jacobi_iteration_mkl 145 | -rm jacobi_iteration_csr_mkl 146 | -rm bicgstab_mkl 147 | -rm bicgstab_csr_mkl 148 | -rm my_matvec_mul_omp 149 | -rm my_linear_eq_omp 150 | -rm my_matvec_mul_pt 151 | 152 | tarzip: 153 | -rm ../lapack-dist.tar.gz 154 | (cd ../; tar zcvf lapack-dist.tar.gz lapack-dist/*.txt lapack-dist/*.h lapack-dist/*.c lapack-dist/*.cc lapack-dist/Makefile lapack-dist/*.cu lapack-dist/*.inc lapack-dist/mm/*.h lapack-dist/mm/*.c lapack-dist/mm/*.mtx lapack-dist/integral_eq/*.c lapack-dist/integral_eq/Makefile lapack-dist/integral_eq/*.h; cd lapack-dist;) 155 | 156 | zip: 157 | -rm ../lapack-dist.zip 158 | (cd ../; zip -ll -v lapack-dist.zip lapack-dist/*.txt lapack-dist/*.h lapack-dist/*.c lapack-dist/*.cc lapack-dist/Makefile.* lapack-dist/*.cu lapack-dist/*.inc lapack-dist/mm/*.h lapack-dist/mm/*.c lapack-dist/mm/*.mtx lapack-dist/integral_eq/*.c lapack-dist/integral_eq/Makefile.* lapack-dist/integral_eq/*.h; cd ./lapack-dist) 159 | 160 | -------------------------------------------------------------------------------- /Makefile.win_intel: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Makefile for Intel C compiler on Windows # 4 | # Last Update: 2016-12-02 (Fri) T.Kouya # 5 | #*************************************************# 6 | # ------------------------ start of configuration -------------------------- 7 | include ./lapack_win_intel.inc 8 | INC = /I.\windows 9 | # ------------------------ end of configuration -------------------------- 10 | 11 | all: first matvec_mul linear_eq blas row_column_major eig spblas dgecon openmp 12 | 13 | get_sec: get_secv.h get_sec.c 14 | $(CC) -c get_sec.c 15 | 16 | first: first.c complex_first.c complex_first_cpp.cc 17 | $(CC) first.c 18 | $(CC) complex_first.c 19 | $(CPP) complex_first_cpp.cc 20 | 21 | matvec_mul: matvec_mul.c my_matvec_mul.c 22 | $(CC) $(LAPACKE_INC) matvec_mul.c -o matvec_mul $(LAPACKE_LIB) 23 | $(CC) my_matvec_mul.c -o my_matvec_mul 24 | $(CC) $(LAPACKE_INC) complex_matvec_mul.c -o complex_matvec_mul $(LAPACKE_LIB) 25 | 26 | linear_eq: linear_eq.c my_linear_eq.c linear_eq_dgetrf.c linear_eq_dsgesv.c linear_eq_dsposv.c linear_eq_dsysv.c linear_eq_dgbsv.c invpower_eig.c 27 | $(CC) $(LAPACKE_INC) linear_eq.c $(LAPACKE_LIB) 28 | $(CC) $(LAPACKE_INC) linear_eq_dgetrf.c $(LAPACKE_LIB) 29 | $(CC) $(LAPACKE_INC) linear_eq_dsgesv.c $(LAPACKE_LIB) 30 | $(CC) $(LAPACKE_INC) linear_eq_dsposv.c $(LAPACKE_LIB) 31 | $(CC) $(LAPACKE_INC) linear_eq_dsysv.c $(LAPACKE_LIB) 32 | $(CC) $(LAPACKE_INC) linear_eq_dgbsv.c $(LAPACKE_LIB) 33 | $(CC) my_linear_eq.c 34 | $(CC) $(LAPACKE_INC) invpower_eig.c $(LAPACKE_LIB) 35 | 36 | blas: blas1.c blas2.c blas3.c blas1_norm.c jacobi_iteration.c power_eig.c 37 | $(CC) $(CBLAS_INC) blas1.c $(CBLAS_LIB) 38 | $(CC) $(CBLAS_INC) blas1_norm.c $(CBLAS_LIB) 39 | $(CC) $(CBLAS_INC) blas2.c $(CBLAS_LIB) 40 | $(CC) $(CBLAS_INC) blas3.c $(CBLAS_LIB) 41 | $(CC) $(CBLAS_INC) jacobi_iteration.c $(CBLAS_LIB) 42 | $(CC) $(CBLAS_INC) power_eig.c $(CBLAS_LIB) 43 | 44 | row_column_major: row_column_major.c complex_row_column_major.c lapack_complex_row_column_major.c 45 | $(CC) $(INC) row_column_major.c -o row_column_major $(LAPACKE_LIB) 46 | $(CC) $(INC) complex_row_column_major.c -o complex_row_column_major $(LAPACKE_LIB) 47 | # $(CC) $(INC) $(LAPACKE_INC) lapack_complex_row_column_major.c -o lapack_complex_row_column_major $(LAPACKE_LIB) 48 | 49 | dgecon: lapack_dgecon.c lapack_lamch.c 50 | $(CC) $(INC) $(LAPACKE_INC) lapack_dgecon.c $(LAPACKE_LIB) $(CBLAS_LIB) 51 | $(CC) $(INC) $(LAPACKE_INC) lapack_lamch.c $(LAPACKE_LIB) $(CBLAS_LIB) 52 | 53 | eig: lapack_dgeev.c lapack_dsyev.c lapack_ssyev.c 54 | $(CC) $(LAPACKE_INC) lapack_dgeev.c tkaux.c $(LAPACKE_LIB) 55 | $(CC) $(LAPACKE_INC) lapack_dsyev.c $(LAPACKE_LIB) 56 | $(CC) $(LAPACKE_INC) lapack_ssyev.c $(LAPACKE_LIB) 57 | 58 | spblas: jacobi_iteration_mkl.c jacobi_iteration_csr_mkl.c bicgstab_mkl.c bicgstab_csr_mkl.c 59 | $(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB) 60 | $(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_csr_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB) 61 | $(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB) 62 | $(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_csr_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB) 63 | 64 | openmp: my_matvec_mul.c my_linear_eq_omp.c 65 | $(CC) $(OPENMP) $(CBLAS_INC) my_matvec_mul.c -o my_matvec_mul_omp $(CBLAS_LIB) 66 | $(CC) $(OPENMP) $(CBLAS_INC) my_linear_eq_omp.c -o my_linear_eq_omp $(CBLAS_LIB) 67 | 68 | clean: 69 | -del *.exe 70 | -del *.obj 71 | 72 | zip: 73 | -del ..\lapack-dist.zip 74 | cd ../ 75 | zip -ll -v lapack-dist.zip lapack-dist/*.txt lapack-dist/*.h lapack-dist/*.c lapack-dist/*.cc lapack-dist/Makefile.* lapack-dist/*.cu lapack-dist/*.inc lapack-dist/mm/*.h lapack-dist/mm/*.c lapack-dist/mm/*.mtx lapack-dist/integral_eq/*.c lapack-dist/integral_eq/Makefile.* lapack-dist/integral_eq/*.h lapack-dist/windows/ 76 | cd ./lapack-dist 77 | 78 | -------------------------------------------------------------------------------- /README.ja.md: -------------------------------------------------------------------------------- 1 | LAPACK/BLAS 入門:サンプルプログラム 2 | ============================================================ 3 | 4 | 2016-12-02 (Fri) 幸谷 智紀 5 | --------------------------------- 6 | 7 |  以下のファイルは「LAPACK/BLAS入門」(森北出版)で解説しているプログラムです。詳細については本文を参照して下さい。 8 | 9 |  本文及び演習問題で使われているプログラムはLinux環境下で実行を確認したものです。Windows環境下ではCygwinを使うことで同等の環境を整えることができます。Windows環境下での,Visual C++&Intel C++ compilerを使ったBLAS, LAPACK用プログラムのコンパイルも可能です。 10 | 11 | ☆サンプルプログラム一覧 12 | ----------------------------- 13 | 14 | ### 共通 15 | > lapack_gcc.inc ... GCC用設定ファイル (Linux) 16 | > lapack_icc.inc ... Intel C compiler用設定ファイル (Linux) 17 | > lapack_win_intel.inc ... Intel C compiler用設定ファイル (Windows) 18 | > Makefile.unix ... Linux用メイクファイル → GCCかIntel C compilerかを選び,設定ファイルを読み込ませて,"make -f Makefile.unix"で生成 19 | > Makefile.win_intel ... Windows用メイクファイル → Intel C compilerの設定ファイルを読み込ませて,"make -f Makefile.win_intel"で生成 20 | > windows\ ... Windows環境下でCBLAS, LAPACKEのインクルードファイルを置いておくフォルダ 21 | 22 | ### 第1章 23 | > first.c ... 単精度,倍精度基本演算と相対誤差の導出 24 | > complex_first.c ... 複素数演算(C言語用) 25 | > complex_first_cpp.c ... 複素数演算(C++用) 26 | 27 | ### 第2章 28 | > my_matvec_mul.c ... 行列・ベクトル積 29 | > matvec_mul.c ... DGEMVを用いた実行列・ベクトル積 30 | > complex_matvec_mul.c ... ZGEMVを用いた複素行列・ベクトル積 31 | > my_linear_eq.c ... 連立一次方程式の求解 32 | > linear_eq.c ... DGESVを用いた連立一次方程式の求解 33 | > row_column_major.c ... 行優先,列優先行列格納形式 34 | > complex_row_column_major.c ... 複素数行優先,列優先行列格納形式 35 | > lapack_complex_row_column_major.c ... LAPACK関数を用いた複素数行優先,列優先行列格納形式 36 | > lapack_complex_row_column_major.cc ... LAPACK関数を用いた複素数行優先,列優先行列格納形式(C++) 37 | 38 | ### 第3章 39 | > blas1.c ... BLAS1関数サンプル 40 | > blas2.c ... BLAS2関数サンプル 41 | > blas3.c ... BLAS3関数サンプル 42 | > jacobi_iteration.c ... ヤコビ反復法 43 | > power_eig.c ... べき乗法 44 | 45 | ### 第4章 46 | > linear_eq_dgetrf.c ... LU分解,前進代入・後退代入 47 | > linear_eq_dsgesv.c ... 混合精度反復改良法 48 | > linear_eq_dsposv.c ... 実対称行列用の混合精度反復改良法 49 | > lapack_dgecon.c ... 条件数の計算 50 | > lapack_lamch.c ... マシンイプシロン等の導出 51 | > invpower_eig.c ... 逆べき乗法 52 | > lapack_dgeev.c ... 実非対称行列用固有値・固有ベクトル計算 53 | > lapack_dsyev.c ... 実対称行列用固有値・固有ベクトル計算 54 | > lapack_ssyev.c ... 実対称行列用固有値・固有ベクトル計算(単精度) 55 | 56 | ### 第5章 57 | > my_matvec_mul_pt.c ... Pthreadで並列化した行列・ベクトル積計算 58 | > my_matvec_mul_omp.c ... OpenMPで並列化した行列・ベクトル積計算 59 | > my_linear_eq_omp.c ... OpenMPで並列化したLU分解,前進代入・後退代入計算 60 | 61 | ### 第6章 62 | > jacobi_iteration_mkl.c ... COO形式疎行列用のJacobi反復法 63 | > jacobi_iteration_csr_mkl.c... CSR形式疎行列用のJacobi反復法 64 | > bicgstab_mkl.c ... COO形式疎行列用のBiCGSTAB法 65 | > bicgstab_csr_mkl.c ... CSR形式疎行列用のBiCGSTAB法 66 | > mm/matrix_market_io.h ... MatrixMarketフォーマット用関数定義 67 | > mm/matrix_market_io.c ... MatrixMarketフォーマット用関数群 68 | 69 | ### 第7章 (Windows環境下の実行はサポートしていません) 70 | > mycuda_daxpy.cu ... CUDAサンプルプログラム 71 | > matvec_mul_cublas.c ... CUBLASを用いた行列・ベクトル積 72 | > matvec_mul_magma.c ... MAGMAとCUBLASを用いた行列・ベクトル積 73 | > matvec_mul_magma_pure.c ... MAGMAだけを用いた行列・ベクトル積 74 | > linear_eq_magma.c ... MAGMAを用いた連立一次方程式の求解 75 | > lapack_dgeev_magma.c ... MAGMAを用いた実非対称行列用固有値・固有ベクトル計算 76 | > bicgstab_csr_cusparse.c ... cuSPARSEを用いたBiCGSTAB法 77 | 78 | ### 第8章 79 | > integral_eq/Makefile.unix ... 積分方程式求解プログラムのコンパイル(Linux) 80 | > integral_eq/Makefile.win_intel ... 積分方程式求解プログラムのコンパイル(Windows) 81 | > integral_eq/gauss_integral.h ... ガウス積分公式導出のためのヘッダファイル 82 | > integral_eq/gauss_integral.c ... ガウス積分公式の導出 83 | > integral_eq/iteration.c ... 割線法とデリバティブフリー解法 84 | 85 | ☆コンパイル条件 86 | ----------------------------- 87 | 88 |  本プログラムはLinux, Windowsソフトウェア開発環境下でコンパイル&実行可能であることを下記の環境で確認しております。 89 | 90 | ・Linux ... GCC 4.4.7, Intel C/C++/Fortran compiler 13.1.3, Intel Math Kernel Library 11.0.5, LAPACK 3.6.0, MAGMA 1.6.0, CUDA 7.5 on CentOS 6.5 x86_64 91 | ... GCC 4.4.7, Iitel C/C++/Fortran compiler 14.0.2, Intel Math Kernel Library 11.1.2, LAPACK 3.6.0, MAGMA 1.6.1, CUDA 7.0 on CentOS 6.3 x86_64 92 | ・Windows ... Intel C++ compiler 16.0.1.146, Intel Math Kernel Library 11.3.2 on Windows 8.1 x64 93 | 94 |  Linux, Windows環境下での本プログラムのコンパイル&実行は,上記のソフトウェア環境が整っているCUIで行って下さい。それ以外の環境下での諸問題については確認ができませんので,お答えすることも不可能です。 95 | 96 | 97 | ☆コンパイル方法 ... Linux 98 | ----------------------------- 99 | 100 | 0. Intel Math Kernel, CUDA, MAGMA, LAPACKE/CBLASをインストールし,インストール先のディレクトリをMakefile.unix内の適切なマクロ名に設定 101 | 1. Intel C/C++ Compilerの場合はlapack_icc.incを,GCCの場合はlapack_gcc.incを環境に合わせて修正し,それぞれのコンパイラが適切に動作するよう環境設定を行い,Makefile.unixが読み込むファイルを設定 102 | 2. make -f ./Makefile.unix でコンパイル 103 | 3. make -f ./Makefile.unix clean でobjectファイル,実行ファイルが消去される 104 | 105 | ☆コンパイル方法 ... Windows 106 | ----------------------------- 107 | 108 | 0. Intel Math Kernelをインストールし,インストール先のディレクトリをMakefile.win_intel内の適切なマクロ名に設定。また,LAPACKEとCBLASのインクルードファイルをwindowsフォルダにコピーしておく(デフォルト設定の場合)。 109 | 1. Intel C/C++ CompilerとVisual C++が適切に動作するよう環境設定を行う 110 | 2. nmake -f ./Makefile.win_intel でコンパイル 111 | 3. nmake -f ./Makefile.win_intel clean でobjectファイル,実行ファイルが消去される 112 | 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | LAPACK/BLAS Tutorial: List of sample programs 2 | ============================================================ 3 | 4 | 2016-12-02 (Fri) Tomonori Kouya 5 | --------------------------------- 6 | 7 | The following programs are published for all readers of "LAPACK/BLAS Tutorial" written in JAPANESE. 8 | You can read "README.ja.md"(Japanese!) if you want to get detailed description about them. 9 | 10 | *Common* 11 | lapack_gcc.inc 12 | lapack_icc.inc 13 | lapack_win_intel.inc 14 | Makefile.unix 15 | Makefile.win_intel 16 | windows\ 17 | 18 | *Chapter 1* 19 | first.c 20 | complex_first.c 21 | complex_first_cpp.c 22 | 23 | *Chapter 2* 24 | my_matvec_mul.c 25 | matvec_mul.c 26 | complex_matvec_mul.c 27 | my_linear_eq.c 28 | linear_eq.c 29 | row_column_major.c 30 | complex_row_column_major.c 31 | lapack_complex_row_column_major.c 32 | lapack_complex_row_column_major.cc 33 | 34 | *Chapter 3* 35 | blas1.c 36 | blas2.c 37 | blas3.c 38 | jacobi_iteration.c 39 | power_eig.c 40 | 41 | *Chapter 4* 42 | linear_eq_dgetrf.c 43 | linear_eq_dsgesv.c 44 | linear_eq_dsposv.c 45 | lapack_dgecon.c 46 | lapack_lamch.c 47 | invpower_eig.c 48 | lapack_dgeev.c 49 | lapack_dsyev.c 50 | lapack_ssyev.c 51 | 52 | *Chapter 5* 53 | my_matvec_mul_pt.c 54 | my_matvec_mul_omp.c 55 | my_linear_eq_omp.c 56 | 57 | *Chapter 6* 58 | jacobi_iteration_mkl.c 59 | jacobi_iteration_csr_mkl.c 60 | bicgstab_mkl.c 61 | bicgstab_csr_mkl.c 62 | mm/matrix_market_io.h 63 | mm/matrix_market_io.c 64 | 65 | *Chapter 7* (Caution: These programs cannot be compiled on Windows!) 66 | mycuda_daxpy.cu 67 | matvec_mul_cublas.c 68 | matvec_mul_magma.c 69 | matvec_mul_magma_pure.c 70 | linear_eq_magma.c 71 | lapack_dgeev_magma.c 72 | bicgstab_csr_cusparse.c 73 | 74 | *Chapter 8* 75 | integral_eq/Makefile.unix 76 | integral_eq/Makefile.win_intel 77 | integral_eq/gauss_integral.h 78 | integral_eq/gauss_integral.c 79 | integral_eq/iteration.c 80 | 81 | -------------------------------------------------------------------------------- /bicgstab_csr_cusparse.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_csr_cusparse.c -------------------------------------------------------------------------------- /bicgstab_csr_magma.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_csr_magma.c -------------------------------------------------------------------------------- /bicgstab_csr_mkl.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_csr_mkl.c -------------------------------------------------------------------------------- /bicgstab_mkl.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_mkl.c -------------------------------------------------------------------------------- /blas1.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of BLAS Level1 */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | #include "cblas.h" 11 | 12 | int main() 13 | { 14 | int i, dim; 15 | int inc_vb, inc_vc, inc_va; 16 | double *va, *vb, *vc; 17 | double alpha; 18 | 19 | // input dimension 20 | printf("Dim = "); scanf("%d", &dim); 21 | 22 | if(dim <= 0) 23 | { 24 | printf("Illegal dimension! (dim = %d)\n", dim); 25 | return EXIT_FAILURE; 26 | } 27 | 28 | // Initialize 29 | va = (double *)calloc(dim, sizeof(double)); 30 | vb = (double *)calloc(dim, sizeof(double)); 31 | vc = (double *)calloc(dim, sizeof(double)); 32 | 33 | // input va and vb 34 | for(i = 0; i < dim; i++) 35 | { 36 | va[i] = i + 1; 37 | vb[i] = dim - (i + 1); 38 | } 39 | 40 | //vc := vb 41 | inc_vb = inc_vc = inc_va = 1; 42 | cblas_dcopy(dim, vb, inc_vb, vc, inc_vc); 43 | 44 | // vc := 1.0 * va + vb 45 | alpha = 1.0; 46 | cblas_daxpy(dim, alpha, va, inc_va, vc, inc_vc); 47 | 48 | // print 49 | for(i = 0; i < dim; i++) 50 | printf("%10.3f + %10.3f = %10.3f\n", *(va + i), *(vb + i), *(vc + i)); 51 | 52 | // free 53 | free(va); 54 | free(vb); 55 | free(vc); 56 | 57 | return EXIT_SUCCESS; 58 | } 59 | -------------------------------------------------------------------------------- /blas1_norm.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of BLAS Level1 */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | #include "cblas.h" 11 | 12 | int main() 13 | { 14 | int i, dim; 15 | int inc_vb, inc_vc, inc_va; 16 | double *va, *vb, *vc; 17 | double alpha, norm1, norm2, normi; 18 | 19 | // input dimension 20 | printf("Dim = "); scanf("%d", &dim); 21 | 22 | if(dim <= 0) 23 | { 24 | printf("Illegal dimension! (dim = %d)\n", dim); 25 | return EXIT_FAILURE; 26 | } 27 | 28 | // Initialize 29 | va = (double *)calloc(dim, sizeof(double)); 30 | vb = (double *)calloc(dim, sizeof(double)); 31 | vc = (double *)calloc(dim, sizeof(double)); 32 | 33 | // input va and vb 34 | for(i = 0; i < dim; i++) 35 | { 36 | va[i] = i + 1; 37 | vb[i] = dim - (i + 1); 38 | } 39 | 40 | //vc := vb 41 | inc_vb = inc_vc = inc_va = 1; 42 | cblas_dcopy(dim, vb, inc_vb, vc, inc_vc); 43 | 44 | // vc := 1.0 * va + vb 45 | alpha = 1.0; 46 | cblas_daxpy(dim, alpha, va, inc_va, vc, inc_vc); 47 | 48 | // print 49 | for(i = 0; i < dim; i++) 50 | printf("%10.3f + %10.3f = %10.3f\n", *(va + i), *(vb + i), *(vc + i)); 51 | 52 | // norm_1, 2, i 53 | norm1 = cblas_dasum(dim, va, 1); 54 | norm2 = cblas_dnrm2(dim, va, 1); 55 | normi = fabs(vc[cblas_idamax(dim, va, 1)]); 56 | 57 | printf("norm1, norm2, normi = %10.3e, %10.3e, %10.3e\n", norm1, norm2, normi); 58 | 59 | // free 60 | free(va); 61 | free(vb); 62 | free(vc); 63 | 64 | return EXIT_SUCCESS; 65 | } 66 | -------------------------------------------------------------------------------- /blas2.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of BLAS Level2 */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2013-08-05 (Mon) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | #include "cblas.h" 11 | 12 | int main() 13 | { 14 | int i, j, dim; 15 | int inc_vb, inc_vc; 16 | double *ma, *vb, *vc; 17 | double alpha, beta; 18 | 19 | // input dimension 20 | printf("Dim = "); scanf("%d", &dim); 21 | 22 | if(dim <= 0) 23 | { 24 | printf("Illegal dimension! (dim = %d)\n", dim); 25 | return EXIT_FAILURE; 26 | } 27 | 28 | // Initialize 29 | ma = (double *)calloc(dim * dim, sizeof(double)); 30 | vb = (double *)calloc(dim, sizeof(double)); 31 | vc = (double *)calloc(dim, sizeof(double)); 32 | 33 | // input ma and vb 34 | for(i = 0; i < dim; i++) 35 | { 36 | for(j = 0; j < dim; j++) 37 | ma[i * dim + j] = sqrt(2.0) * (double)(dim - (i + j + 1)); 38 | vb[i] = sqrt(2.0) * (double)(i + 1); 39 | } 40 | 41 | //vc := vb 42 | inc_vb = inc_vc = 1; 43 | 44 | // vc := 1.0 * ma * vb 45 | alpha = 1.0; 46 | beta = 0.0; 47 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, alpha, ma, dim, vb, inc_vb, beta, vc, inc_vc); 48 | 49 | // print 50 | for(i = 0; i < dim; i++) 51 | { 52 | printf("["); 53 | for(j = 0; j < dim; j++) 54 | printf("%10.3f ", ma[i * dim + j]); 55 | printf("] %10.3f = %10.3f\n", vb[i], vc[i]); 56 | } 57 | 58 | // free 59 | free(ma); 60 | free(vb); 61 | free(vc); 62 | 63 | return EXIT_SUCCESS; 64 | } 65 | -------------------------------------------------------------------------------- /blas3.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of BLAS Level3 */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | #include "cblas.h" 12 | 13 | int main() 14 | { 15 | int i, j, dim; 16 | double *ma, *mb, *mc; 17 | double alpha, beta; 18 | 19 | // input dimension 20 | printf("Dim = "); scanf("%d", &dim); 21 | 22 | if(dim <= 0) 23 | { 24 | printf("Illegal dimension! (dim = %d)\n", dim); 25 | return EXIT_FAILURE; 26 | } 27 | 28 | // Initialize 29 | ma = (double *)calloc(dim * dim, sizeof(double)); 30 | mb = (double *)calloc(dim * dim, sizeof(double)); 31 | mc = (double *)calloc(dim * dim, sizeof(double)); 32 | 33 | // input va and vb 34 | for(i = 0; i < dim; i++) 35 | { 36 | for(j = 0; j < dim; j++) 37 | { 38 | ma[i * dim + j] = sqrt(2.0) * (i + j + 1); 39 | mb[i * dim + j] = sqrt(2.0) * (dim * 2 - (i + j + 1)); 40 | } 41 | } 42 | 43 | // print 44 | for(i = 0; i < dim; i++) 45 | { 46 | printf("["); 47 | for(j = 0; j < dim; j++) 48 | printf("%10.3f ", ma[i * dim + j]); 49 | printf("] ["); 50 | for(j = 0; j < dim; j++) 51 | printf("%10.3f ", mb[i * dim + j]); 52 | printf("]\n"); 53 | } 54 | 55 | // mc := 1.0 * ma * mb + 0.0 * mc 56 | alpha = 1.0; 57 | beta = 0.0; 58 | cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, mb, dim, beta, mc, dim); 59 | 60 | // print 61 | printf(" = \n"); 62 | for(i = 0; i < dim; i++) 63 | { 64 | printf("%3d: [", i); 65 | for(j = 0; j < dim; j++) 66 | printf("%10.3f ", mc[i * dim + j]); 67 | printf("]\n"); 68 | } 69 | 70 | return EXIT_SUCCESS; 71 | } 72 | -------------------------------------------------------------------------------- /complex_first.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample program with C99 complex type */ 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include // C99 Complex data type 9 | 10 | int main() 11 | { 12 | float complex cc = 0.0, ca = -2.0 + 2.0 * I, cb = 3.0 - 3.0 * I; 13 | double complex zc = 0.0, za = -2.0 + 2.0 * I, zb = 3.0 - 3.0 * I; 14 | double relerr; 15 | 16 | // basic arithmetic: float complex 17 | printf("--- float data type(single precsion floating-point number) ---\n"); 18 | cc = ca + cb; 19 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca), crealf(cb), cimagf(cb)); 20 | cc = ca - cb; 21 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca), crealf(cb), cimagf(cb)); 22 | cc = ca * cb; 23 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca), crealf(cb), cimagf(cb)); 24 | cc = ca / cb; 25 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimag(ca), creal(cb), cimagf(cb)); 26 | 27 | // absolute value and square root: float 28 | cc = cabsf(ca); 29 | printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca)); 30 | cc = csqrtf(cb); 31 | printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(cb), cimagf(cb)); 32 | 33 | // basic arithmetic: double 34 | printf("--- double data type(double precsion floating-point number) ---\n"); 35 | zc = za + zb; 36 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb)); 37 | zc = za - zb; 38 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb)); 39 | zc = za * zb; 40 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb)); 41 | zc = za / zb; 42 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb)); 43 | 44 | // absolute value and square root: double 45 | zc = cabs(za); 46 | printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", creal(zc), cimag(zc), creal(za), cimag(za)); 47 | zc = csqrt(zb); 48 | printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(zb), cimag(zb)); 49 | 50 | // relative error of float square root 51 | relerr = cabs(cc - zc); 52 | if(cabs(zc) > 0.0) 53 | relerr /= cabs(zc); 54 | 55 | printf("Single Prec. : %25.17e + %25.17e * I\n", creal(cc), cimag(cc)); 56 | printf("Double Prec. : %25.17e + %25.17e * I\n", creal(zc), cimag(zc)); 57 | printf("Relative Error: %10.3e\n", relerr); 58 | 59 | // real part 60 | relerr = fabs(creal(cc) - creal(zc)); 61 | if(fabs(creal(zc)) > 0.0) 62 | relerr /= fabs(creal(zc)); 63 | 64 | printf("Relative Error(real): %10.3e\n", relerr); 65 | 66 | // imaginary part 67 | relerr = fabs(cimag(cc) - cimag(zc)); 68 | if(fabs(cimag(zc)) > 0.0) 69 | relerr /= fabs(cimag(zc)); 70 | 71 | printf("Relative Error(imag): %10.3e\n", relerr); 72 | 73 | return 0; 74 | } 75 | -------------------------------------------------------------------------------- /complex_first_cpp.cc: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample program with C++ complex type */ 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace std; 11 | 12 | int main() 13 | { 14 | complex cc = 0.0, ca = complex(-2.0, 2.0), cb = complex(3.0, -3.0); 15 | complex zc = 0.0, za = complex(-2.0, 2.0), zb = complex(3.0, -3.0); 16 | double relerr; 17 | 18 | // basic arithmetic: float complex 19 | cout << "--- float data type(single precsion floating-point number) ---" << endl; 20 | cc = ca + cb; 21 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag()); 22 | cc = ca - cb; 23 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag()); 24 | cc = ca * cb; 25 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag()); 26 | cc = ca / cb; 27 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag()); 28 | 29 | // absolute value and square root: float 30 | cc = abs(ca); 31 | printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", cc.real(), cc.imag(), ca.real(), ca.imag()); 32 | cc = sqrt(cb); 33 | printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), cb.real(), cb.imag()); 34 | 35 | // basic arithmetic: double 36 | printf("--- double data type(double precsion floating-point number) ---\n"); 37 | zc = za + zb; 38 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag()); 39 | zc = za - zb; 40 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag()); 41 | zc = za * zb; 42 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag()); 43 | zc = za / zb; 44 | printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag()); 45 | 46 | // absolute value and square root: double 47 | zc = abs(za); 48 | printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", zc.real(), zc.imag(), za.real(), za.imag()); 49 | zc = sqrt(zb); 50 | printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), zb.real(), zb.imag()); 51 | 52 | // relative error of float square root 53 | relerr = abs((complex)cc - zc); 54 | if(abs(zc) > 0.0) 55 | relerr /= abs(zc); 56 | 57 | printf("Single Prec. : %25.17e %+-25.17e * I\n", cc.real(), cc.imag()); 58 | printf("Double Prec. : %25.17e %+-25.17e * I\n", zc.real(), zc.imag()); 59 | printf("Relative Error: %10.3e\n", relerr); 60 | 61 | // real part 62 | relerr = abs(cc.real() - zc.real()); 63 | if(abs(zc.real()) > 0.0) 64 | relerr /= abs(zc.real()); 65 | 66 | printf("Relative Error(real): %10.3e\n", relerr); 67 | 68 | // imaginary part 69 | relerr = abs(cc.imag() - zc.imag()); 70 | if(abs(zc.imag()) > 0.0) 71 | relerr /= abs(zc.imag()); 72 | 73 | printf("Relative Error(imag): %10.3e\n", relerr); 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /complex_matvec_mul.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Multiplication of C99 complex matrix */ 4 | /* and vector */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | #include // C99 complex data type 11 | 12 | #include "cblas.h" 13 | 14 | int main() 15 | { 16 | int i, j, dim; 17 | int inc_vec_x, inc_vec_b; 18 | 19 | double complex *mat_a, *vec_b, *vec_x; 20 | double complex alpha, beta; 21 | 22 | // input dimension 23 | printf("Dim = "); scanf("%d", &dim); 24 | 25 | if(dim <= 0) 26 | { 27 | printf("Illegal dimension! (dim = %d)\n", dim); 28 | return EXIT_FAILURE; 29 | } 30 | 31 | // initialize a matrix and vectors 32 | mat_a = (double complex *)calloc(dim * dim, sizeof(double complex)); 33 | vec_x = (double complex *)calloc(dim, sizeof(double complex)); 34 | vec_b = (double complex *)calloc(dim, sizeof(double complex)); 35 | 36 | // input mat_a and vec_x 37 | for(i = 0; i < dim; i++) 38 | { 39 | for(j = 0; j < dim; j++) 40 | { 41 | mat_a[i * dim + j] = (double)(i + j + 1) - (double)(i + j + 1) * I; 42 | if((i + j + 1) % 2 != 0) 43 | mat_a[i * dim + j] *= -1.0; 44 | } 45 | vec_x[i] = 1.0 / (double)(i + 1) + 1.0 / (double)(i + 1) * I; 46 | } 47 | 48 | // size(vec_x) == size(vec_b) 49 | inc_vec_x = inc_vec_b = 1; 50 | 51 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 52 | alpha = 1.0; 53 | beta = 0.0; 54 | cblas_zgemv(CblasRowMajor, CblasNoTrans, dim, dim, (void *)&alpha, mat_a, dim, vec_x, inc_vec_x, (void *)&beta, vec_b, inc_vec_b); 55 | 56 | // print 57 | for(i = 0; i < dim; i++) 58 | { 59 | printf("["); 60 | for(j = 0; j < dim; j++) 61 | printf("%6.3lf %+-6.3lf * I ", creal(mat_a[i * dim + j]), cimag(mat_a[i * dim + j])); 62 | printf("] %6.3lf %+-6.3lf * I = %6.3lf %+-6.3lf * I\n", creal(vec_x[i]), cimag(vec_x[i]), creal(vec_b[i]), cimag(vec_b[i])); 63 | } 64 | 65 | // free 66 | free(mat_a); 67 | free(vec_x); 68 | free(vec_b); 69 | 70 | return EXIT_SUCCESS; 71 | } 72 | -------------------------------------------------------------------------------- /complex_row_column_major.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample probram with C99 complex row-major */ 4 | /* and column major matrices */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | #include // C99 complex data type 11 | 12 | #include "cblas.h" 13 | 14 | int main() 15 | { 16 | int i, j, row_dim, col_dim; 17 | 18 | double complex *mat_a; 19 | 20 | // input dimension 21 | printf("Row Dim = "); scanf("%d", &row_dim); 22 | printf("Column Dim = "); scanf("%d", &col_dim); 23 | 24 | if((row_dim <= 0) || (col_dim <= 0)) 25 | { 26 | printf("Illegal dimension! (row_dim = %d, col_dim = %d)\n", row_dim, col_dim); 27 | return EXIT_FAILURE; 28 | } 29 | 30 | // initialize matrix area 31 | mat_a = (double complex *)calloc(row_dim * col_dim, sizeof(double complex)); 32 | 33 | printf("Row Major: %d x %d\n", row_dim, col_dim); 34 | 35 | // Row Major 36 | // mat_a = A 37 | // A = [1+i 2+2i ....... n+ni] 38 | // [n+i n+2i ...... 2n+ni] 39 | // [.....................] 40 | // [(m-1)n+1+i ..... mn+ni] 41 | for(i = 0; i < row_dim; i++) 42 | { 43 | for(j = 0; j < col_dim; j++) 44 | mat_a[i * col_dim + j] = (double)(i * col_dim + j + 1) + (double)(j + 1) * I; 45 | } 46 | 47 | // print (1) 48 | printf("1 dimension: \n"); 49 | printf("["); 50 | for(i = 0; i < row_dim * col_dim; i++) 51 | printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i]), cimag(mat_a[i])); 52 | printf("]\n"); 53 | 54 | // print(2) 55 | printf("2 dimension: \n"); 56 | for(i = 0; i < row_dim; i++) 57 | { 58 | printf("["); 59 | for(j = 0; j < col_dim; j++) 60 | printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i * col_dim + j]), cimag(mat_a[i * col_dim + j])); 61 | printf("]\n"); 62 | } 63 | 64 | printf("Column Major: %d x %d\n", row_dim, col_dim); 65 | 66 | // Column Major 67 | // mat_a = A 68 | // A = [1+i 2+2i ....... n+ni] 69 | // [n+i n+2i ...... 2n+ni] 70 | // [.....................] 71 | // [(m-1)n+1+i ..... mn+ni] 72 | for(j = 0; j < col_dim; j++) 73 | { 74 | for(i = 0; i < row_dim; i++) 75 | mat_a[i + row_dim * j] = (double)(i * col_dim + j + 1) + (double)(j + 1) * I; 76 | } 77 | 78 | // print (1) 79 | printf("1 dimension: \n"); 80 | printf("["); 81 | for(i = 0; i < row_dim * col_dim; i++) 82 | printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i]), cimag(mat_a[i])); 83 | printf("]\n"); 84 | 85 | // print (2) 86 | printf("2 dimension: \n"); 87 | for(i = 0; i < row_dim; i++) 88 | { 89 | printf("["); 90 | for(j = 0; j < col_dim; j++) 91 | printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i + row_dim * j]), cimag(mat_a[i + row_dim * j])); 92 | printf("]\n"); 93 | } 94 | 95 | // free 96 | free(mat_a); 97 | 98 | return EXIT_SUCCESS; 99 | } 100 | -------------------------------------------------------------------------------- /cublas_dgemv.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of BLAS Level3 */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include "get_secv.h" 10 | #ifdef USE_IMKL 11 | #include "mkl.h" 12 | #include "mkl_cblas.h" // for Intel Math Kernel Library 13 | #else 14 | #include "cblas.h" 15 | #endif 16 | 17 | #ifdef USE_CUDA 18 | #include "cuda.h" 19 | #include "cublas_v2.h" 20 | #ifdef USE_MAGMA 21 | #include "magma.h" 22 | #endif 23 | #endif 24 | 25 | // normal 3 loops version 26 | void handmade_dmatmul(double *c, double *a, double *b, int dim) 27 | { 28 | int i, j, k; 29 | 30 | for(i = 0; i < dim; i++) 31 | { 32 | for(j = 0; j < dim; j++) 33 | { 34 | *(c + i * dim + j) = 0.0; 35 | for(k = 0; k < dim; k++) 36 | *(c + i * dim + j) += *(a + i * dim + k) * *(b + k * dim + j); 37 | } 38 | } 39 | } 40 | 41 | // flag = 0 ... use DGEMM in BLAS 42 | // flag = 1 ... use handmade matmul 43 | double get_gflops_dgemm(int flag, int dim) 44 | { 45 | #ifdef USE_ATLAS 46 | int i, j; 47 | #else 48 | MKL_INT i, j; // dimension of vectors 49 | #endif 50 | double *ma, *mb, *mc; 51 | double alpha, beta; 52 | double running_time; 53 | double gflops; 54 | int itimes, max_itimes = 10000; 55 | double tmp; 56 | 57 | // Initialize 58 | ma = (double *)calloc(dim * dim, sizeof(double)); 59 | mb = (double *)calloc(dim * dim, sizeof(double)); 60 | mc = (double *)calloc(dim * dim, sizeof(double)); 61 | 62 | // column major 63 | for(i = 0; i < dim; i++) 64 | { 65 | for(j = 0; j < dim; j++) 66 | { 67 | *(ma + i + j * dim) = sqrt(2.0) * (i + j + 1); 68 | *(mb + i + j * dim) = sqrt(2.0) * (dim * 2 - (i + j + 1)); 69 | } 70 | } 71 | 72 | if(flag == 0) 73 | { 74 | // mc := 1.0 * ma * mb + 0.0 * mc 75 | alpha = 1.0; 76 | beta = 0.0; 77 | tmp = 0.0; 78 | for(itimes = 0; itimes < max_itimes; itimes++) 79 | { 80 | running_time = get_real_secv(); 81 | cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, mb, dim, beta, mc, dim); 82 | tmp += get_real_secv() - running_time; // end 83 | if(tmp > 0.0) 84 | { 85 | running_time = tmp / (itimes + 1); 86 | break; 87 | } 88 | } 89 | } 90 | else if(flag == 1) 91 | { 92 | tmp = 0.0; 93 | for(itimes = 0; itimes < max_itimes; itimes++) 94 | { 95 | running_time = get_real_secv(); 96 | handmade_dmatmul(mc, ma, mb, dim); 97 | tmp += get_real_secv() - running_time; // end 98 | if(tmp > 0.0) 99 | { 100 | running_time = tmp / (itimes + 1); 101 | break; 102 | } 103 | } 104 | 105 | } 106 | 107 | // print 108 | /* for(i = 0; i < dim; i++) 109 | { 110 | printf("%3d: ", i); 111 | for(j = 0; j < dim; j++) 112 | printf("%10f ", *(mc + i * dim + j)); 113 | printf("\n"); 114 | } 115 | */ 116 | // printf("dim = %d, Running Time(sec) = %f\n", dim, running_time); 117 | gflops = 2.0 * (double)dim * (double)dim * (double)dim / running_time / 1024.0 / 1024.0 / 1024.0; 118 | // printf("%f Gflops\n", gflops); 119 | 120 | free(ma); 121 | free(mb); 122 | free(mc); 123 | 124 | return gflops; 125 | } 126 | 127 | // GPGPU 128 | #ifdef USE_CUDA 129 | 130 | // GPU上に行列格納領域を確保 131 | void *mycuda_calloc(int num_elements, size_t size_element) 132 | { 133 | cudaError_t cuda_error; 134 | void *ret = NULL; 135 | 136 | cuda_error = cudaMalloc((void **)&ret, num_elements * size_element); 137 | 138 | if(cuda_error != cudaSuccess) 139 | { 140 | printf("device memory allocation failed!(num_elements = %d, size = %d)\n", num_elements, size_element); 141 | return NULL; 142 | } 143 | 144 | return ret; 145 | } 146 | 147 | // GPU上のメモリ領域を解放 148 | void mycuda_free(void *mem) 149 | { 150 | cudaFree(mem); 151 | } 152 | 153 | // flag = 0 ... use DGEMM in cuBLAS 154 | // flag = 1 ... use magma_dgemm in MAGMA 155 | double get_gflops_cuda_dgemm(int flag, int dim, double *total_running_time) 156 | { 157 | int i, j; // dimension of vectors 158 | double *ma, *mb, *mc, *mc_host; // on CPU 159 | double *dev_ma, *dev_mb, *dev_mc; // on GPU 160 | double alpha, beta; 161 | double running_time, tmp_time, tmp_total_time; 162 | double gflops; 163 | int itimes, max_itimes = 10000; 164 | cublasStatus_t status; 165 | cublasHandle_t handle; 166 | 167 | // Initialize on CPU 168 | ma = (double *)calloc(dim * dim, sizeof(double)); 169 | mb = (double *)calloc(dim * dim, sizeof(double)); 170 | mc = (double *)calloc(dim * dim, sizeof(double)); 171 | mc_host = (double *)calloc(dim * dim, sizeof(double)); 172 | 173 | // Initialize on GPU 174 | dev_ma = (double *)mycuda_calloc(dim * dim, sizeof(double)); 175 | dev_mb = (double *)mycuda_calloc(dim * dim, sizeof(double)); 176 | dev_mc = (double *)mycuda_calloc(dim * dim, sizeof(double)); 177 | 178 | // input ma and mb 179 | for(j = 0; j < dim; j++) 180 | { 181 | for(i = 0; i < dim; i++) 182 | { 183 | // column major 184 | ma[i + j * dim] = sqrt(2.0) * (i + j + 1); 185 | mb[i + j * dim] = sqrt(2.0) * (dim * 2 - (i + j + 1)); 186 | mc[i + j * dim] = 0.0; 187 | mc_host[i + j * dim] = 0.0; 188 | } 189 | } 190 | 191 | // set matrix 192 | status = cublasCreate(&handle); 193 | if(status != CUBLAS_STATUS_SUCCESS) 194 | { 195 | printf("cuBLASの初期化に失敗しました。\n"); 196 | 197 | mycuda_free(dev_ma); 198 | mycuda_free(dev_mb); 199 | mycuda_free(dev_mc); 200 | cublasDestroy(handle); 201 | 202 | return 0; 203 | } 204 | 205 | /* status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim); 206 | if(status != CUBLAS_STATUS_SUCCESS) 207 | printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n"); 208 | status = cublasSetMatrix(dim, dim, sizeof(double), mb, dim, dev_mb, dim); 209 | if(status != CUBLAS_STATUS_SUCCESS) 210 | printf("mb -> dev_mb: cublasSetMatrix失敗しました。\n"); 211 | */ 212 | 213 | // cublasDgemm 214 | if(flag == 0) 215 | { 216 | // mc := 1.0 * ma * mb + 0.0 * mc 217 | alpha = 1.0; // on CPU 218 | beta = 0.0; // on CPU 219 | tmp_time = 0.0; 220 | tmp_total_time = 0.0; 221 | for(itimes = 0; itimes < max_itimes; itimes++) 222 | { 223 | tmp_total_time = get_real_secv(); 224 | 225 | status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim); 226 | if(status != CUBLAS_STATUS_SUCCESS) 227 | printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n"); 228 | status = cublasSetMatrix(dim, dim, sizeof(double), mb, dim, dev_mb, dim); 229 | if(status != CUBLAS_STATUS_SUCCESS) 230 | printf("mb -> dev_mb: cublasSetMatrix失敗しました。\n"); 231 | 232 | tmp_time = get_real_secv(); 233 | status = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, dim, dim, dim, &alpha, dev_ma, dim, dev_mb, dim, &beta, dev_mc, dim); 234 | //magma_dgemm(MagmaNoTrans, MagmaNoTrans, dim, dim, dim, alpha, dev_ma, dim, dev_mb, dim, beta, dev_mc, dim); 235 | cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須! 236 | running_time += get_real_secv() - tmp_time; // end 237 | if(status != CUBLAS_STATUS_SUCCESS) 238 | printf("cublasDgemm失敗しました。\n"); 239 | 240 | //printf("%f \n", get_real_secv() - running_time); 241 | 242 | status = cublasGetMatrix(dim, dim, sizeof(double), dev_mc, dim, mc, dim); 243 | if(status != CUBLAS_STATUS_SUCCESS) 244 | printf("dev_mc -> mc: cublasGetMatrix失敗しました。\n"); 245 | 246 | *total_running_time += get_real_secv() - tmp_total_time; 247 | 248 | if(*total_running_time > 1.0) 249 | break; 250 | } 251 | } 252 | #ifdef USE_MAGMA 253 | // magmablas_dgemm 254 | else if(flag == 1) 255 | { 256 | // mc := 1.0 * ma * mb + 0.0 * mc 257 | alpha = 1.0; // on CPU 258 | beta = 0.0; // on CPU 259 | tmp_time = 0.0; 260 | tmp_total_time = 0.0; 261 | for(itimes = 0; itimes < max_itimes; itimes++) 262 | { 263 | tmp_total_time = get_real_secv(); 264 | 265 | status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim); 266 | if(status != CUBLAS_STATUS_SUCCESS) 267 | printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n"); 268 | status = cublasSetMatrix(dim, dim, sizeof(double), mb, dim, dev_mb, dim); 269 | if(status != CUBLAS_STATUS_SUCCESS) 270 | printf("mb -> dev_mb: cublasSetMatrix失敗しました。\n"); 271 | 272 | tmp_time = get_real_secv(); 273 | magma_dgemm(MagmaNoTrans, MagmaNoTrans, dim, dim, dim, alpha, dev_ma, dim, dev_mb, dim, beta, dev_mc, dim); 274 | cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須! 275 | running_time += get_real_secv() - tmp_time; // end 276 | if(status != CUBLAS_STATUS_SUCCESS) 277 | printf("cublasDgemm失敗しました。\n"); 278 | 279 | //printf("%f \n", get_real_secv() - running_time); 280 | 281 | status = cublasGetMatrix(dim, dim, sizeof(double), dev_mc, dim, mc, dim); 282 | if(status != CUBLAS_STATUS_SUCCESS) 283 | printf("dev_mc -> mc: cublasGetMatrix失敗しました。\n"); 284 | 285 | *total_running_time += get_real_secv() - tmp_total_time; 286 | 287 | if(*total_running_time > 1.0) 288 | break; 289 | } 290 | } 291 | #endif // USE_MAGMA 292 | 293 | running_time /= (itimes + 1); 294 | *total_running_time /= (itimes + 1); 295 | //printf("itimes = %d, total_time, running_time = %f, %f \n", itimes, *total_running_time, running_time); 296 | 297 | // on CPU 298 | cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, mb, dim, beta, mc_host, dim); 299 | // ||mc_host - mc||_F 300 | cblas_daxpy(dim * dim, -1.0, mc, 1, mc_host, 1); 301 | //printf("||mc - mc_host||_F = %25.17e\n", cblas_dnrm2(dim * dim, mc_host, 1) / cblas_dnrm2(dim * dim, mc, 1)); 302 | 303 | // print 304 | /* for(i = 0; i < dim; i++) 305 | { 306 | printf("%3d: ", i); 307 | for(j = 0; j < dim; j++) 308 | printf("%10f ", *(mc + i * dim + j)); 309 | printf("\n"); 310 | } 311 | */ 312 | // printf("dim = %d, Running Time(sec) = %lf (%lf ms)\n", dim, running_time, running_time * 1000); 313 | gflops = 2.0 * (double)dim * (double)dim * (double)dim / running_time / 1024.0 / 1024.0 / 1024.0; 314 | //printf("%f Gflops\n", gflops); 315 | 316 | // free on CPU 317 | free(ma); 318 | free(mb); 319 | free(mc); 320 | 321 | // free on GPU 322 | mycuda_free(dev_ma); 323 | mycuda_free(dev_mb); 324 | mycuda_free(dev_mc); 325 | 326 | cublasDestroy(handle); 327 | 328 | return gflops; 329 | } 330 | #endif 331 | 332 | int main() 333 | { 334 | int start_dim, end_dim, step_dim, dim; 335 | char str_mkl_version[1024]; 336 | int max_num_threads; 337 | double total_time[2] = {0.0, 0.0}; 338 | 339 | // print MKL version 340 | #ifndef USE_IMKL 341 | MKL_Get_Version_String(str_mkl_version, 1024); 342 | printf("%s\n", str_mkl_version); 343 | 344 | max_num_threads = mkl_get_max_threads(); 345 | printf("Max Number of Threads: %d\n", max_num_threads); 346 | mkl_set_num_threads(max_num_threads); 347 | #endif 348 | 349 | // input dimension 350 | printf("Start Dim = "); scanf("%d", &start_dim); 351 | printf("End Dim = "); scanf("%d", &end_dim); 352 | printf("Step Dim = "); scanf("%d", &step_dim); 353 | 354 | printf(" DIM DGEMM(GFlops)"); 355 | #ifdef USE_CUDA 356 | printf(" cuBLAS(GFlops)"); 357 | #endif 358 | #ifdef USE_MAGMA 359 | printf(" magmablas(GFlops)"); 360 | printf(" Total time(sec)"); 361 | #endif 362 | printf("\n"); 363 | 364 | for(dim = start_dim; dim <= end_dim; dim += step_dim) 365 | { 366 | printf("%5d %10.3f", dim, get_gflops_dgemm(0, dim)); 367 | #ifdef USE_CUDA 368 | printf(" %10.3f", get_gflops_cuda_dgemm(0, dim, &total_time[0])); 369 | #endif 370 | #ifdef USE_MAGMA 371 | printf(" %10.3f", get_gflops_cuda_dgemm(1, dim, &total_time[1])); 372 | #endif 373 | if(total_time[0] > 0.0) 374 | printf(" %10.3f", total_time[0]); 375 | if(total_time[1] > 0.0) 376 | printf(" %10.3f", total_time[1]); 377 | printf("\n"); 378 | } 379 | 380 | return EXIT_SUCCESS; 381 | } 382 | -------------------------------------------------------------------------------- /dgemv_bench_mt.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of BLAS Level 2 */ 4 | /* with Intel Math Kernel, */ 5 | /* cuBLAS and MAGMA */ 6 | /* Last Update: 2015-03-27 (Fri) T.Kouya */ 7 | /*************************************************/ 8 | #include 9 | #include 10 | #include "get_secv.h" 11 | #ifdef USE_IMKL 12 | #include "mkl.h" 13 | #include "mkl_cblas.h" // for Intel Math Kernel Library 14 | #else 15 | #include "cblas.h" 16 | #endif 17 | 18 | #ifdef USE_CUDA 19 | // mycuda_calloc, mycuda_free関数 20 | #include "mycuda.h" 21 | #include "cublas_v2.h" 22 | #ifdef USE_MAGMA 23 | #include "magma.h" 24 | #endif 25 | #endif 26 | 27 | // column major only 28 | // my_matvec_mul: vec_b := mat_a * vec_x 29 | void my_matvec_mul_col(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x) 30 | { 31 | int i, j; 32 | 33 | // メインループ 34 | for(i = 0; i < row_dim; i++) 35 | { 36 | vec_b[i] = 0.0; 37 | for(j = 0; j < col_dim; j++) 38 | vec_b[i] += mat_a[i + j * row_dim] * vec_x[j]; 39 | } 40 | } 41 | 42 | // flag = 0 ... use DGEMV in BLAS 43 | // flag = 1 ... use handmade mavec_mul 44 | double get_gflops_dgemv(int flag, int dim, double *total_running_time, int *iterative_times) 45 | { 46 | #ifdef USE_ATLAS 47 | int i, j; 48 | #else 49 | MKL_INT i, j; // dimension of vectors 50 | #endif 51 | double *ma, *vb, *vc; 52 | double alpha, beta; 53 | double running_time; 54 | double gflops; 55 | int inc_vb, inc_vc; 56 | int itimes, max_itimes = 200000; 57 | double tmp_time; 58 | 59 | // Initialize 60 | ma = (double *)calloc(dim * dim, sizeof(double)); 61 | vb = (double *)calloc(dim, sizeof(double)); 62 | vc = (double *)calloc(dim, sizeof(double)); 63 | 64 | // row major 65 | for(i = 0; i < dim; i++) 66 | { 67 | for(j = 0; j < dim; j++) 68 | ma[i + j * dim] = sqrtf(2.0) * (i + j + 1); 69 | 70 | vb[i] = sqrtf(2.0) * (dim * 2 - i); 71 | } 72 | 73 | if(flag == 0) 74 | { 75 | // vc := 1.0 * ma * vb + 0.0 * vc 76 | alpha = 1.0; 77 | beta = 0.0; 78 | tmp_time = 0.0; 79 | inc_vb = 1; 80 | inc_vc = 1; 81 | for(itimes = 0; itimes < max_itimes; itimes++) 82 | { 83 | running_time = get_real_secv(); 84 | cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, ma, dim, vb, inc_vb, beta, vc, inc_vc); 85 | tmp_time += get_real_secv() - running_time; // end 86 | if((tmp_time > 1.0) && (itimes > 5)) 87 | { 88 | running_time = tmp_time / (itimes + 1); 89 | break; 90 | } 91 | } 92 | *total_running_time = tmp_time; 93 | *iterative_times = itimes; 94 | } 95 | else if(flag == 1) 96 | { 97 | tmp_time = 0.0; 98 | for(itimes = 0; itimes < max_itimes; itimes++) 99 | { 100 | running_time = get_real_secv(); 101 | my_matvec_mul_col(vc, ma, dim, dim, vb); 102 | tmp_time += get_real_secv() - running_time; // end 103 | if((tmp_time > 1.0) && (itimes > 5)) 104 | { 105 | running_time = tmp_time / (itimes + 1); 106 | break; 107 | } 108 | } 109 | *total_running_time = tmp_time; 110 | *iterative_times = itimes; 111 | } 112 | 113 | // print 114 | /* for(i = 0; i < dim; i++) 115 | { 116 | printf("%3d: ", i); 117 | printf("%10f ", vc[i]); 118 | printf("\n"); 119 | } 120 | */ 121 | // printf("dim = %d, Running Time(sec) = %f\n", dim, running_time); 122 | gflops = (2.0 * (double)dim * (double)dim - (double)dim) / running_time / 1024.0 / 1024.0 / 1024.0; 123 | // printf("%f Gflops\n", gflops); 124 | 125 | free(ma); 126 | free(vb); 127 | free(vc); 128 | 129 | return gflops; 130 | } 131 | 132 | // GPGPU 133 | #ifdef USE_CUDA 134 | 135 | // flag = 0 ... use DGEMV in cuBLAS 136 | // flag = 1 ... use magma_dgemv in MAGMA 137 | double get_gflops_cuda_dgemv(int flag, int dim, double *total_running_time, int *iterative_times) 138 | { 139 | int i, j; // dimension of vectors 140 | double *ma, *vb, *vc, *vc_host; // on CPU 141 | double *dev_ma, *dev_vb, *dev_vc; // on GPU 142 | double alpha, beta; 143 | double running_time, tmp_time, tmp_total_time; 144 | double gflops; 145 | int inc_vb, inc_vc; 146 | int itimes, max_itimes = 10000; 147 | cublasStatus_t status; 148 | cublasHandle_t handle; 149 | 150 | // Initialize on CPU 151 | ma = (double *)calloc(dim * dim, sizeof(double)); 152 | vb = (double *)calloc(dim, sizeof(double)); 153 | vc = (double *)calloc(dim, sizeof(double)); 154 | vc_host = (double *)calloc(dim, sizeof(double)); 155 | 156 | // Initialize on GPU 157 | dev_ma = (double *)mycuda_calloc(dim * dim, sizeof(double)); 158 | dev_vb = (double *)mycuda_calloc(dim, sizeof(double)); 159 | dev_vc = (double *)mycuda_calloc(dim, sizeof(double)); 160 | 161 | // input ma and mb 162 | for(j = 0; j < dim; j++) 163 | { 164 | for(i = 0; i < dim; i++) 165 | { 166 | // column major 167 | ma[i + j * dim] = sqrt(2.0) * (i + j + 1); 168 | } 169 | vb[i] = sqrt(2.0) * (dim * 2 - i); 170 | vc[i] = 0.0; 171 | vc_host[i] = 0.0; 172 | } 173 | 174 | // set matrix 175 | status = cublasCreate(&handle); 176 | if(status != CUBLAS_STATUS_SUCCESS) 177 | { 178 | printf("cuBLASの初期化に失敗しました。\n"); 179 | 180 | mycuda_free(dev_ma); 181 | mycuda_free(dev_vb); 182 | mycuda_free(dev_vc); 183 | cublasDestroy(handle); 184 | 185 | return 0; 186 | } 187 | 188 | inc_vb = 1; 189 | inc_vc = 1; 190 | 191 | // cublasDgemm 192 | if(flag == 0) 193 | { 194 | // vc := 1.0 * ma * vb + 0.0 * vc 195 | alpha = 1.0; // on CPU 196 | beta = 0.0; // on CPU 197 | tmp_time = 0.0; 198 | tmp_total_time = 0.0; 199 | for(itimes = 0; itimes < max_itimes; itimes++) 200 | { 201 | tmp_total_time = get_real_secv(); 202 | 203 | status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim); 204 | if(status != CUBLAS_STATUS_SUCCESS) 205 | printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n"); 206 | status = cublasSetVector(dim, sizeof(double), (void *)vb, inc_vb, (void *)dev_vb, inc_vb); 207 | if(status != CUBLAS_STATUS_SUCCESS) 208 | printf("vb -> dev_vb: cublasSetVector失敗しました。\n"); 209 | 210 | tmp_time = get_real_secv(); 211 | status = cublasDgemv(handle, CUBLAS_OP_N, dim, dim, &alpha, dev_ma, dim, dev_vb, inc_vb, &beta, dev_vc, inc_vc); 212 | //magma_dgemv(MagmaNoTrans, dim, dim, alpha, dev_ma, dim, dev_vb, inv_vb, beta, dev_vc, inc_vc); 213 | cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須! 214 | running_time += get_real_secv() - tmp_time; // end 215 | if(status != CUBLAS_STATUS_SUCCESS) 216 | printf("cublasDgemv失敗しました。\n"); 217 | 218 | //printf("%f \n", get_real_secv() - running_time); 219 | 220 | status = cublasGetVector(dim, sizeof(double), dev_vc, inc_vc, vc, inc_vc); 221 | if(status != CUBLAS_STATUS_SUCCESS) 222 | printf("dev_vc -> vc: cublasGetMatrix失敗しました。\n"); 223 | 224 | *total_running_time += get_real_secv() - tmp_total_time; 225 | 226 | if((*total_running_time > 1.0) && (itimes > 5)) 227 | break; 228 | } 229 | 230 | *iterative_times = itimes; 231 | } 232 | #ifdef USE_MAGMA 233 | // magmablas_dgemm 234 | else if(flag == 1) 235 | { 236 | // initialize 237 | magma_init(); 238 | 239 | // mc := 1.0 * ma * mb + 0.0 * mc 240 | alpha = 1.0; // on CPU 241 | beta = 0.0; // on CPU 242 | tmp_time = 0.0; 243 | tmp_total_time = 0.0; 244 | for(itimes = 0; itimes < max_itimes; itimes++) 245 | { 246 | tmp_total_time = get_real_secv(); 247 | 248 | status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim); 249 | if(status != CUBLAS_STATUS_SUCCESS) 250 | printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n"); 251 | status = cublasSetVector(dim, sizeof(double), vb, inc_vb, dev_vb, inc_vb); 252 | if(status != CUBLAS_STATUS_SUCCESS) 253 | printf("vb -> dev_vb: cublasSetVector失敗しました。\n"); 254 | 255 | tmp_time = get_real_secv(); 256 | magma_dgemv(MagmaNoTrans, dim, dim, alpha, dev_ma, dim, dev_vb, inc_vb, beta, dev_vc, inc_vc); 257 | cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須! 258 | running_time += get_real_secv() - tmp_time; // end 259 | 260 | //printf("%f \n", get_real_secv() - running_time); 261 | 262 | status = cublasGetVector(dim, sizeof(double), dev_vc, inc_vc, vc, inc_vc); 263 | if(status != CUBLAS_STATUS_SUCCESS) 264 | printf("dev_vc -> vc: cublasGetVector失敗しました。\n"); 265 | 266 | *total_running_time += get_real_secv() - tmp_total_time; 267 | 268 | if((*total_running_time > 1.0) && (itimes > 5)) 269 | break; 270 | } 271 | 272 | // finalize 273 | magma_finalize(); 274 | 275 | *iterative_times = itimes; 276 | } 277 | #endif // USE_MAGMA 278 | 279 | running_time /= (itimes + 1); 280 | *total_running_time /= (itimes + 1); 281 | //printf("itimes = %d, total_time, running_time = %f, %f \n", itimes, *total_running_time, running_time); 282 | 283 | // on CPU 284 | cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, ma, dim, vb, inc_vb, beta, vc_host, inc_vc); 285 | // ||vc_host - vc||_F 286 | cblas_daxpy(dim, -1.0, vc, 1, vc_host, 1); 287 | //printf("||vc - vc_host||_F = %25.17e\n", cblas_snrm2(dim, vc_host, 1) / cblas_snrm2(dim, vc, 1)); 288 | 289 | // print 290 | /* for(i = 0; i < dim; i++) 291 | { 292 | printf("%3d: ", i); 293 | for(j = 0; j < dim; j++) 294 | printf("%10f ", *(mc + i * dim + j)); 295 | printf("\n"); 296 | } 297 | */ 298 | // printf("dim = %d, Running Time(sec) = %lf (%lf ms)\n", dim, running_time, running_time * 1000); 299 | gflops = (2.0 * (double)dim * (double)dim - (double)dim) / running_time / 1024.0 / 1024.0 / 1024.0; 300 | //printf("%f Gflops\n", gflops); 301 | 302 | // free on CPU 303 | free(ma); 304 | free(vb); 305 | free(vc); 306 | 307 | // free on GPU 308 | mycuda_free(dev_ma); 309 | mycuda_free(dev_vb); 310 | mycuda_free(dev_vc); 311 | 312 | cublasDestroy(handle); 313 | 314 | return gflops; 315 | } 316 | #endif 317 | 318 | #ifdef USE_ATLAS 319 | #define ALT_NTHREADS 4 320 | #endif 321 | 322 | int main() 323 | { 324 | int start_dim, end_dim, step_dim, dim; 325 | char str_mkl_version[1024]; 326 | int max_num_threads; 327 | double total_time[3] = {0.0, 0.0, 0.0}; 328 | int iterative_times[3] = {0, 0, 0}; 329 | 330 | // print MKL version 331 | #ifdef USE_IMKL 332 | MKL_Get_Version_String(str_mkl_version, 1024); 333 | printf("%s\n", str_mkl_version); 334 | 335 | max_num_threads = mkl_get_max_threads(); 336 | printf("Max Number of Threads: %d\n", max_num_threads); 337 | mkl_set_num_threads(max_num_threads); 338 | #endif 339 | 340 | // input dimension 341 | printf("Start Dim = "); scanf("%d", &start_dim); 342 | printf("End Dim = "); scanf("%d", &end_dim); 343 | printf("Step Dim = "); scanf("%d", &step_dim); 344 | 345 | printf(" DIM DGEMV(GFlops)"); 346 | #ifdef USE_CUDA 347 | printf(" cuBLAS(GFlops)"); 348 | #endif 349 | #ifdef USE_MAGMA 350 | printf(" magmablas(GFlops)"); 351 | printf(" Total time(sec)"); 352 | #endif 353 | printf("\n"); 354 | 355 | for(dim = start_dim; dim <= end_dim; dim += step_dim) 356 | { 357 | printf("%5d %10.3lg", dim, get_gflops_dgemv(0, dim, &total_time[0], &iterative_times[0])); 358 | #ifdef USE_CUDA 359 | printf(" %10.3lg", get_gflops_cuda_dgemv(0, dim, &total_time[1], &iterative_times[1])); 360 | #endif 361 | #ifdef USE_MAGMA 362 | printf(" %10.3lg", get_gflops_cuda_dgemv(1, dim, &total_time[2], &iterative_times[2])); 363 | #endif 364 | if(total_time[0] > 0.0) 365 | printf(" %10.3lg(%6d)", total_time[0], iterative_times[0]); 366 | if(total_time[1] > 0.0) 367 | printf(" %10.3lg(%6d)", total_time[1], iterative_times[1]); 368 | if(total_time[2] > 0.0) 369 | printf(" %10.3lg(%6d)", total_time[2], iterative_times[2]); 370 | printf("\n"); 371 | } 372 | 373 | return EXIT_SUCCESS; 374 | } 375 | -------------------------------------------------------------------------------- /first.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* First programming with float & double types */ 4 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | 9 | int main() 10 | { 11 | float sc = 0.0, sa = -2.0, sb = 3.0; 12 | double dc = 0.0, da = -2.0, db = 3.0; 13 | double relerr; 14 | 15 | // basic arithmetic: float 16 | printf("--- float data type(single precsion floating-point number) ---\n"); 17 | sc = sa + sb; 18 | printf("%25.17e := %25.17e + %25.17e\n", sc, sa, sb); 19 | sc = sa - sb; 20 | printf("%25.17e := %25.17e - %25.17e\n", sc, sa, sb); 21 | sc = sa * sb; 22 | printf("%25.17e := %25.17e * %25.17e\n", sc, sa, sb); 23 | sc = sa / sb; 24 | printf("%25.17e := %25.17e / %25.17e\n", sc, sa, sb); 25 | 26 | // absolute value and square root: float 27 | sc = fabsf(sa); 28 | printf("%25.17e := |%25.17e|\n", sc, sa); 29 | sc = sqrtf(sb); 30 | printf("%25.17e := sqrt(%25.17e)\n", sc, sb); 31 | 32 | // basic arithmetic: double 33 | printf("--- double data type(double precsion floating-point number) ---\n"); 34 | dc = da + db; 35 | printf("%25.17e := %25.17e + %25.17e\n", dc, da, db); 36 | dc = da - db; 37 | printf("%25.17e := %25.17e - %25.17e\n", dc, da, db); 38 | dc = da * db; 39 | printf("%25.17e := %25.17e * %25.17e\n", dc, da, db); 40 | dc = da / db; 41 | printf("%25.17e := %25.17e / %25.17e\n", dc, da, db); 42 | 43 | // absolute value and square root: double 44 | dc = fabs(da); 45 | printf("%25.17e := |%25.17e|\n", dc, da); 46 | dc = sqrt(db); 47 | printf("%25.17e := sqrt(%25.17e)\n", dc, db); 48 | 49 | // relative error of single precision square root 50 | relerr = fabs(sc - dc); 51 | if(fabs(dc) > 0.0) 52 | relerr /= fabs(dc); 53 | 54 | printf("Single Prec. : %25.17e\n", sc); 55 | printf("Double Prec. : %25.17e\n", dc); 56 | printf("Relative Error: %10.3e\n", relerr); 57 | 58 | return 0; 59 | } 60 | -------------------------------------------------------------------------------- /get_sec.c: -------------------------------------------------------------------------------- 1 | /**********************************************/ 2 | /* get_sec.c: */ 3 | /* Copyright (C) 2003-2016 Tomonori Kouya */ 4 | /* */ 5 | /* This library is free software; you can re- */ 6 | /* distribute it and/or modify it under the */ 7 | /* terms of the GNU Lesser General Public */ 8 | /* License as published by the Free Software */ 9 | /* Foundation; either version 2.1 of the */ 10 | /* License, or (at your option) any later */ 11 | /* version. */ 12 | /* */ 13 | /* This library is distributed in the hope */ 14 | /* that it will be useful, but WITHOUT ANY */ 15 | /* WARRANTY; without even the implied */ 16 | /* warranty of MERCHANTABILITY or FITNESS FOR */ 17 | /* A PARTICULAR PURPOSE. See the GNU Lesser */ 18 | /* General Public License for more details. */ 19 | /**********************************************/ 20 | #include 21 | #include 22 | 23 | #ifdef WIN32 24 | #include 25 | #else 26 | #include 27 | #include 28 | #ifndef CLK_TCK 29 | #include 30 | #define CLK_TCK (sysconf(_SC_CLK_TCK)) 31 | #endif 32 | #endif 33 | 34 | #include "get_secv.h" 35 | 36 | #define DIVTIMES 2 37 | 38 | //#define USE_CLOCK 39 | 40 | /* flag == 0: No print */ 41 | double get_sec(int flag) 42 | { 43 | #ifdef WIN32 44 | static int first = 1; 45 | static LARGE_INTEGER _tstart; 46 | static LARGE_INTEGER freq; 47 | 48 | if(first) { 49 | QueryPerformanceFrequency(&freq); 50 | first = 0; 51 | } 52 | QueryPerformanceCounter(&_tstart); 53 | return ((double)_tstart.QuadPart)/((double)freq.QuadPart); 54 | #else 55 | double ret; 56 | 57 | struct tms tmp; 58 | 59 | #ifdef USE_CLOCK 60 | if(flag != 0) 61 | printf("Time : %d / %d\n", (int)clock(), CLOCKS_PER_SEC); 62 | ret = (double)(clock()) / CLOCKS_PER_SEC; 63 | #else 64 | #ifdef USE_MPFRTIME 65 | //int 66 | //cputime () 67 | //{ 68 | #include 69 | #include 70 | // 71 | struct rusage rus; 72 | 73 | getrusage (0, &rus); 74 | //return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000; 75 | ret = rus.ru_utime.tv_sec + rus.ru_utime.tv_usec / 1000 / 1000; 76 | //} 77 | #else 78 | times(&tmp); 79 | if(flag != 0) 80 | { 81 | printf("User Time : %ld / %ld\n", tmp.tms_utime, CLK_TCK); 82 | printf("System Time: %ld / %ld\n", tmp.tms_stime, CLK_TCK); 83 | printf("CUser Time : %ld / %ld\n", tmp.tms_cutime, CLK_TCK); 84 | printf("CSystem Time: %ld / %ld\n", tmp.tms_cstime, CLK_TCK); 85 | printf("Ret(utime+stime) : %g\n", (double)(tmp.tms_utime + tmp.tms_stime) / CLK_TCK); 86 | printf("Ret(cutime+cstime) : %g\n", (double)(tmp.tms_cutime + tmp.tms_cstime) / CLK_TCK); 87 | } 88 | ret = (double)(tmp.tms_utime + tmp.tms_stime) / CLK_TCK; 89 | #endif 90 | #endif 91 | return ret; 92 | #endif 93 | } 94 | 95 | /* double get_secv(void) */ 96 | double get_secv(void) 97 | { 98 | return get_sec(0); 99 | } 100 | 101 | /* float fget_sec(int flag) */ 102 | float fget_sec(int flag) 103 | { 104 | return (float)get_sec(flag); 105 | } 106 | 107 | /* float fget_sec(void) */ 108 | float fget_secv(void) 109 | { 110 | return (float)get_sec(0); 111 | } 112 | 113 | /* get REAL time */ 114 | /* for Multi-threads programming */ 115 | double get_real_sec(int flag) 116 | { 117 | #ifdef WIN32 118 | static int first = 1; 119 | static LARGE_INTEGER _tstart; 120 | static LARGE_INTEGER freq; 121 | 122 | if(first) { 123 | QueryPerformanceFrequency(&freq); 124 | first = 0; 125 | } 126 | QueryPerformanceCounter(&_tstart); 127 | return ((double)_tstart.QuadPart)/((double)freq.QuadPart); 128 | #else 129 | double ret; 130 | struct timeval tmp; 131 | 132 | gettimeofday(&tmp, NULL); 133 | if(flag != 0) 134 | { 135 | printf("tv_sec : %ld\n", tmp.tv_sec); 136 | printf("tv_usec: %ld\n", tmp.tv_usec); 137 | printf("Ret : %g\n", (double)tmp.tv_sec + (double)tmp.tv_usec / 1000.0 / 1000.0); 138 | } 139 | ret = (double)tmp.tv_sec + (double)tmp.tv_usec / 1000.0 / 1000.0; 140 | 141 | return ret; 142 | #endif 143 | } 144 | 145 | double get_real_secv(void) 146 | { 147 | return get_real_sec(0); 148 | } 149 | 150 | float fget_real_sec(int flag) 151 | { 152 | return (float)get_real_sec(flag); 153 | } 154 | 155 | float fget_real_secv(void) 156 | { 157 | return (float)get_real_sec(0); 158 | } 159 | -------------------------------------------------------------------------------- /get_secv.h: -------------------------------------------------------------------------------- 1 | /**********************************************/ 2 | /* get_secv.h: */ 3 | /* Copyright (C) 2003-2016 Tomonori Kouya */ 4 | /* */ 5 | /* This library is free software; you can re- */ 6 | /* distribute it and/or modify it under the */ 7 | /* terms of the GNU Lesser General Public */ 8 | /* License as published by the Free Software */ 9 | /* Foundation; either version 2.1 of the */ 10 | /* License, or (at your option) any later */ 11 | /* version. */ 12 | /* */ 13 | /* This library is distributed in the hope */ 14 | /* that it will be useful, but WITHOUT ANY */ 15 | /* WARRANTY; without even the implied */ 16 | /* warranty of MERCHANTABILITY or FITNESS FOR */ 17 | /* A PARTICULAR PURPOSE. See the GNU Lesser */ 18 | /* General Public License for more details. */ 19 | /**********************************************/ 20 | #ifndef __TK_GET_SECV_H 21 | #define __TK_GET_SECV_H 22 | 23 | double get_sec(int flag); 24 | 25 | /* double get_secv(void) */ 26 | double get_secv(void); 27 | 28 | /* float fget_sec(int flag) */ 29 | float fget_sec(int flag); 30 | 31 | /* float fget_sec(void) */ 32 | float fget_secv(void); 33 | 34 | /* get REAL time */ 35 | /* for Multi-threads programming */ 36 | double get_real_sec(int flag); 37 | 38 | double get_real_secv(void); 39 | 40 | float fget_real_sec(int flag); 41 | 42 | float fget_real_secv(void); 43 | 44 | #endif // __TK_GET_SECV_H 45 | -------------------------------------------------------------------------------- /integral_eq/Makefile.unix: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Makefile for Linux gcc or icc environment # 4 | # Last Update: 2016-12-02 (Fri) T.Kouya # 5 | #*************************************************# 6 | # Intel C compiler 7 | include ../lapack_icc.inc 8 | 9 | # GNU Compiler Collection 10 | #include ../lapack_gcc.inc 11 | 12 | all: gauss_integral 13 | 14 | get_secv: ../get_secv.h ../get_sec.c 15 | $(CC) $(LAPACKE_INC) -I../ -c ../get_sec.c -o get_sec.o 16 | 17 | gauss_integral: get_secv gauss_integral.c iteration.c ../tkaux.c ../tkaux.h 18 | $(CC) -DDEBUG $(LAPACKE_INC) -I../ gauss_integral.c -o gauss_integral $(LAPACKE_LIB) 19 | $(CC) $(LAPACKE_INC) -I../ -c ../tkaux.c -o tkaux.o 20 | $(CC) $(LAPACKE_INC) -I../ -c gauss_integral.c -o gauss_integral.o 21 | $(CC) $(LAPACKE_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o secant $(LAPACKE_LIB) 22 | $(CC) -DUSE_NEW_SECANT $(LAPACKE_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o new_secant $(LAPACKE_LIB) 23 | 24 | gauss_integral_imkl: get_secv gauss_integral.c iteration.c ../tkaux.c ../tkaux.h 25 | $(CC) $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o secant_imkl $(IMKL_LIB) 26 | $(CC) -DUSE_PARALLEL_DIFFMAT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o secant_imkl_pd $(IMKL_LIB) 27 | $(CC) -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o new_secant_imkl $(IMKL_LIB) 28 | $(CC) -DUSE_PARALLEL_DIFFMAT -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o new_secant_imkl_pd $(IMKL_LIB) 29 | 30 | clean: 31 | -rm *.o 32 | -rm gauss_integral 33 | -rm secant 34 | -rm secant_imkl 35 | -rm secant_imkl_pd 36 | -rm new_secant 37 | -rm new_secant_imkl 38 | -rm new_secant_imkl_pd 39 | -------------------------------------------------------------------------------- /integral_eq/Makefile.win_intel: -------------------------------------------------------------------------------- 1 | include ../lapack_win_intel.inc 2 | 3 | INC = /I..\windows 4 | 5 | all: gauss_integral 6 | 7 | get_secv: ../get_secv.h ../get_sec.c 8 | $(CC) $(LAPACKE_INC) -I../ -c ../get_sec.c -o get_sec.o 9 | 10 | gauss_integral: get_secv gauss_integral.c iteration.c ../tkaux.c ../tkaux.h 11 | $(CC) -DDEBUG $(LAPACKE_INC) -I../ gauss_integral.c -o gauss_integral $(LAPACKE_LIB) 12 | $(CC) $(LAPACKE_INC) -I../ -c ../tkaux.c -o tkaux.obj 13 | $(CC) $(LAPACKE_INC) -I../ -c gauss_integral.c -o gauss_integral.obj 14 | $(CC) $(LAPACKE_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o secant $(LAPACKE_LIB) 15 | $(CC) $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o secant_imkl $(IMKL_LIB) 16 | $(CC) -DUSE_PARALLEL_DIFFMAT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o secant_imkl_pd $(IMKL_LIB) 17 | $(CC) -DUSE_NEW_SECANT $(LAPACKE_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o new_secant $(LAPACKE_LIB) 18 | $(CC) -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o new_secant_imkl $(IMKL_LIB) 19 | $(CC) -DUSE_PARALLEL_DIFFMAT -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o new_secant_imkl_pd $(IMKL_LIB) 20 | 21 | clean: 22 | -del *.obj 23 | -del *.exe 24 | -------------------------------------------------------------------------------- /integral_eq/ex1.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Hammerstein Integral equation */ 4 | /* Last Update: 2016-12-02 (Fri) T.Kouya */ 5 | /*************************************************/ 6 | // 7 | // Example 1: x(s) = 1 + 1/2 * int[0, 1] K(s, t) (|x(t)| + (x(t))^2) dt 8 | // where K(s, t) = (1 - s) * t (t \le s) or (1 - t) * s (s \le t) 9 | // 10 | double get_bij(int i, int j, double *abscissa, double *weight) 11 | { 12 | double ret; 13 | 14 | // abscissa[i] = s_i 15 | // abscissa[j] = t_j 16 | if(j <= i) 17 | ret = weight[j] * abscissa[j] * (1.0 - abscissa[i]); 18 | else 19 | ret = weight[j] * abscissa[i] * (1.0 - abscissa[j]); 20 | 21 | return ret; 22 | } 23 | 24 | void get_bmatrix(double *B, int row_dim, int col_dim, double *abscissa, double *weight) 25 | { 26 | int i, j; 27 | 28 | #ifdef _OPENMP 29 | #pragma omp parallel for private(j) 30 | #endif // _OPENMP 31 | for(i = 0; i < row_dim; i++) 32 | for(j = 0; j < col_dim; j++) 33 | B[i * col_dim + j] = get_bij(i, j, abscissa, weight); 34 | 35 | } 36 | 37 | double *dmat_b; 38 | double *dvec_abscissa, *dvec_weight, *dvec_xhat, *dvec_xsqr; 39 | 40 | void init_derivative_free_iteration_dvector(int dim) 41 | { 42 | dvec_abscissa = (double *)calloc(dim, sizeof(double)); 43 | dvec_weight = (double *)calloc(dim, sizeof(double)); 44 | dvec_xhat = (double *)calloc(dim, sizeof(double)); 45 | dvec_xsqr = (double *)calloc(dim, sizeof(double)); 46 | 47 | gauss_integral_eig_d(dvec_abscissa, dvec_weight, dim, GAUSS_LEGENDRE); 48 | dshifted_gauss_legendre(0.0, 1.0, dvec_abscissa, dvec_weight, dim); 49 | 50 | dmat_b = (double *)calloc(dim * dim, sizeof(double)); 51 | 52 | get_bmatrix(dmat_b, dim, dim, dvec_abscissa, dvec_weight); 53 | 54 | } 55 | 56 | void free_derivative_free_iteration_dvector(void) 57 | { 58 | free(dvec_abscissa); 59 | free(dvec_weight); 60 | free(dvec_xhat); 61 | free(dvec_xsqr); 62 | 63 | free(dmat_b); 64 | } 65 | 66 | // ret = |x| 67 | void abs_dvector(double *ret, double *x, int dim) 68 | { 69 | int i; 70 | 71 | #ifdef _OPENMP 72 | #pragma omp parallel for 73 | #endif // _OPENMP 74 | for(i = 0; i < dim; i++) 75 | ret[i] = fabs(x[i]); 76 | } 77 | 78 | // ret = x^2 79 | void sqr_dvector(double *ret, double *x, int dim) 80 | { 81 | int i; 82 | 83 | #ifdef _OPENMP 84 | #pragma omp parallel for 85 | #endif // _OPENMP 86 | for(i = 0; i < dim; i++) 87 | ret[i] = x[i] * x[i]; 88 | } 89 | 90 | // ret = 1 91 | void allone_dvector(double *ret, int dim) 92 | { 93 | int i; 94 | 95 | #ifdef _OPENMP 96 | #pragma omp parallel for 97 | #endif // _OPENMP 98 | for(i = 0; i < dim; i++) 99 | ret[i] = 1.0; 100 | } 101 | 102 | // double vfunc_index(int, DVector) 103 | double vf_index(int index, double *x, int dim) 104 | { 105 | double ret, tmp; 106 | int j; 107 | 108 | // 0.5 * B * (x_hat + x_sqr) 109 | tmp = 0.0; 110 | #ifdef _OPENMP 111 | #pragma omp parallel for reduction(+:tmp) 112 | #endif // _OPENMP 113 | for(j = 0; j < dim; j++) 114 | tmp += dmat_b[index * dim + j] * (fabs(x[j]) + x[j] * x[j]); 115 | 116 | ret = x[index] - 1.0 - 0.5 * tmp; 117 | 118 | return ret; 119 | } 120 | 121 | 122 | // void vfunc(DVector, DVector) 123 | void vf(double *ret_vec, double *x, int dim) 124 | { 125 | double ret; 126 | int i; 127 | double *tmp_vec; 128 | 129 | tmp_vec = (double *)calloc(dim, sizeof(double)); 130 | 131 | // 0.5 * B * (x_hat + x_sqr) 132 | abs_dvector(dvec_xhat, x, dim); 133 | sqr_dvector(dvec_xsqr, x, dim); 134 | cblas_daxpy(dim, 1.0, dvec_xhat, 1, dvec_xsqr, 1); 135 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, dmat_b, dim, dvec_xsqr, 1, 0.0, ret_vec, 1); 136 | cblas_dscal(dim, 0.5, ret_vec, 1); 137 | 138 | // x - 1 - 0.5 * B * (x_hat + x_sqr) 139 | allone_dvector(tmp_vec, dim); 140 | cblas_daxpy(dim, 1.0, ret_vec, 1, tmp_vec, 1); 141 | cblas_dcopy(dim, x, 1, ret_vec, 1); 142 | cblas_daxpy(dim, -1.0, tmp_vec, 1, ret_vec, 1); 143 | 144 | free(tmp_vec); 145 | } 146 | -------------------------------------------------------------------------------- /integral_eq/gauss_integral.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Gaussian integration formulas */ 4 | /* with LAPACK/BLAS */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef USE_IMKL 12 | #include "mkl.h" // for Intel Math Kernel Library 13 | #include "mkl_cblas.h" 14 | #include "mkl_lapacke.h" 15 | #else // USE_IMKL 16 | #include "cblas.h" 17 | #include "lapacke.h" 18 | #endif // USE_IMKL 19 | 20 | #include "gauss_integral.h" 21 | 22 | /* trimat:= [vec[1][0] vec[0][0 ] 0 ... 0 ] */ 23 | /* [vec[2][0] vec[1][1 ] vec[0][1 ] 0 ... 0 ] */ 24 | /* [ .............................. ] */ 25 | /* [0 ... 0 vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */ 26 | /* [0 ... 0 0 vec[2][n-2] vec[1][n-1] ] */ 27 | /* */ 28 | int init_dmatrix_tri(double *trimat[3], int dim) 29 | { 30 | if((trimat[0] = (double *)calloc(sizeof(double), dim - 1)) == NULL) 31 | return -1; 32 | 33 | if((trimat[1] = (double *)calloc(sizeof(double), dim)) == NULL) 34 | { 35 | free(trimat[0]); 36 | return -2; 37 | } 38 | if((trimat[2] = (double *)calloc(sizeof(double), dim - 1)) == NULL) 39 | { 40 | free(trimat[0]); 41 | free(trimat[1]); 42 | return -3; 43 | } 44 | 45 | return 0; 46 | } 47 | 48 | void free_dmatrix_tri(double *trimat[3]) 49 | { 50 | free(trimat[0]); 51 | free(trimat[1]); 52 | free(trimat[2]); 53 | } 54 | 55 | void print_dmatrix_tri(double *trimat[3], int dim) 56 | { 57 | long int i; 58 | 59 | printf("%3d: %25.17e %25.17e\n", 0, trimat[1][0], trimat[0][0]); 60 | for(i = 1; i < dim - 1; i++) 61 | printf("%3ld:%25.17e %25.17e %25.17e\n", i, trimat[2][i - 1], trimat[1][i], trimat[0][i]); 62 | printf("%3ld:%25.17e %25.17e\n", i, trimat[2][dim - 2], trimat[1][dim - 1]); 63 | } 64 | 65 | /* coef[2] x^2 + coef[1] x + coef[0] = 0 */ 66 | /* Output: ans_re[0] + sqrt(-1)*ans_im[0], ans_re[1] + sqrt(-1) * ans_im[1] */ 67 | int dquadratic_eq(double ans_re[2], double ans_im[2], double coef[3]) 68 | { 69 | double d, den, tmp; 70 | 71 | // printf("coef: %10.3e x^2 + %10.3e x + %10.3e = 0\n", coef[2], coef[1], coef[0]); 72 | 73 | ans_re[0] = 0.0; 74 | ans_re[1] = 0.0; 75 | ans_im[0] = 0.0; 76 | ans_im[1] = 0.0; 77 | 78 | if(coef[2] == 0.0) 79 | { 80 | if(coef[1] == 0.0) 81 | { 82 | fprintf(stderr, "ERROR: No answers (dquadraric_eq)\n"); 83 | return -1; 84 | } 85 | ans_re[0] = -coef[0] / coef[1]; 86 | return 1; // number of answers 87 | } 88 | 89 | den = 2.0 * coef[2]; 90 | d = coef[1] * coef[1] - 4.0 * coef[2] * coef[0]; 91 | ans_re[0] = -coef[1] / den; 92 | 93 | /* anss are real numbers */ 94 | if(d == 0.0) 95 | { 96 | ans_re[1] = ans_re[0]; 97 | } 98 | else if(d > 0) 99 | { 100 | tmp = sqrt(d); 101 | if(coef[1] > 0.0) 102 | tmp = -tmp / den; 103 | else 104 | tmp = tmp / den; 105 | ans_re[0] -= tmp; 106 | ans_re[1] = coef[0] / (coef[2] * ans_re[0]); 107 | } 108 | /* complex numbers */ 109 | else 110 | { 111 | tmp = sqrt(-d) / den; 112 | ans_re[1] = ans_re[0]; 113 | ans_im[0] = tmp; 114 | ans_im[1] = -tmp; 115 | } 116 | return 2; 117 | } 118 | 119 | 120 | /* Eigen polynomial of Unsymmetric Real Tridiagonal Matrix */ 121 | /* */ 122 | /* trimat:= [vec[1][0] vec[0][0 ] 0 ... 0 ] */ 123 | /* [vec[2][0] vec[1][1 ] vec[0][1 ] 0 ... 0 ] */ 124 | /* [ .............................. ] */ 125 | /* [0 ... 0 vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */ 126 | /* [0 ... 0 0 vec[2][n-2] vec[1][n-1] ] */ 127 | /* */ 128 | void get_dtrimat_legendre(double *trimat[3], double *diagmat, int dim) 129 | { 130 | long int i, j, k; 131 | double tmp, a1, a2, a3, a4, a, b, c; 132 | double di, dip1; 133 | 134 | /* Frank Matrix */ 135 | for(i = 0; i < dim; i++) 136 | { 137 | // Legendre 138 | a1 = (double)(i + 1); 139 | a2 = 0.0; 140 | a3 = (double)(2 * i + 1); 141 | a4 = (double)i; 142 | 143 | a = a3 / a1; 144 | b = a2 / a1; 145 | c = a4 / a1; 146 | for(j = 0; j < dim; j++) 147 | { 148 | if(j == (i - 1)) 149 | { 150 | tmp = c / a; 151 | trimat[2][j] = tmp; 152 | } 153 | else if(j == i) 154 | { 155 | tmp = -b / a; 156 | trimat[1][j] = tmp; 157 | } 158 | else if(j == (i + 1)) 159 | { 160 | tmp = 1.0 / a; 161 | trimat[0][j - 1] = tmp; 162 | } 163 | } 164 | } 165 | 166 | /* to be Symmetric */ 167 | di = 1.0; 168 | diagmat[0] = di; 169 | for(i = 0; i < dim - 1; i++) 170 | { 171 | dip1 = sqrt(trimat[0][i] / trimat[2][i]) * di; 172 | trimat[0][i] = di / dip1 * trimat[0][i]; 173 | trimat[2][i] = dip1 / di * trimat[2][i]; 174 | di = dip1; 175 | diagmat[i + 1] = dip1; 176 | } 177 | } 178 | 179 | 180 | /* Eigen polynomial of Unsymmetric Real Tridiagonal Matrix */ 181 | /* */ 182 | /* trimat:= [vec[1][0] vec[0][0 ] 0 ... 0 ] */ 183 | /* [vec[2][0] vec[1][1 ] vec[0][1 ] 0 ... 0 ] */ 184 | /* [ .............................. ] */ 185 | /* [0 ... 0 vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */ 186 | /* [0 ... 0 0 vec[2][n-2] vec[1][n-1] ] */ 187 | /* */ 188 | void get_dtrimat_leguerre(double *trimat[3], double *diagmat, int dim) 189 | { 190 | long int i, j, k; 191 | double tmp, a1, a2, a3, a4, a, b, c; 192 | double di, dip1; 193 | 194 | /* Frank Matrix */ 195 | for(i = 0; i < dim; i++) 196 | { 197 | // Leguerre: alpha = 0.0 198 | a1 = (double)(i + 1); 199 | a2 = (double)(2 * i + 1); 200 | a3 = (double)(-1); 201 | a4 = (double)i; 202 | 203 | a = a3 / a1; 204 | b = a2 / a1; 205 | c = a4 / a1; 206 | for(j = 0; j < dim; j++) 207 | { 208 | if(j == (i - 1)) 209 | { 210 | trimat[2][j] = c / a; 211 | } 212 | else if(j == i) 213 | { 214 | trimat[1][j] = -b / a; 215 | } 216 | else if(j == (i + 1)) 217 | { 218 | trimat[0][j - 1] = 1.0 / a; 219 | } 220 | } 221 | } 222 | 223 | /* to be Symmetric */ 224 | di = 1.0; 225 | diagmat[0] = di; 226 | 227 | for(i = 0; i < dim - 1; i++) 228 | { 229 | dip1 = sqrt(trimat[0][i] / trimat[2][i]) * di; 230 | trimat[0][i] = di / dip1 * trimat[0][i]; 231 | 232 | trimat[2][i] = dip1 / di * trimat[2][i]; 233 | di = dip1; 234 | diagmat[i + 1] = dip1; 235 | } 236 | 237 | } 238 | 239 | /* Eigen polynomial of Unsymmetric Real Tridiagonal Matrix */ 240 | /* */ 241 | /* trimat:= [vec[1][0] vec[0][0 ] 0 ... 0 ] */ 242 | /* [vec[2][0] vec[1][1 ] vec[0][1 ] 0 ... 0 ] */ 243 | /* [ .............................. ] */ 244 | /* [0 ... 0 vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */ 245 | /* [0 ... 0 0 vec[2][n-2] vec[1][n-1] ] */ 246 | /* */ 247 | void get_dtrimat_hermite(double *trimat[3], double *diagmat, int dim) 248 | { 249 | long int i, j, k; 250 | double tmp, a1, a2, a3, a4, a, b, c, next_a, next_c; 251 | double di, dip1; 252 | 253 | /* Frank Matrix */ 254 | for(i = 0; i < dim; i++) 255 | { 256 | // Hermite 257 | a1 = (double)(1); 258 | a2 = (double)(0); 259 | a3 = (double)(2); 260 | a4 = (double)(2 * i); 261 | 262 | a = a3 / a1; 263 | b = a2 / a1; 264 | c = a4 / a1; 265 | for(j = 0; j < dim; j++) 266 | { 267 | if(j == (i - 1)) 268 | { 269 | trimat[2][j] = c / a; 270 | } 271 | else if(j == i) 272 | { 273 | trimat[1][j] = -b / a; 274 | } 275 | else if(j == (i + 1)) 276 | { 277 | trimat[0][j - 1] = 1.0 / a; 278 | } 279 | } 280 | } 281 | 282 | /* to be Symmetric */ 283 | for(i = 0; i < dim; i++) 284 | { 285 | // Hermite 286 | a1 = (double)(1); 287 | a2 = (double)(0); 288 | a3 = (double)(2); 289 | a4 = (double)(2 * i); 290 | 291 | a = a3 / a1; 292 | b = a2 / a1; 293 | c = a4 / a1; 294 | next_a = a; // ONLY for Hermite!! 295 | next_c = (double)(2 * (i + 1)) / a1; // ONLY for Hermite!! 296 | for(j = 0; j < dim; j++) 297 | { 298 | if(j == (i - 1)) 299 | { 300 | trimat[2][j] = trimat[0][j]; 301 | } 302 | else if(j == i) 303 | { 304 | trimat[1][j] = -b / a; 305 | } 306 | else if(j == (i + 1)) 307 | { 308 | trimat[0][j - 1] = sqrt(next_c / (a * next_a)); 309 | } 310 | } 311 | 312 | } 313 | 314 | } 315 | 316 | // 317 | void gauss_integral_eig_d(double *abscissa, double *weight, int deg, int gauss_int_coef) 318 | { 319 | double *dtrimat[3], *dtrimat_org[3]; 320 | double *dinit_vec, *dweight_vecs; 321 | double dmu0, dweight, daeps, dreps; 322 | int i, j; 323 | 324 | /* Double */ 325 | /* initialize */ 326 | init_dmatrix_tri(dtrimat, deg); 327 | init_dmatrix_tri(dtrimat_org, deg); 328 | dinit_vec = (double *)calloc(sizeof(double), deg); 329 | dweight_vecs = (double *)calloc(sizeof(double), deg * deg); 330 | 331 | /* get problem */ 332 | /* abscissas & weight */ 333 | switch(gauss_int_coef) 334 | { 335 | case GAUSS_LEGUERRE: 336 | get_dtrimat_leguerre(dtrimat, dinit_vec, deg); 337 | get_dtrimat_leguerre(dtrimat_org, dinit_vec, deg); 338 | dmu0 = 1.0; 339 | break; 340 | case GAUSS_HERMITE: 341 | get_dtrimat_hermite(dtrimat, dinit_vec, deg); 342 | get_dtrimat_hermite(dtrimat_org, dinit_vec, deg); 343 | dmu0 = sqrt(M_PI); 344 | break; 345 | default: // gauss_legendre 346 | case GAUSS_LEGENDRE: 347 | get_dtrimat_legendre(dtrimat, dinit_vec, deg); 348 | get_dtrimat_legendre(dtrimat_org, dinit_vec, deg); 349 | dmu0 = 2.0; 350 | break; 351 | } 352 | 353 | // dweight_vecs := I 354 | for(i = 0; i < deg; i++) 355 | { 356 | for(j = 0; j < deg; j++) 357 | dweight_vecs[i * deg + j] = 0.0; 358 | dweight_vecs[i * deg + i] = 1.0; 359 | } 360 | 361 | // eigenvalues and eigenvectors of symmetric tridiagonal matrix 362 | LAPACKE_dsteqr(LAPACK_ROW_MAJOR, 'V', deg, dtrimat[1], dtrimat[2], dweight_vecs, deg); 363 | 364 | /* printf("Eigenvalues: \n"); 365 | for(i = 0; i < deg; i++) 366 | { 367 | printf("%3d: %10g\n", i, dtrimat[1][i]); 368 | for(j = 0; j < deg; j++) 369 | printf("\t %10g\n", dweight_vecs[i * deg]); 370 | } 371 | */ 372 | // print_dmatrix_tri(dtrimat); 373 | // print_dvector(dinit_vec); 374 | 375 | /* abscissa */ 376 | cblas_dcopy(deg, dtrimat[1], 1, abscissa, 1); 377 | 378 | for(i = 0; i < deg; i++) 379 | { 380 | dweight = dweight_vecs[i]; 381 | dweight = dweight * dweight * dmu0; 382 | //printf("%5d, %25.17e, %25.17e\n", i, dtrimat[1][i], dweight); 383 | 384 | /* weight */ 385 | weight[i] = dweight; 386 | } 387 | 388 | /* free */ 389 | free_dmatrix_tri(dtrimat); 390 | free_dmatrix_tri(dtrimat_org); 391 | free(dinit_vec); 392 | free(dweight_vecs); 393 | } 394 | 395 | //shifted_gauss_legendre(dvec_abscissa, dvec_weight, dim); 396 | void dshifted_gauss_legendre(double min_val, double max_val, double *abscissa, double *weight, int deg) 397 | { 398 | int i, div; 399 | double x, trans1, trans2; 400 | 401 | div = deg; 402 | 403 | /* transform of variables */ 404 | trans1 = (max_val - min_val) / 2.0; 405 | trans2 = (max_val + min_val) / 2.0; 406 | 407 | /* Main loop */ 408 | /* sum^N w_i * func(t) */ 409 | for(i = 0; i < div; i++) 410 | { 411 | /* x = (b - a) / 2 * t + (b + a) / 2 */ 412 | abscissa[i] = trans1 * abscissa[i] + trans2; 413 | 414 | //printf("%25.17e, %25.17e\n", abscissa[i], weight[i]); 415 | } 416 | } 417 | 418 | /* int[max_val, min_val] func(x) dx */ 419 | double dgauss_legendre_integral(double min_val, double max_val, double (* func)(double), double *abscissa, double *weight, int deg) 420 | { 421 | int i, div; 422 | double x, trans1, trans2, func_val, ret; 423 | 424 | div = deg; 425 | 426 | /* transform of variables */ 427 | trans1 = (max_val - min_val) / 2.0; 428 | trans2 = (max_val + min_val) / 2.0; 429 | 430 | ret = 0.0; 431 | 432 | /* Main loop */ 433 | /* sum^N w_i * func(t) */ 434 | for(i = 0; i < div; i++) 435 | { 436 | /* x = (b - a) / 2 * t + (b + a) / 2 */ 437 | x = trans1 * abscissa[i] + trans2; 438 | ret += func(x) * weight[i]; 439 | } 440 | ret *= trans1; 441 | 442 | return ret; 443 | } 444 | 445 | #ifdef DEBUG 446 | 447 | // usage 448 | void usage(const char *progname) 449 | { 450 | printf("$ %s [kind of Gaussian Integration Scheme] [deg]\n", progname); 451 | printf(" 0: Gauss-Legendre, 1: Gauss-Leguerre, 2: Gauss-Hermite\n"); 452 | } 453 | 454 | // ex1: cos(x) 455 | double ex1_func(double x) 456 | { 457 | return cos(x); 458 | } 459 | 460 | // ex2: x^2 461 | double ex2_func(double x) 462 | { 463 | return x * x; 464 | } 465 | 466 | int main(int argc, char *argv[]) 467 | { 468 | int deg = 3, kind_of_scheme, i; 469 | double *abscissa, *weight; 470 | 471 | if(argc <= 1) 472 | { 473 | usage(argv[0]); 474 | return EXIT_SUCCESS; 475 | } 476 | 477 | kind_of_scheme = atoi(argv[1]); 478 | 479 | if(kind_of_scheme == 1) 480 | { 481 | kind_of_scheme = GAUSS_LEGUERRE; 482 | printf("Gauss-Leguerre "); 483 | } 484 | else if(kind_of_scheme == 2) 485 | { 486 | kind_of_scheme = GAUSS_HERMITE; 487 | printf("Gauss-Hermite "); 488 | } 489 | else 490 | { 491 | kind_of_scheme = GAUSS_LEGENDRE; 492 | printf("Gauss-Legendre "); 493 | } 494 | 495 | if(argc >= 3) 496 | { 497 | deg = atoi(argv[2]); 498 | if(deg <= 1) 499 | deg = 2; 500 | } 501 | 502 | printf(" %d points Formura:\n", deg); 503 | 504 | abscissa = (double *)calloc(deg, sizeof(double)); 505 | weight = (double *)calloc(deg, sizeof(double)); 506 | 507 | gauss_integral_eig_d(abscissa, weight, deg, kind_of_scheme); 508 | 509 | for(i = 0; i < deg; i++) 510 | printf("%5d: %25.17e %25.17e\n", i, abscissa[i], weight[i]); 511 | 512 | if(kind_of_scheme == GAUSS_LEGENDRE) 513 | { 514 | // Check values of constant integrals with the given Gauss-Legedre formula 515 | printf("ex1: int[0, PI/2] cos x dx = 1 approx %25.17e\n", dgauss_legendre_integral(0.0, M_PI / 2, ex1_func, abscissa, weight, deg)); 516 | printf("ex2: int[0, 1 ] x^2 dx = 1/3 approx %25.17e\n", dgauss_legendre_integral(0.0, 1.0, ex2_func, abscissa, weight, deg)); 517 | } 518 | 519 | free(abscissa); 520 | free(weight); 521 | 522 | return EXIT_SUCCESS; 523 | } 524 | #endif // DEBUG -------------------------------------------------------------------------------- /integral_eq/gauss_integral.h: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Gaussian integration formulas */ 4 | /* with LAPACK/BLAS */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | /* types of Gauss qudrature rules */ 8 | #define GAUSS_LEGENDRE 10 9 | #define GAUSS_LEGUERRE 20 10 | #define GAUSS_HERMITE 30 11 | 12 | /* M_PI: PI */ 13 | #ifndef M_PI 14 | #define M_PI 3.1415926535897932384626433832795 15 | #endif // M_PI 16 | -------------------------------------------------------------------------------- /integral_eq/iteration.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Integral equation by using derivative free */ 4 | /* solver */ 5 | /* Last Update: 2016-12-02 (Fri) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | #ifdef USE_IMKL 12 | #include "mkl.h" // for Intel Math Kernel Library 13 | #include "mkl_cblas.h" 14 | #include "mkl_lapacke.h" 15 | #else // USE_IMKL 16 | #include "cblas.h" 17 | #include "lapacke.h" 18 | #endif // USE_IMKL 19 | 20 | #ifdef _OPENMP 21 | #include "omp.h" 22 | #endif // _OPENMP 23 | 24 | #include "tkaux.h" 25 | #include "get_secv.h" 26 | #include "gauss_integral.h" 27 | 28 | // Definition of the integral equation to be solved 29 | #include "ex1.c" // Example 1 30 | //#include "prob1.c" // Exersise 8.1 (1) 31 | //#include "prob2.c" // Exersise 8.1 (2) 32 | 33 | // difference: [u, v; F] 34 | // vfunc(ret_vec, vec): ret_vec = vfunc(vec) 35 | void difference_dmat_parallel(double *ret_mat, double *u, double *v, double (* vfunc_index)(int, double *, int), int dim) 36 | { 37 | int i, j, k, thread_index, num_threads; 38 | double *arg_vec[256]; 39 | double den[128], num[128]; // Max. #thread = 128 40 | 41 | #ifdef _OPENMP 42 | num_threads = omp_get_max_threads(); 43 | #else // _OPENMP 44 | num_threads = 1; 45 | #endif // _OPENMP 46 | 47 | // initialize 48 | for(i = 0; i < num_threads * 2; i++) 49 | arg_vec[i] = (double *)calloc(dim, sizeof(double)); 50 | 51 | // main loop 52 | for(i = 0; i < dim; i++) 53 | { 54 | #ifdef _OPENMP 55 | #pragma omp parallel for private(k, thread_index) 56 | #endif // _OPENMP 57 | for(j = 0; j < dim; j++) 58 | { 59 | #ifdef _OPENMP 60 | thread_index = omp_get_thread_num(); 61 | #else // _OPENMP 62 | thread_index = 0; 63 | #endif // _OPENMP 64 | 65 | // arg_vec[0] := [.........., u_j, v_j+1, ..., v_dim-1] 66 | for(k = 0; k <= j; k++) 67 | arg_vec[thread_index * 2][k] = u[k]; 68 | for(k = j + 1; k < dim; k++) 69 | arg_vec[thread_index * 2][k] = v[k]; 70 | 71 | // arg_vec[1] := [..., u_j-1, v_j, .........., v_dim-1] 72 | for(k = 0; k < j; k++) 73 | arg_vec[thread_index * 2 + 1][k] = u[k]; 74 | for(k = j; k < dim; k++) 75 | arg_vec[thread_index * 2 + 1][k] = v[k]; 76 | 77 | // (F_i(arg_vec[0]) - F_i(arg_vec[1])) / (u_j - v_j) 78 | num[thread_index] = vfunc_index(i, arg_vec[thread_index * 2], dim) - vfunc_index(i, arg_vec[thread_index * 2 + 1], dim); 79 | den[thread_index] = u[j] - v[j]; 80 | 81 | if(den[thread_index] == 0.0) 82 | { 83 | fprintf(stderr, "WARNING: dmat_difference (%d, %d) is divided by zero!\n", i, j); 84 | } 85 | else 86 | num[thread_index] /= den[thread_index]; 87 | 88 | ret_mat[i * dim + j] = num[thread_index]; 89 | } 90 | } 91 | 92 | // free 93 | for(i = 0; i < num_threads * 2; i++) 94 | free(arg_vec[i]); 95 | } 96 | 97 | 98 | // difference: [u, v; F] 99 | // vfunc(ret_vec, vec): ret_vec = vfunc(vec) 100 | void difference_dmat_serial(double *ret_mat, double *u, double *v, double (* vfunc_index)(int, double *, int), int dim) 101 | { 102 | int i, j, k; 103 | double *arg_vec[2]; 104 | double den, num; 105 | 106 | // initialize 107 | arg_vec[0] = (double *)calloc(dim, sizeof(double)); 108 | arg_vec[1] = (double *)calloc(dim, sizeof(double)); 109 | 110 | // main loop 111 | for(i = 0; i < dim; i++) 112 | { 113 | for(j = 0; j < dim; j++) 114 | { 115 | 116 | // arg_vec[0] := [.........., u_j, v_j+1, ..., v_dim-1] 117 | for(k = 0; k <= j; k++) 118 | arg_vec[0][k] = u[k]; 119 | for(k = j + 1; k < dim; k++) 120 | arg_vec[0][k] = v[k]; 121 | 122 | // arg_vec[1] := [..., u_j-1, v_j, .........., v_dim-1] 123 | for(k = 0; k < j; k++) 124 | arg_vec[1][k] = u[k]; 125 | for(k = j; k < dim; k++) 126 | arg_vec[1][k] = v[k]; 127 | 128 | // (F_i(arg_vec[0]) - F_i(arg_vec[1])) / (u_j - v_j) 129 | num = vfunc_index(i, arg_vec[0], dim) - vfunc_index(i, arg_vec[1], dim); 130 | den = u[j] - v[j]; 131 | 132 | if(den == 0.0) 133 | { 134 | fprintf(stderr, "WARNING: dmat_difference (%d, %d) is divided by zero!\n", i, j); 135 | } 136 | else 137 | num /= den; 138 | 139 | ret_mat[i * dim + j] = num; 140 | } 141 | } 142 | 143 | // free 144 | free(arg_vec[0]); 145 | free(arg_vec[1]); 146 | } 147 | 148 | // New Secant Method 149 | // work_vec[0] = delta + zeta 150 | void new_secant_1step_dvector(double *next_x, double *xm1, double *x0, double (* vfunc_index)(int, double *, int), void (* vfunc)(double *, double *, int), double *work_vec[4], double *diff_mat, int dim) 151 | { 152 | int i; 153 | int *pivot; 154 | 155 | pivot = (int *)calloc(dim, sizeof(int)); 156 | 157 | // Solve [x[-1], x[0]; F] delta = -F(x[0]) 158 | #ifdef USE_PARALLEL_DIFFMAT 159 | difference_dmat_parallel(diff_mat, xm1, x0, vfunc_index, dim); 160 | #else // USE_PARALLEL_DIFFMAT 161 | difference_dmat_serial(diff_mat, xm1, x0, vfunc_index, dim); 162 | #endif // USE_PARALLEL_DIFFMAT 163 | 164 | //print_dmatrix(diff_mat); 165 | 166 | vfunc(work_vec[0], x0, dim); 167 | cblas_dscal(dim, -1.0, work_vec[0], 1); 168 | 169 | LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, diff_mat, dim, pivot); 170 | 171 | LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, diff_mat, dim, pivot, work_vec[0], 1); 172 | 173 | // y = x[0] + delta 174 | cblas_dcopy(dim, work_vec[0], 1, work_vec[2], 1); 175 | cblas_daxpy(dim, 1.0, x0, 1, work_vec[2], 1); 176 | 177 | // Solve [x[-1], x[0]; F] zeta = -F(y) 178 | vfunc(work_vec[3], work_vec[2], dim); 179 | cblas_dscal(dim, -1.0, work_vec[3], 1); 180 | 181 | LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, diff_mat, dim, pivot, work_vec[3], 1); 182 | 183 | // x[1] := x[0] + delta + zeta 184 | cblas_daxpy(dim, 1.0, work_vec[3], 1, work_vec[0], 1); 185 | cblas_daxpy(dim, 1.0, work_vec[2], 1, work_vec[3], 1); 186 | 187 | cblas_dcopy(dim, work_vec[3], 1, next_x, 1); 188 | 189 | free(pivot); 190 | } 191 | 192 | // Secant Method 193 | // work_vec[0] = delta 194 | void secant_1step_dvector(double *next_x, double *xm1, double *x0, double (* vfunc_index)(int, double *, int), void (* vfunc)(double *, double *, int), double *work_vec[4], double *diff_mat, int dim) 195 | { 196 | int i; 197 | int *pivot; 198 | 199 | pivot = (int *)calloc(dim, sizeof(int)); 200 | 201 | // Solve [x[-1], x[0]; F] delta = -F(x[0]) 202 | #ifdef USE_PARALLEL_DIFFMAT 203 | difference_dmat_parallel(diff_mat, xm1, x0, vfunc_index, dim); 204 | #else // USE_PARALLEL_DIFFMAT 205 | difference_dmat_serial(diff_mat, xm1, x0, vfunc_index, dim); 206 | #endif // USE_PARALLEL_DIFFMAT 207 | 208 | //printf_dvector("%3d %25.17e\n", diff_mat, dim * dim, 1); 209 | 210 | vfunc(work_vec[0], x0, dim); 211 | cblas_dscal(dim, -1.0, work_vec[0], 1); 212 | 213 | LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, diff_mat, dim, pivot); 214 | 215 | LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, diff_mat, dim, pivot, work_vec[0], 1); 216 | 217 | // x[1] := x[0] + delta 218 | cblas_dcopy(dim, x0, 1, next_x, 1); 219 | cblas_daxpy(dim, 1.0, work_vec[0], 1, next_x, 1); 220 | 221 | free(pivot); 222 | } 223 | 224 | // New secant method 225 | int derivative_free_iteration_dvector(double *ret_vec, double *xm1, double *x0, double (* vfunc_index)(int, double *, int), void (* vfunc)(double *, double *, int), int dim, double rel_tol, double abs_tol, int maxtimes) 226 | { 227 | int iter_times, i; 228 | double *work_vec[4], *old_old_vec, *old_vec, *tmp_vec; 229 | double *work_mat; 230 | double start_time, end_time; 231 | 232 | // initialize 233 | init_derivative_free_iteration_dvector(dim); 234 | 235 | for(i = 0; i < 4; i++) 236 | work_vec[i] = (double *)calloc(dim, sizeof(double)); 237 | work_mat = (double *)calloc(dim * dim, sizeof(double)); 238 | 239 | old_old_vec = (double *)calloc(dim, sizeof(double)); 240 | cblas_dcopy(dim, xm1, 1, old_old_vec, 1); 241 | 242 | old_vec = (double *)calloc(dim, sizeof(double)); 243 | cblas_dcopy(dim, x0, 1, old_vec, 1); 244 | 245 | //printf("initial value:\n"); printf_dvector("%5d, %25.17e\n", xm1, dim, 1); printf_dvector("%5d, %25.17e\n", x0, dim, 1); 246 | 247 | // main loop 248 | start_time = get_real_secv(); 249 | for(iter_times = 0; iter_times < maxtimes; iter_times++) 250 | { 251 | 252 | #ifdef USE_NEW_SECANT 253 | // ret_vec := old_vec + delta + zeta 254 | new_secant_1step_dvector(ret_vec, old_old_vec, old_vec, vfunc_index, vfunc, work_vec, work_mat, dim); 255 | #else // USE_NEW_SECANT 256 | // ret_vec := old_vec + delta 257 | secant_1step_dvector(ret_vec, old_old_vec, old_vec, vfunc_index, vfunc, work_vec, work_mat, dim); 258 | #endif // USE_NEW_SECANT 259 | 260 | //printf("%ld:\n", iter_times); printf_dvector("%5d, %25.17e\n", ret_vec, dim, 1); 261 | printf("%5d, %25.17e\n", iter_times, cblas_dnrm2(dim, work_vec[0], 1)); 262 | 263 | // ||tmp_vec|| < rel_tol * ||old_vec|| + abs_tol ? 264 | if(cblas_dnrm2(dim, work_vec[0], 1) < rel_tol * cblas_dnrm2(dim, old_vec, 1) + abs_tol) 265 | break; 266 | 267 | // old_old_vec := old_vec 268 | // old_vec := ret_vec 269 | cblas_dcopy(dim, old_vec, 1, old_old_vec, 1); 270 | cblas_dcopy(dim, ret_vec, 1, old_vec, 1); 271 | } 272 | end_time = get_real_secv() - start_time; 273 | 274 | printf("elapsed time(s) of derivative_free_iteration_dvector: %f\n", end_time); 275 | 276 | if(iter_times >= maxtimes) 277 | fprintf(stderr, "Warning: derivative_free_iteration is not convergent!(%d iter_times)\n", iter_times); 278 | 279 | // free 280 | for(i = 0; i < 4; i++) 281 | free(work_vec[i]); 282 | free(work_mat); 283 | 284 | free(old_old_vec); 285 | free(old_vec); 286 | 287 | free_derivative_free_iteration_dvector(); 288 | 289 | return iter_times; 290 | } 291 | 292 | int main(int argc, char *argv[]) 293 | { 294 | int i, dim, itimes; 295 | double *dvec_u, *dvec_v, *dvec_ans; 296 | double *dmat; 297 | double start_time, end_time; 298 | 299 | // dim = 4; 300 | // dim = 8; 301 | // dim = 16; 302 | // dim = 32; 303 | // dim = 64; 304 | // dim = 512; 305 | // dim = 1024; 306 | 307 | if(argc <= 1) 308 | { 309 | printf("Usage: %s [dimension] \n", argv[0]); 310 | return EXIT_SUCCESS; 311 | } 312 | 313 | dim = atoi(argv[1]); 314 | 315 | if(dim <= 0) 316 | { 317 | fprintf(stderr, "ERROR: dimension( = %d) is illegal!\n", dim); 318 | return EXIT_FAILURE; 319 | } 320 | 321 | dvec_u = (double *)calloc(dim, sizeof(double)); 322 | dvec_v = (double *)calloc(dim, sizeof(double)); 323 | dvec_ans = (double *)calloc(dim, sizeof(double)); 324 | dmat = (double *)calloc(dim * dim, sizeof(double)); 325 | 326 | #ifdef USE_IMKL 327 | char str_mkl_version[1024]; 328 | int max_num_threads; 329 | 330 | MKL_Get_Version_String(str_mkl_version, 1024); 331 | printf("%s\n", str_mkl_version); 332 | 333 | max_num_threads = mkl_get_max_threads(); 334 | printf("Max Number of Threads: %d\n", max_num_threads); 335 | mkl_set_num_threads(max_num_threads); 336 | #endif 337 | 338 | srand(10); 339 | 340 | #ifdef _OPENMP 341 | #pragma omp parallel for 342 | #endif // _OPENMP 343 | for(i = 0; i < dim; i++) 344 | { 345 | dvec_u[i] = (double)rand() / (double)RAND_MAX; 346 | dvec_v[i] = (double)rand() / (double)RAND_MAX; 347 | } 348 | 349 | // difference_dmat(dmat, dvec_u, dvec_v, vf_index, dim); 350 | // difference_dmat(dmat, dvec_u, dvec_v, vf1_index, dim); 351 | 352 | // printf_dvector("%3d %25.17e\n", dmat, dim * dim, 1); 353 | 354 | // secant or new secant method 355 | start_time = get_real_secv(); 356 | itimes = derivative_free_iteration_dvector(dvec_ans, dvec_u, dvec_v, vf_index, vf, dim, 1.0e-10, 1.0e-50, dim * 2); 357 | end_time = get_real_secv() - start_time; 358 | 359 | printf("dvec_ans: \n"); printf_dvector2("%5d, %25.17e, %25.17e\n", dvec_abscissa, dvec_ans, dim, 1); 360 | printf("elapsed time(s): %f\n", end_time); 361 | 362 | free(dvec_u); 363 | free(dvec_v); 364 | free(dvec_ans); 365 | free(dmat); 366 | 367 | return EXIT_SUCCESS; 368 | } 369 | -------------------------------------------------------------------------------- /integral_eq/prob1.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Hammerstein Integral equation */ 4 | /* Last Update: 2016-12-02 (Fri) T.Kouya */ 5 | /*************************************************/ 6 | // 7 | // Problem 8.1: x(s) = exp(s) - s * sin(s) + int[0, 1] K(s, t) (x(t))^2 dt 8 | // where K(s, t) = exp(-2t) * sin(s) 9 | // 10 | // [caution!] original problem is defined in [0, 1], but cannot converge! 11 | // So smaller intervals such as [0, 0.2] are recommended. 12 | // 13 | double get_bij(int i, int j, double *abscissa, double *weight) 14 | { 15 | double ret; 16 | 17 | // abscissa[i] = s_i 18 | // abscissa[j] = t_j 19 | ret = weight[j] * exp(-2.0 * abscissa[j]) * sin(abscissa[i]); 20 | 21 | return ret; 22 | } 23 | 24 | void get_bmatrix(double *B, int row_dim, int col_dim, double *abscissa, double *weight) 25 | { 26 | int i, j; 27 | 28 | #ifdef _OPENMP 29 | #pragma omp parallel for private(j) 30 | #endif // _OPENMP 31 | for(i = 0; i < row_dim; i++) 32 | for(j = 0; j < col_dim; j++) 33 | B[i * col_dim + j] = get_bij(i, j, abscissa, weight); 34 | 35 | } 36 | 37 | double *dmat_b; 38 | double *dvec_abscissa, *dvec_weight, *dvec_exp_s_sin, *dvec_xsqr; 39 | 40 | void init_derivative_free_iteration_dvector(int dim) 41 | { 42 | dvec_abscissa = (double *)calloc(dim, sizeof(double)); 43 | dvec_weight = (double *)calloc(dim, sizeof(double)); 44 | dvec_xsqr = (double *)calloc(dim, sizeof(double)); 45 | dvec_exp_s_sin = (double *)calloc(dim, sizeof(double)); 46 | 47 | gauss_integral_eig_d(dvec_abscissa, dvec_weight, dim, GAUSS_LEGENDRE); 48 | dshifted_gauss_legendre(0.0, 0.2, dvec_abscissa, dvec_weight, dim); // easy ! 49 | //dshifted_gauss_legendre(0.0, 1.0, dvec_abscissa, dvec_weight, dim); // impossible ! 50 | 51 | dmat_b = (double *)calloc(dim * dim, sizeof(double)); 52 | 53 | get_bmatrix(dmat_b, dim, dim, dvec_abscissa, dvec_weight); 54 | 55 | } 56 | 57 | void free_derivative_free_iteration_dvector(void) 58 | { 59 | free(dvec_abscissa); 60 | free(dvec_weight); 61 | free(dvec_xsqr); 62 | free(dvec_exp_s_sin); 63 | 64 | free(dmat_b); 65 | } 66 | 67 | // ret = x^2 68 | void sqr_dvector(double *ret, double *x, int dim) 69 | { 70 | int i; 71 | 72 | #ifdef _OPENMP 73 | #pragma omp parallel for 74 | #endif // _OPENMP 75 | for(i = 0; i < dim; i++) 76 | ret[i] = x[i] * x[i]; 77 | } 78 | 79 | // ret = exp(s) - s * sin(s) 80 | void exp_s_sin_dvector(double *ret, double *s, int dim) 81 | { 82 | int i; 83 | 84 | #ifdef _OPENMP 85 | #pragma omp parallel for 86 | #endif // _OPENMP 87 | for(i = 0; i < dim; i++) 88 | ret[i] = exp(s[i]) - s[i] * sin(s[i]); 89 | } 90 | 91 | 92 | // double vfunc_index(int, DVector) 93 | double vf_index(int index, double *x, int dim) 94 | { 95 | double ret, tmp; 96 | int j; 97 | 98 | // B * x_sqr 99 | tmp = 0.0; 100 | #ifdef _OPENMP 101 | #pragma omp parallel for reduction(+:tmp) 102 | #endif // _OPENMP 103 | for(j = 0; j < dim; j++) 104 | tmp += dmat_b[index * dim + j] * (x[j] * x[j]); 105 | 106 | ret = x[index] - exp(dvec_abscissa[index]) + dvec_abscissa[index] * sin(dvec_abscissa[index]) - tmp; 107 | 108 | return ret; 109 | } 110 | 111 | 112 | // void vfunc(DVector, DVector) 113 | void vf(double *ret_vec, double *x, int dim) 114 | { 115 | double ret; 116 | int i; 117 | 118 | // ret_vec := exp(s) - s * sin(s) 119 | exp_s_sin_dvector(ret_vec, dvec_abscissa, dim); 120 | 121 | // ret_vec := exp(s) - s * sin(s) + B * x_sqr 122 | sqr_dvector(dvec_xsqr, x, dim); 123 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, dmat_b, dim, dvec_xsqr, 1, 1.0, ret_vec, 1); 124 | 125 | // ret_vec := -ret_vec 126 | cblas_dscal(dim, -1.0, ret_vec, 1); 127 | 128 | // ret_vec := x + ret_vec 129 | cblas_daxpy(dim, 1.0, x, 1, ret_vec, 1); 130 | } 131 | -------------------------------------------------------------------------------- /integral_eq/prob2.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Hammerstein Integral equation */ 4 | /* Last Update: 2016-12-02 (Fri) T.Kouya */ 5 | /*************************************************/ 6 | // 7 | // Problem 8.2: x(s) = s^2+ sin(s) * int[-1, 1] K(s, t) (x(t))^2 dt 8 | // where K(s, t) = exp(-2t) 9 | // 10 | double get_bij(int i, int j, double *abscissa, double *weight) 11 | { 12 | double ret; 13 | 14 | // abscissa[i] = s_i 15 | // abscissa[j] = t_j 16 | ret = weight[j] * exp(-2.0 * abscissa[j]) * sin(abscissa[i]); 17 | 18 | return ret; 19 | } 20 | 21 | void get_bmatrix(double *B, int row_dim, int col_dim, double *abscissa, double *weight) 22 | { 23 | int i, j; 24 | 25 | #ifdef _OPENMP 26 | #pragma omp parallel for private(j) 27 | #endif // _OPENMP 28 | for(i = 0; i < row_dim; i++) 29 | for(j = 0; j < col_dim; j++) 30 | B[i * col_dim + j] = get_bij(i, j, abscissa, weight); 31 | 32 | } 33 | 34 | double *dmat_b; 35 | double *dvec_abscissa, *dvec_weight, *dvec_ssqr, *dvec_xsqr; 36 | 37 | void init_derivative_free_iteration_dvector(int dim) 38 | { 39 | dvec_abscissa = (double *)calloc(dim, sizeof(double)); 40 | dvec_weight = (double *)calloc(dim, sizeof(double)); 41 | dvec_xsqr = (double *)calloc(dim, sizeof(double)); 42 | dvec_ssqr = (double *)calloc(dim, sizeof(double)); 43 | 44 | gauss_integral_eig_d(dvec_abscissa, dvec_weight, dim, GAUSS_LEGENDRE); 45 | dshifted_gauss_legendre(-1.0, 1.0, dvec_abscissa, dvec_weight, dim); 46 | 47 | dmat_b = (double *)calloc(dim * dim, sizeof(double)); 48 | 49 | get_bmatrix(dmat_b, dim, dim, dvec_abscissa, dvec_weight); 50 | 51 | } 52 | 53 | void free_derivative_free_iteration_dvector(void) 54 | { 55 | free(dvec_abscissa); 56 | free(dvec_weight); 57 | free(dvec_xsqr); 58 | free(dvec_ssqr); 59 | 60 | free(dmat_b); 61 | } 62 | 63 | // ret = x^2 64 | void sqr_dvector(double *ret, double *x, int dim) 65 | { 66 | int i; 67 | 68 | #ifdef _OPENMP 69 | #pragma omp parallel for 70 | #endif // _OPENMP 71 | for(i = 0; i < dim; i++) 72 | ret[i] = x[i] * x[i]; 73 | } 74 | 75 | // ret = exp(s) - s * sin(s) 76 | void ssqr_dvector(double *ret, double *s, int dim) 77 | { 78 | int i; 79 | 80 | #ifdef _OPENMP 81 | #pragma omp parallel for 82 | #endif // _OPENMP 83 | for(i = 0; i < dim; i++) 84 | ret[i] = s[i] * s[i]; 85 | } 86 | 87 | 88 | // double vfunc_index(int, DVector) 89 | double vf_index(int index, double *x, int dim) 90 | { 91 | double ret, tmp; 92 | int j; 93 | 94 | // B * x_sqr 95 | tmp = 0.0; 96 | #ifdef _OPENMP 97 | #pragma omp parallel for reduction(+:tmp) 98 | #endif // _OPENMP 99 | for(j = 0; j < dim; j++) 100 | tmp += dmat_b[index * dim + j] * (x[j] * x[j]); 101 | 102 | ret = x[index] - dvec_abscissa[index] * dvec_abscissa[index] - tmp; 103 | 104 | return ret; 105 | } 106 | 107 | 108 | // void vfunc(DVector, DVector) 109 | void vf(double *ret_vec, double *x, int dim) 110 | { 111 | double ret; 112 | int i; 113 | double *tmp_vec; 114 | 115 | tmp_vec = (double *)calloc(dim, sizeof(double)); 116 | 117 | // ret_vec := B * x_sqr 118 | sqr_dvector(dvec_xsqr, x, dim); 119 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, dmat_b, dim, dvec_xsqr, 1, 0.0, ret_vec, 1); 120 | 121 | // ret_vec := s * s + B * x_sqr 122 | ssqr_dvector(tmp_vec, dvec_abscissa, dim); 123 | cblas_daxpy(dim, 1.0, tmp_vec, 1, ret_vec, 1); 124 | 125 | // ret_vec := -ret_vec 126 | cblas_dscal(dim, -1.0, ret_vec, 1); 127 | 128 | // ret_vec := x + ret_vec 129 | cblas_daxpy(dim, 1.0, x, 1, ret_vec, 1); 130 | 131 | free(tmp_vec); 132 | } 133 | -------------------------------------------------------------------------------- /invpower_eig.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/invpower_eig.c -------------------------------------------------------------------------------- /jacobi_iteration.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Jacobi Iterative Refinement */ 4 | /* Last Update: 2015-02-20 (Fri) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | #include "cblas.h" 10 | 11 | // relative difference 12 | double reldiff_dvector(double *vec1, double *vec2, int dim) 13 | { 14 | double *tmp_vec; 15 | double ret, norm; 16 | 17 | tmp_vec = (double *)calloc(dim, sizeof(double)); 18 | 19 | cblas_dcopy(dim, vec1, 1, tmp_vec, 1); 20 | cblas_daxpy(dim, -1.0, vec2, 1, tmp_vec, 1); 21 | ret = cblas_dnrm2(dim, tmp_vec, 1); 22 | norm = cblas_dnrm2(dim, vec1, 1); 23 | 24 | if(norm != 0.0) 25 | ret /= norm; 26 | 27 | return ret; 28 | } 29 | 30 | int main() 31 | { 32 | int i, j, dim, itimes; 33 | double *ma, *md, *vy, *vx, *vx_true, *vx_new, *vb; 34 | double reps, aeps; 35 | 36 | // input dimension 37 | printf("Dim = "); scanf("%d", &dim); 38 | 39 | if(dim <= 0) 40 | { 41 | printf("Illegal dimension! (dim = %d)\n", dim); 42 | return EXIT_FAILURE; 43 | } 44 | 45 | // Initialize 46 | ma = (double *)calloc(dim * dim, sizeof(double)); 47 | md = (double *)calloc(dim, sizeof(double)); 48 | vy = (double *)calloc(dim, sizeof(double)); 49 | vx = (double *)calloc(dim, sizeof(double)); 50 | vx_new = (double *)calloc(dim, sizeof(double)); 51 | vx_true = (double *)calloc(dim, sizeof(double)); 52 | vb = (double *)calloc(dim, sizeof(double)); 53 | 54 | // input ma and vx_true 55 | for(i = 0; i < dim; i++) 56 | { 57 | for(j = 0; j < dim; j++) 58 | ma[i * dim + j] = sqrt(2.0) * (double)(i + j + 1); 59 | ma[i * dim + i] = sqrt(2.0) * dim * dim; 60 | vx_true[i] = sqrt(2.0) * (double)(i + 1); 61 | } 62 | 63 | // vb := 1.0 * ma * vx_true 64 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, ma, dim, vx_true, 1, 0.0, vb, 1); 65 | 66 | reps = 1.0e-10; 67 | aeps = 0.0; 68 | 69 | // md := D^{-1} = diag[1/a11, 1/a22, ..., 1/ann] 70 | for(i = 0; i < dim; i++) 71 | md[i] = 1.0 / ma[i * dim + i]; 72 | 73 | // vx := 0 74 | for(i = 0; i < dim; i++) 75 | vx[i] = 0.0; 76 | 77 | // Jacobi Iteration 78 | for(itimes = 0; itimes < dim * 10; itimes++) 79 | { 80 | // vy := vb 81 | cblas_dcopy(dim, vb, 1, vy, 1); 82 | 83 | // vx_new := vx 84 | cblas_dcopy(dim, vx, 1, vx_new, 1); 85 | 86 | // y := b - A * x 87 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, -1.0, ma, dim, vx, 1, 1.0, vy, 1); 88 | //for(i = 0; i < dim; i++) 89 | // printf("%3d %15.7e %15.7e\n", i, vx[i], vy[i]); 90 | //printf("\n"); 91 | 92 | // x_new := x + D^{-1} * y 93 | cblas_dsbmv(CblasRowMajor, CblasUpper, dim, 0, 1.0, md, 1, vy, 1, 1.0, vx_new, 1); 94 | //for(i = 0; i < dim; i++) 95 | // vx_new[i] = vx[i] + md[i] * vy[i]; 96 | 97 | //for(i = 0; i < dim; i++) 98 | // printf("%3d %15.7e %15.7e: %15.7e\n", i, vy[i], vx_new[i], md[i]); 99 | //printf("\n"); 100 | 101 | // || x_new - x || < reps || x_new || + aeps 102 | cblas_daxpy(dim, -1.0, vx_new, 1, vx, 1); 103 | if(cblas_dnrm2(dim, vx, 1) <= reps * cblas_dnrm2(dim, vx_new, 1) + aeps) 104 | { 105 | // vx := vx_new 106 | cblas_dcopy(dim, vx_new, 1, vx, 1); 107 | break; 108 | } 109 | 110 | printf("%3d %10.3e\n", itimes, cblas_dnrm2(dim, vx, 1)); 111 | 112 | // vx := vx_new 113 | cblas_dcopy(dim, vx_new, 1, vx, 1); 114 | } 115 | 116 | // print 117 | printf("Iterative Times = %d\n", itimes); 118 | printf("Rel.Diff = %10.3e\n", reldiff_dvector(vx, vx_true, dim)); 119 | 120 | for(i = 0; i < dim; i++) 121 | { 122 | printf("%3d %25.17e %25.17e\n", i, vx[i], vx_true[i]); 123 | } 124 | 125 | // free 126 | free(ma); 127 | free(md); 128 | free(vy); 129 | free(vx); 130 | free(vx_true); 131 | free(vx_new); 132 | free(vb); 133 | 134 | 135 | return EXIT_SUCCESS; 136 | } 137 | -------------------------------------------------------------------------------- /jacobi_iteration_csr_mkl.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/jacobi_iteration_csr_mkl.c -------------------------------------------------------------------------------- /jacobi_iteration_mkl.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/jacobi_iteration_mkl.c -------------------------------------------------------------------------------- /jacobi_iteration_spblas.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/jacobi_iteration_spblas.c -------------------------------------------------------------------------------- /lapack_complex_row_column_major.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample probram with lapack complex row-major */ 4 | /* and column major matrices */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | #include "lapacke.h" 12 | #include "cblas.h" 13 | 14 | int main() 15 | { 16 | int i, j, row_dim, col_dim; 17 | 18 | lapack_complex_double *mat_a; 19 | 20 | // input dimension 21 | printf("Row Dim = "); scanf("%d", &row_dim); 22 | printf("Column Dim = "); scanf("%d", &col_dim); 23 | 24 | if((row_dim <= 0) || (col_dim <= 0)) 25 | { 26 | printf("Illegal dimension! (row_dim = %d, col_dim = %d)\n", row_dim, col_dim); 27 | return EXIT_FAILURE; 28 | } 29 | 30 | // initialize matrix area 31 | mat_a = (lapack_complex_double *)calloc(row_dim * col_dim, sizeof(lapack_complex_double)); 32 | 33 | printf("Row Major: %d x %d\n", row_dim, col_dim); 34 | 35 | // Row Major 36 | // mat_a = A 37 | // A = [1+i 2+2i ....... n+ni] 38 | // [n+i n+2i ...... 2n+ni] 39 | // [.....................] 40 | // [(m-1)n+1+i ..... mn+ni] 41 | for(i = 0; i < row_dim; i++) 42 | { 43 | for(j = 0; j < col_dim; j++) 44 | mat_a[i * col_dim + j] = lapack_make_complex_double((double)(i * col_dim + j + 1), (double)(j + 1)); 45 | } 46 | 47 | // print (1) 48 | printf("1 dimension: \n"); 49 | printf("["); 50 | for(i = 0; i < row_dim * col_dim; i++) 51 | printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i]), lapack_complex_double_imag(mat_a[i])); 52 | printf("]\n"); 53 | 54 | // print (2) 55 | printf("2 dimension: \n"); 56 | for(i = 0; i < row_dim; i++) 57 | { 58 | printf("["); 59 | for(j = 0; j < col_dim; j++) 60 | printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i * col_dim + j]), lapack_complex_double_imag(mat_a[i * col_dim + j])); 61 | printf("]\n"); 62 | } 63 | 64 | printf("Column Major: %d x %d\n", row_dim, col_dim); 65 | 66 | // Column Major 67 | // mat_a = A 68 | // A = [1 2 ....... n] 69 | // [n+1 n+2 ...... 2n] 70 | // [.................] 71 | // [(m-1)n+1 ..... mn] 72 | for(j = 0; j < col_dim; j++) 73 | { 74 | for(i = 0; i < row_dim; i++) 75 | mat_a[i + row_dim * j] = lapack_make_complex_double((double)(i * col_dim + j + 1), (double)(j + 1)); 76 | } 77 | 78 | // print (1) 79 | printf("1 dimension: \n"); 80 | printf("["); 81 | for(i = 0; i < row_dim * col_dim; i++) 82 | printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i]), lapack_complex_double_imag(mat_a[i])); 83 | printf("]\n"); 84 | 85 | // print (2) 86 | printf("2 dimension: \n"); 87 | for(i = 0; i < row_dim; i++) 88 | { 89 | printf("["); 90 | for(j = 0; j < col_dim; j++) 91 | printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i + row_dim * j]), lapack_complex_double_real(mat_a[i + row_dim * j])); 92 | printf("]\n"); 93 | } 94 | 95 | // free 96 | free(mat_a); 97 | 98 | return EXIT_SUCCESS; 99 | } 100 | -------------------------------------------------------------------------------- /lapack_complex_row_column_major_cpp.cc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/lapack_complex_row_column_major_cpp.cc -------------------------------------------------------------------------------- /lapack_dgecon.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of DEGCON */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #ifdef USE_IMKL 10 | #include "mkl.h" // for Intel Math Kernel Library 11 | #include "mkl_lapack.h" // for dlange 12 | #include "mkl_lapacke.h" // for dgecon 13 | #include "mkl_cblas.h" // for dcopy 14 | #else // USE_IMKL 15 | #include "cblas.h" 16 | #include "lapacke.h" 17 | #endif // USE_IMKL 18 | 19 | int main() 20 | { 21 | int i, j, dim; // dimension of vectors 22 | int info, *pivot; 23 | double *ma, *ma_work; 24 | double norm1, cond1; // ||A||_1 and cond_1(A) 25 | double normi, condi; // ||A||_inf and cond_inf(A) 26 | char str_mkl_version[1024]; 27 | 28 | #ifdef USE_IMKL 29 | // print MKL version 30 | MKL_Get_Version_String(str_mkl_version, 1024); 31 | printf("%s\n", str_mkl_version); 32 | #endif // USE_IMKL 33 | 34 | // input dimension 35 | printf("Dim = "); scanf("%d", &dim); 36 | 37 | if(dim <= 0) 38 | { 39 | printf("Illegal dimension! (dim = %d)\n", dim); 40 | return EXIT_FAILURE; 41 | } 42 | 43 | // Initialize 44 | ma = (double *)calloc(sizeof(double), dim * dim); 45 | ma_work = (double *)calloc(sizeof(double), dim * dim); 46 | pivot = (int *)calloc(dim, sizeof(int)); 47 | 48 | // input va and vb 49 | for(i = 0; i < dim; i++) 50 | { 51 | for(j = 0; j < dim; j++) 52 | { 53 | // A = hirbert matrix 54 | *(ma + i * dim + j) = 1.0 / (double)(i + j + 1); 55 | /* ma[i * dim + j] = (double)rand() / (double)(RAND_MAX); 56 | if(rand() % 2 != 0) 57 | ma[i * dim + j] = -ma[i * dim + j]; 58 | */ 59 | } 60 | } 61 | 62 | // A := A 63 | cblas_dcopy(dim * dim, ma, 1, ma_work, 1); 64 | 65 | // print 66 | /* printf("A = \n"); 67 | for(i = 0; i < dim; i++) 68 | { 69 | printf("%3d: ", i); 70 | for(j = 0; j < dim; j++) 71 | printf("%10f ", *(ma + i * dim + j)); 72 | printf("\n"); 73 | 74 | } 75 | */ 76 | // Get ||A||_1 77 | norm1 = LAPACKE_dlange(LAPACK_ROW_MAJOR, '1', dim, dim, ma_work, dim); 78 | 79 | // LU decomposition 80 | info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, ma_work, dim, pivot); 81 | //printf("DGETRF info = %d\n", info); 82 | 83 | // Compute condition number of A 84 | info = LAPACKE_dgecon(LAPACK_ROW_MAJOR, '1', dim, ma_work, dim, norm1, &cond1); 85 | 86 | // error occurs if info > 0 87 | if(info < 0) 88 | { 89 | printf("The %d-th parameter is illegal!\n", -info); 90 | return EXIT_FAILURE; 91 | } 92 | 93 | // A := A 94 | cblas_dcopy(dim * dim, ma, 1, ma_work, 1); 95 | 96 | // Get ||A||_inf 97 | normi = LAPACKE_dlange(LAPACK_ROW_MAJOR, 'I', dim, dim, ma_work, dim); 98 | 99 | // LU decomposition 100 | info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, ma_work, dim, pivot); 101 | //printf("DGETRF info = %d\n", info); 102 | 103 | // Compute condition number of A 104 | info = LAPACKE_dgecon(LAPACK_ROW_MAJOR, 'I', dim, ma_work, dim, normi, &condi); 105 | 106 | // error occurs if info > 0 107 | if(info < 0) 108 | { 109 | printf("The %d-th parameter is illegal!\n", -info); 110 | return EXIT_FAILURE; 111 | } 112 | 113 | // print norm and condition number of A 114 | printf("||A||_1 = %25.17e, cond_1(A) = %25.17e\n", norm1, 1.0 / cond1); 115 | printf("||A||_inf = %25.17e, cond_inf(A) = %25.17e\n", normi, 1.0 / condi); 116 | 117 | // free 118 | free(ma); 119 | free(ma_work); 120 | 121 | return EXIT_SUCCESS; 122 | } 123 | -------------------------------------------------------------------------------- /lapack_dgeev.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/lapack_dgeev.c -------------------------------------------------------------------------------- /lapack_dgeev_magma.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/lapack_dgeev_magma.c -------------------------------------------------------------------------------- /lapack_dsyev.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of DSYEV */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2025-02-04 (Tue) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include // 2025-02-04(Tue) 10 | #include "cblas.h" 11 | #include "lapacke.h" 12 | 13 | #define MIN(i, j) ((i) < (j) ? (i) : (j)) 14 | #define MAX(i, j) ((i) > (j) ? (i) : (j)) 15 | 16 | int main() 17 | { 18 | int i, j, dim; // dimension of vectors 19 | int info; 20 | double *ma; 21 | double *cmat, *ma_org, *diag, alpha, beta; 22 | int absmax_eig_index, absmin_eig_index; 23 | double *eig, absmax_eig, absmin_eig, abs_eig, ma_norm; 24 | 25 | // input dimension 26 | printf("Dim = "); scanf("%d", &dim); 27 | 28 | if(dim <= 0) 29 | { 30 | printf("Illegal dimension! (dim = %d)\n", dim); 31 | return EXIT_FAILURE; 32 | } 33 | 34 | // Initialize 35 | eig = (double *)calloc(sizeof(double), dim); 36 | ma = (double *)calloc(sizeof(double), dim * dim); 37 | ma_org = (double *)calloc(sizeof(double), dim * dim); 38 | diag = (double *)calloc(sizeof(double), dim * dim); 39 | cmat = (double *)calloc(sizeof(double), dim * dim); 40 | 41 | // input ma, ma_org 42 | for(i = 0; i < dim; i++) 43 | { 44 | for(j = 0; j < dim; j++) 45 | { 46 | // A = Hilbert matrix 47 | //ma[i * dim + j] = 1.0 / (double)(i + j + 1); 48 | // A = Frank matrix 49 | ma[i * dim + j] = (double)(dim - MAX(i, j)); 50 | // A = Random matrix 51 | //ma[i * dim + j] = (double)rand() / (double)RAND_MAX; 52 | 53 | // ma_org = ma 54 | ma_org[i * dim + j] = ma[i * dim + j]; 55 | } 56 | } 57 | 58 | ma_norm = cblas_dnrm2(dim * dim, ma, 1); 59 | 60 | // print 61 | /* printf("A = \n"); 62 | for(i = 0; i < dim; i++) 63 | { 64 | printf("%3d: ", i); 65 | for(j = 0; j < dim; j++) 66 | printf("%10g ", ma[i * dim + j]); 67 | printf("\n"); 68 | 69 | } 70 | */ 71 | // solve A * V = \lambda * V 72 | // 'V' ... get eigenvectors 73 | info = LAPACKE_dsyev(LAPACK_ROW_MAJOR, 'V', 'U', dim, ma, dim, eig); 74 | 75 | // error occurs if info > 0 76 | if(info > 0) 77 | { 78 | printf("QR decomposition failed! (%d) \n", info); 79 | return EXIT_FAILURE; 80 | } 81 | else if(info < 0) 82 | { 83 | printf("%d-th argument of DSYEV is illegal!\n", info); 84 | return EXIT_FAILURE; 85 | } 86 | 87 | 88 | printf("Envenvectors: \n"); 89 | for(i = 0; i < dim; i++) 90 | { 91 | printf("%3d: ", i); 92 | for(j = 0; j < dim; j++) 93 | printf("%5g ", ma[i * dim + j]); 94 | printf("\n"); 95 | 96 | } 97 | 98 | // diag := diag(eig) 99 | absmax_eig_index = absmin_eig_index = 0; 100 | absmax_eig = absmin_eig = fabs(eig[0]); 101 | for(i = 0; i < dim; i++) 102 | { 103 | for(j = 0; j < dim; j++) 104 | diag[i * dim + j] = 0.0; 105 | diag[i * dim + i] = eig[i]; 106 | 107 | abs_eig = fabs(eig[i]); 108 | if(absmax_eig < abs_eig) 109 | { 110 | absmax_eig = abs_eig; 111 | absmax_eig_index = i; 112 | } 113 | if(absmin_eig > abs_eig) 114 | { 115 | absmin_eig = abs_eig; 116 | absmin_eig_index = i; 117 | } 118 | } 119 | 120 | // print 121 | printf("Eigenvalues = \n"); 122 | for(i = 0; i < dim; i++) 123 | { 124 | printf("%3d: ", i); 125 | printf("%10g\n", eig[i]); 126 | } 127 | printf("\n"); 128 | 129 | printf("absmax_eig = %25.17e\n", eig[absmax_eig_index]); 130 | printf("absmin_eig = %25.17e\n", eig[absmin_eig_index]); 131 | printf("cond2 = %25.17e\n", absmax_eig / absmin_eig); 132 | 133 | alpha = 1.0; 134 | beta = 0.0; 135 | 136 | // ev^T * A * ev - lambda * I ? 137 | cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, ma_org, dim, beta, cmat, dim); 138 | cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, cmat, dim, ma, dim, beta, ma_org, dim); 139 | 140 | alpha = -1.0; 141 | cblas_daxpy(dim * dim, alpha, diag, 1, ma_org, 1); 142 | 143 | printf("||ev^T * A * ev - lambda||_F / ||A||_F = %15.7e\n", cblas_dnrm2(dim * dim, ma_org, 1) / ma_norm); 144 | 145 | /* 146 | for(i = 0; i < dim; i++) 147 | { 148 | printf("%3d: ", i); 149 | for(j = 0; j < dim; j++) 150 | printf("%5g ", ma_org[i * dim + j]); 151 | printf("\n"); 152 | } 153 | */ 154 | // free 155 | free(eig); 156 | free(cmat); 157 | free(ma); 158 | free(ma_org); 159 | free(diag); 160 | 161 | return EXIT_SUCCESS; 162 | } 163 | -------------------------------------------------------------------------------- /lapack_gcc.inc: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Configuration file for GNU compiler collection # 4 | # Last Update: 2016-12-02 (Fri) T.Kouya # 5 | #*************************************************# 6 | CC=gcc 7 | FC=gfortran 8 | CPP=g++ 9 | 10 | INC = -I/usr/local/include 11 | LIB = -L/usr/local/lib/gcc -lgfortran -lm 12 | 13 | CBLAS_INC = $(INC) 14 | CBLAS_LIB = $(LIB) -lcblas -lrefblas -lgfortran 15 | 16 | LAPACKE_INC = -I/usr/local/include/lapacke $(CBLAS_INC) 17 | LAPACKE_LIB = -L/usr/local/lib/gcc -llapacke -llapack $(CBLAS_LIB) 18 | 19 | IMKL_INC=-I/opt/intel/mkl/include 20 | #IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -L/opt/intel/lib/intel64 -lifcore 21 | #IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L/opt/intel/lib/intel64 -lifcore -liomp5 22 | 23 | OPENMP = -fopenmp 24 | OPENMP_LIB = -lgomp 25 | -------------------------------------------------------------------------------- /lapack_icc.inc: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Configuration file for Intel C compiler # 4 | # Last Update: 2016-12-02 (Fri) T.Kouya # 5 | #*************************************************# 6 | CC=icc 7 | FC=ifort 8 | CPP=icpc 9 | 10 | INC = -I/usr/local/include 11 | LIB = -L/usr/local/lib -L/usr/lib64 -L/opt/intel/lib/intel64 -lifcore 12 | 13 | CBLAS_INC = $(INC) 14 | CBLAS_LIB = $(LIB) -lcblas -lrefblas -lm 15 | 16 | LAPACKE_INC = -I/usr/local/include/lapacke $(CBLAS_INC) 17 | LAPACKE_LIB = -L/usr/local/lib -llapacke -llapack $(CBLAS_LIB) -L/opt/intel/lib/intel64 -lifcore 18 | 19 | IMKL_INC=-I/opt/intel/include -I/opt/intel/mkl/include 20 | #IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -L/opt/intel/lib/intel64 -lifcore 21 | IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L/opt/intel/lib/intel64 -lifcore -liomp5 22 | 23 | OPENMP = -openmp -openmp-report2 24 | OPENMP_LIB = 25 | -------------------------------------------------------------------------------- /lapack_icx.inc: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Configuration file for Intel C compiler # 4 | # Last Update: 2026-02-04 (Tue) T.Kouya # 5 | #*************************************************# 6 | CC=icx-cc 7 | FC=ifx 8 | CPP=icx 9 | INTEL_ONEAPI=/opt/intel/oneapi 10 | 11 | INC = -I/usr/local/include 12 | LIB = -L/usr/local/lib -L/usr/lib64 -L/opt/intel/lib/intel64 -lifcore 13 | 14 | CBLAS_INC = $(INC) 15 | CBLAS_LIB = $(LIB) -lcblas -lrefblas -lm 16 | 17 | LAPACKE_INC = -I/usr/local/include/lapacke $(CBLAS_INC) 18 | LAPACKE_LIB = -L/usr/local/lib -llapacke -llapack $(CBLAS_LIB) -L$(INTEL_ONEAPI)/compiler/latest/lib -lifcore 19 | 20 | IMKL_INC=-I$(INTEL_ONEAPI)/mkl/latest/include 21 | IMKL_LIB=-L$(INTEL_ONEAPI)/mkl/latest/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L$(INTEL_ONEAPI)/compiler/latest/lib -lifcore -liomp5 22 | 23 | OPENMP = -openmp -qopt-report=max 24 | OPENMP_LIB = 25 | -------------------------------------------------------------------------------- /lapack_lamch.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample program to get properties */ 4 | /* of floating-point type */ 5 | /* with original routines */ 6 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 7 | /*************************************************/ 8 | #include 9 | #include 10 | 11 | #include "lapacke.h" 12 | 13 | int main() 14 | { 15 | float seps, sufth, softh; 16 | double deps, dufth, dofth; 17 | 18 | // float 19 | seps = LAPACK_slamch("E"); 20 | sufth = LAPACK_slamch("U"); 21 | softh = LAPACK_slamch("O"); 22 | printf("eps : Machine Epsilon(float) : %15.7e\n", seps); 23 | printf("ufth: Underflow Threshold(float) : %15.7e\n", sufth); 24 | printf("ofth: Overflow Threshold(float) : %15.7e\n", softh); 25 | 26 | // double 27 | deps = LAPACK_dlamch("E"); 28 | dufth = LAPACK_dlamch("U"); 29 | dofth = LAPACK_dlamch("O"); 30 | printf("Machine Epsilon(double) : %25.17e\n", deps); 31 | printf("Underflow Threshold(double) : %25.17e\n", dufth); 32 | printf("Overflow Threshold(double) : %25.17e\n", dofth); 33 | 34 | return EXIT_SUCCESS; 35 | } 36 | -------------------------------------------------------------------------------- /lapack_ssyev.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Sample Program of SSYEV */ 4 | /* with Intel Math Kernel */ 5 | /* Last Update: 2025-02-04 (Tue) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include // 2025-02-04(Tue) 10 | #include "cblas.h" 11 | #include "lapacke.h" 12 | 13 | #define MIN(i, j) ((i) < (j) ? (i) : (j)) 14 | #define MAX(i, j) ((i) > (j) ? (i) : (j)) 15 | 16 | int main() 17 | { 18 | int i, j, dim; // dimension of vectors 19 | int info; 20 | float *ma; 21 | float *cmat, *ma_org, *diag, alpha, beta; 22 | int absmax_eig_index, absmin_eig_index; 23 | float *eig, absmax_eig, absmin_eig, abs_eig, ma_norm; 24 | 25 | // input dimension 26 | printf("Dim = "); scanf("%d", &dim); 27 | 28 | if(dim <= 0) 29 | { 30 | printf("Illegal dimension! (dim = %d)\n", dim); 31 | return EXIT_FAILURE; 32 | } 33 | 34 | // Initialize 35 | eig = (float *)calloc(sizeof(float), dim); 36 | ma = (float *)calloc(sizeof(float), dim * dim); 37 | ma_org = (float *)calloc(sizeof(float), dim * dim); 38 | diag = (float *)calloc(sizeof(float), dim * dim); 39 | cmat = (float *)calloc(sizeof(float), dim * dim); 40 | 41 | // input ma, ma_org 42 | for(i = 0; i < dim; i++) 43 | { 44 | for(j = 0; j < dim; j++) 45 | { 46 | // A = Hilbert matrix 47 | //ma[i * dim + j] = 1.0 / (float)(i + j + 1); 48 | // A = Frank matrix 49 | ma[i * dim + j] = (float)(dim - MAX(i, j)); 50 | // A = Random matrix 51 | //ma[i * dim + j] = (float)rand() / (float)RAND_MAX; 52 | 53 | // ma_org = ma 54 | ma_org[i * dim + j] = ma[i * dim + j]; 55 | } 56 | } 57 | 58 | ma_norm = cblas_snrm2(dim * dim, ma, 1); 59 | 60 | // print 61 | /* printf("A = \n"); 62 | for(i = 0; i < dim; i++) 63 | { 64 | printf("%3d: ", i); 65 | for(j = 0; j < dim; j++) 66 | printf("%10g ", ma[i * dim + j]); 67 | printf("\n"); 68 | 69 | } 70 | */ 71 | // solve A * V = \lambda * V 72 | // 'V' ... get eigenvectors 73 | info = LAPACKE_ssyev(LAPACK_ROW_MAJOR, 'V', 'U', dim, ma, dim, eig); 74 | 75 | // error occurs if info > 0 76 | if(info > 0) 77 | { 78 | printf("QR decomposition failed! (%d) \n", info); 79 | return EXIT_FAILURE; 80 | } 81 | else if(info < 0) 82 | { 83 | printf("%d-th argument of SSYEV is illegal!\n", info); 84 | return EXIT_FAILURE; 85 | } 86 | 87 | /* 88 | for(i = 0; i < dim; i++) 89 | { 90 | printf("%3d: ", i); 91 | for(j = 0; j < dim; j++) 92 | printf("%5g ", ma[i * dim + j]); 93 | printf("\n"); 94 | 95 | } 96 | */ 97 | // diag := diag(eig) 98 | absmax_eig_index = absmin_eig_index = 0; 99 | absmax_eig = absmin_eig = fabs(eig[0]); 100 | for(i = 0; i < dim; i++) 101 | { 102 | for(j = 0; j < dim; j++) 103 | diag[i * dim + j] = 0.0; 104 | diag[i * dim + i] = eig[i]; 105 | 106 | abs_eig = fabs(eig[i]); 107 | if(absmax_eig < abs_eig) 108 | { 109 | absmax_eig = abs_eig; 110 | absmax_eig_index = i; 111 | } 112 | if(absmin_eig > abs_eig) 113 | { 114 | absmin_eig = abs_eig; 115 | absmin_eig_index = i; 116 | } 117 | } 118 | 119 | // print 120 | /* printf("Eigenvalues = \n"); 121 | for(i = 0; i < dim; i++) 122 | { 123 | printf("%3d: ", i); 124 | printf("%10g\n", eig[i]); 125 | } 126 | printf("\n"); 127 | */ 128 | printf("absmax_eig = %15.7e\n", eig[absmax_eig_index]); 129 | printf("absmin_eig = %15.7e\n", eig[absmin_eig_index]); 130 | printf("cond2 = %15.7e\n", absmax_eig / absmin_eig); 131 | 132 | alpha = 1.0; 133 | beta = 0.0; 134 | 135 | // ev^T * A * ev - lambda * I ? 136 | cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, ma_org, dim, beta, cmat, dim); 137 | cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, cmat, dim, ma, dim, beta, ma_org, dim); 138 | 139 | alpha = -1.0; 140 | cblas_saxpy(dim * dim, alpha, diag, 1, ma_org, 1); 141 | 142 | printf("||ev^T * A * ev - lambda * I||_2 = %15.7e\n", cblas_snrm2(dim * dim, ma_org, 1) / ma_norm); 143 | 144 | /* for(i = 0; i < dim; i++) 145 | { 146 | printf("%3d: ", i); 147 | for(j = 0; j < dim; j++) 148 | printf("%5g ", ma_org[i * dim + j]); 149 | printf("\n"); 150 | } 151 | */ 152 | // free 153 | free(eig); 154 | free(cmat); 155 | free(ma); 156 | free(ma_org); 157 | free(diag); 158 | 159 | return EXIT_SUCCESS; 160 | } 161 | -------------------------------------------------------------------------------- /lapack_win_intel.inc: -------------------------------------------------------------------------------- 1 | #*************************************************# 2 | # LAPACK/BLAS Tutorial # 3 | # Configuration file for Intel C compiler # 4 | # on Windows # 5 | # Last Update: 2016-12-02 (Fri) T.Kouya # 6 | #*************************************************# 7 | CC=icl /TC /Qstd=c99 /D:WIN32 8 | FC=ifort 9 | CPP=icl /TP 10 | 11 | VCL=cl 12 | 13 | INTEL_ROOT = "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows" 14 | INTEL_LIB_ROOT = $(INTEL_ROOT)\compiler\lib\intel64 15 | INTEL_MKL_ROOT = $(INTEL_ROOT)\mkl\lib\intel64 16 | 17 | CBLAS_INC = $(INC) 18 | CBLAS_LIB = /link $(INTEL_MKL_ROOT)\mkl_intel_lp64.lib /link $(INTEL_MKL_ROOT)\mkl_core.lib /link $(INTEL_MKL_ROOT)\mkl_sequential.lib 19 | 20 | IMKL_INC = $(INC) 21 | IMKL_LIB = /link $(INTEL_MKL_ROOT)\mkl_intel_lp64.lib /link $(INTEL_MKL_ROOT)\mkl_core.lib /link $(INTEL_MKL_ROOT)\mkl_sequential.lib 22 | 23 | LAPACKE_INC = $(INC) 24 | LAPACKE_LIB = /link $(INTEL_MKL_ROOT)\mkl_intel_lp64.lib /link $(INTEL_MKL_ROOT)\mkl_core.lib /link $(INTEL_MKL_ROOT)\mkl_sequential.lib 25 | 26 | OPENMP = -openmp -openmp-report2 27 | OPENMP_LIB = 28 | -------------------------------------------------------------------------------- /linear_eq.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/linear_eq.c -------------------------------------------------------------------------------- /linear_eq_dgbsv.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Solver for Linear equation with DSGESV */ 4 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | 10 | #include "lapacke.h" 11 | #include "cblas.h" 12 | 13 | #define IJMAX(i, j) ( ((i) > (j)) ? (i) : (j) ) 14 | #define IJMIN(i, j) (((i) < (j)) ? (i) : (j)) 15 | 16 | int main() 17 | { 18 | lapack_int i, j, k, dim, shift, index, ku, kl; 19 | lapack_int inc_vec_x, inc_vec_b; 20 | lapack_int *pivot, info; 21 | 22 | double *mat_a, *vec_b, *vec_x, *mat_a_full; 23 | double alpha, beta; 24 | double running_time; 25 | 26 | // input dimension of a linear equation to be solved 27 | printf("Dim = "); scanf("%d", &dim); 28 | 29 | if(dim <= 0) 30 | { 31 | printf("Illegal dimension! (dim = %d)\n", dim); 32 | return EXIT_FAILURE; 33 | } 34 | 35 | // initialize a tridiagonal matrix(mat_a) and vectors 36 | kl = 1; 37 | ku = 1; // necessary for pivoting 38 | mat_a = (double *)calloc((kl * 2 + ku + 1) * dim, sizeof(double)); 39 | vec_x = (double *)calloc(dim, sizeof(double)); 40 | vec_b = (double *)calloc(dim, sizeof(double)); 41 | 42 | // dim * dim 43 | mat_a_full = (double *)calloc(dim * dim, sizeof(double)); 44 | 45 | for(i = 0; i < dim; i++) 46 | vec_b[i] = 0.0; 47 | 48 | // input mat_a and vec_x 49 | for(i = 0; i < dim; i++) 50 | { 51 | // mat_a_full := 0 52 | for(j = 0; j < dim; j++) 53 | mat_a_full[i * dim + j] = 0.0; 54 | } 55 | 56 | for(i = 0; i < dim; i++) 57 | { 58 | // upper subdiagonal element 59 | if((i + 1) < dim) 60 | { 61 | j = i + 1; 62 | index = i * dim + j; 63 | mat_a_full[index] = (double)(j + 1); 64 | 65 | if((i + j + 1) % 2 != 0) 66 | mat_a_full[index] = -mat_a_full[index]; 67 | } 68 | 69 | // diagonal element 70 | j = i; 71 | index = i * dim + j; 72 | mat_a_full[index] = 1.0 / (double)(i + j + 1); 73 | if((i + j + 1) % 2 != 0) 74 | mat_a_full[index] = -mat_a_full[index]; 75 | 76 | mat_a_full[index] += 2.0; 77 | 78 | // lower subdiagonal element 79 | if((i - 1) >= 0) 80 | { 81 | j = i - 1; 82 | index = i * dim + j; 83 | mat_a_full[index] = 1.0 / (double)(i + 1); 84 | 85 | if((i + j + 1) % 2 != 0) 86 | mat_a_full[index] = -mat_a_full[index]; 87 | } 88 | 89 | //vec_x[i] = 1.0 / (double)(i + 1); 90 | vec_x[i] = 1.0 ; 91 | } 92 | 93 | // print 94 | for(i = 0; i < dim; i++) 95 | { 96 | for(j = 0; j < dim; j++) 97 | printf("%10.3e ", mat_a_full[i * dim + j]); 98 | printf("\n"); 99 | } 100 | printf("\n"); 101 | 102 | // convert 103 | for(j = 0; j < dim; j++) 104 | { 105 | k = ku - j; 106 | for(i = IJMAX(0, j - ku); i < IJMIN(dim, j + kl + 1); i++) 107 | { 108 | printf("(%d, %d) -> (%d, %d)\n", i, j, k + i, j); 109 | mat_a[(k + i) * dim + j] = mat_a_full[i * dim + j]; 110 | } 111 | } 112 | 113 | // print 114 | for(i = 0; i < (ku + kl + 1); i++) 115 | { 116 | for(j = 0; j < dim; j++) 117 | printf("%10.3e ", mat_a[i * dim + j]); 118 | printf("\n"); 119 | } 120 | printf("\n"); 121 | 122 | // size(vec_x) == size(vec_b) 123 | inc_vec_x = inc_vec_b = 1; 124 | 125 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 126 | alpha = 1.0; 127 | beta = 0.0; 128 | //cblas_dgbmv(CblasRowMajor, CblasTrans, dim, dim, kl, ku, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 129 | cblas_dgbmv(CblasColMajor, CblasNoTrans, dim, dim, kl, ku, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 130 | 131 | // print 132 | for(i = 0; i < dim; i++) 133 | { 134 | printf("["); 135 | for(j = 0; j < dim; j++) 136 | { 137 | if(j == (i + 1)) 138 | printf("%10.3e ", mat_a[shift + j]); 139 | else if(j == i) 140 | printf("%10.3e ", mat_a[shift + dim + j]); 141 | else if(j == (i - 1)) 142 | printf("%10.3e ", mat_a[shift + 2 * dim + j]); 143 | else 144 | printf("%10.3e ", 0.0); 145 | } 146 | 147 | printf("] %10.3f = %10.3f\n", vec_x[i], vec_b[i]); 148 | } 149 | 150 | // initialize pivot 151 | pivot = (lapack_int *)calloc(dim, sizeof(lapack_int)); 152 | 153 | // solve A * X = C -> C := X 154 | info = LAPACKE_dgbsv(LAPACK_COL_MAJOR, dim, kl, ku, 1, mat_a, kl * 2 + ku + 1, pivot, vec_b, dim); 155 | 156 | printf("info = %d\n", info); 157 | 158 | // print 159 | printf("calculated x = \n"); 160 | for(i = 0; i < dim; i++) 161 | { 162 | printf("%3d -> %3d: ", i, pivot[i]); 163 | printf("%25.17e ", vec_b[i]); 164 | printf("\n"); 165 | } 166 | 167 | // diff 168 | printf("x - calculated x = \n"); 169 | for(i = 0; i < dim; i++) 170 | { 171 | printf("%3d: ", i); 172 | printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i])); 173 | printf("\n"); 174 | } 175 | 176 | // free 177 | free(mat_a); 178 | free(vec_x); 179 | free(vec_b); 180 | free(pivot); 181 | free(mat_a_full); 182 | 183 | return EXIT_SUCCESS; 184 | } 185 | -------------------------------------------------------------------------------- /linear_eq_dgetrf.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Solver for Linear equation */ 4 | /* with xGETRF & xGETRS */ 5 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | #include "lapacke.h" 12 | #include "cblas.h" 13 | 14 | int main() 15 | { 16 | lapack_int i, j, dim; 17 | lapack_int inc_vec_x, inc_vec_b; 18 | lapack_int *pivot, info; 19 | 20 | double *mat_a, *vec_b, *vec_x; 21 | double alpha, beta; 22 | double running_time; 23 | 24 | // input dimension of linear equation to be solved 25 | printf("Dim = "); scanf("%d", &dim); 26 | 27 | if(dim <= 0) 28 | { 29 | printf("Illegal dimension! (dim = %d)\n", dim); 30 | return EXIT_FAILURE; 31 | } 32 | 33 | // initialize a matrix and vectors 34 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 35 | vec_x = (double *)calloc(dim, sizeof(double)); 36 | vec_b = (double *)calloc(dim, sizeof(double)); 37 | 38 | // input mat_a and vec_x 39 | for(i = 0; i < dim; i++) 40 | { 41 | for(j = 0; j < dim; j++) 42 | { 43 | //mat_a[i * dim + j] = (double)rand() / (double)RAND_MAX; 44 | mat_a[i * dim + j] = 1.0 / (double)(i + j + 1); 45 | if((i + j + 1) % 2 != 0) 46 | mat_a[i * dim + j] = -mat_a[i * dim + j]; 47 | } 48 | mat_a[i * dim + i] += 2.0; 49 | vec_x[i] = 1.0 / (double)(i + 1); 50 | } 51 | 52 | // size(vec_x) == size(vec_b) 53 | inc_vec_x = inc_vec_b = 1; 54 | 55 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 56 | alpha = 1.0; 57 | beta = 0.0; 58 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 59 | 60 | // print 61 | for(i = 0; i < dim; i++) 62 | { 63 | printf("["); 64 | for(j = 0; j < dim; j++) 65 | printf("%10.3f ", mat_a[i * dim + j]); 66 | printf("] %10.3f = %10.3f\n", vec_x[i], vec_b[i]); 67 | } 68 | 69 | // initialize pivot 70 | pivot = (lapack_int *)calloc(dim, sizeof(lapack_int)); 71 | 72 | // LU decomposition 73 | info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, mat_a, dim, pivot); 74 | printf("DGETRF info = %d\n", info); 75 | 76 | // forward & backward substitution 77 | info = LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, mat_a, dim, pivot, vec_b, 1); 78 | printf("DGETRS info = %d\n", info); 79 | 80 | // print 81 | printf("calculated x = \n"); 82 | for(i = 0; i < dim; i++) 83 | { 84 | printf("%3d -> %3d: ", i, pivot[i]); 85 | printf("%25.17e ", vec_b[i]); 86 | printf("\n"); 87 | } 88 | 89 | // diff 90 | printf("x - calculated x = \n"); 91 | for(i = 0; i < dim; i++) 92 | { 93 | printf("%3d: ", i); 94 | printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i])); 95 | printf("\n"); 96 | } 97 | 98 | // free 99 | free(mat_a); 100 | free(vec_x); 101 | free(vec_b); 102 | free(pivot); 103 | 104 | return EXIT_SUCCESS; 105 | } 106 | -------------------------------------------------------------------------------- /linear_eq_dsgesv.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Solver for Linear equation with DSGESV */ 4 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | 10 | #include "lapacke.h" 11 | #include "cblas.h" 12 | 13 | int main() 14 | { 15 | lapack_int i, j, dim, itimes; 16 | lapack_int inc_vec_x, inc_vec_b; 17 | lapack_int *pivot, info; 18 | 19 | double *mat_a, *vec_b, *vec_x, *vec_x_approx; 20 | double alpha, beta; 21 | double running_time; 22 | 23 | // input dimension of a linear equation to be solved 24 | printf("Dim = "); scanf("%d", &dim); 25 | 26 | if(dim <= 0) 27 | { 28 | printf("Illegal dimension! (dim = %d)\n", dim); 29 | return EXIT_FAILURE; 30 | } 31 | 32 | // inisialize a matrix and vectors 33 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 34 | vec_x = (double *)calloc(dim, sizeof(double)); 35 | vec_x_approx = (double *)calloc(dim, sizeof(double)); 36 | vec_b = (double *)calloc(dim, sizeof(double)); 37 | 38 | // input mat_a and vec_x 39 | for(i = 0; i < dim; i++) 40 | { 41 | for(j = 0; j < dim; j++) 42 | { 43 | //mat_a[i * dim + j] = (double)rand() / (double)RAND_MAX; 44 | mat_a[i * dim + j] = 1.0 / (double)(i + j + 1); 45 | if((i + j + 1) % 2 != 0) 46 | mat_a[i * dim + j] = -mat_a[i * dim + j]; 47 | } 48 | mat_a[i * dim + i] += 2.0; 49 | vec_x[i] = 1.0 / (double)(i + 1); 50 | } 51 | 52 | // size(vec_x) == size(vec_b) 53 | inc_vec_x = inc_vec_b = 1; 54 | 55 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 56 | alpha = 1.0; 57 | beta = 0.0; 58 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 59 | 60 | // print 61 | for(i = 0; i < dim; i++) 62 | { 63 | printf("["); 64 | for(j = 0; j < dim; j++) 65 | printf("%10.3f ", mat_a[i * dim + j]); 66 | printf("] %10.3f = %10.3f\n", vec_x[i], vec_b[i]); 67 | } 68 | 69 | // initialize pivot 70 | pivot = (lapack_int *)calloc(dim, sizeof(lapack_int)); 71 | 72 | // single-double mixed presicion iterative refinement method 73 | info = LAPACKE_dsgesv(LAPACK_ROW_MAJOR, dim, 1, mat_a, dim, pivot, vec_b, 1, vec_x_approx, 1, &itimes); 74 | printf("DSGESV info = %d, Iterative Times = %d\n", info, itimes); 75 | 76 | // print 77 | printf("calculated x = \n"); 78 | for(i = 0; i < dim; i++) 79 | { 80 | printf("%3d -> %3d: ", i, pivot[i]); 81 | printf("%25.17e ", vec_x_approx[i]); 82 | printf("\n"); 83 | } 84 | 85 | // diff 86 | printf("x - calculated x = \n"); 87 | for(i = 0; i < dim; i++) 88 | { 89 | printf("%3d: ", i); 90 | printf("%10.2e ", fabs((vec_x[i] - vec_x_approx[i]) / vec_x[i])); 91 | printf("\n"); 92 | } 93 | 94 | // free 95 | free(mat_a); 96 | free(vec_x); 97 | free(vec_x_approx); 98 | free(vec_b); 99 | free(pivot); 100 | 101 | return EXIT_SUCCESS; 102 | } 103 | -------------------------------------------------------------------------------- /linear_eq_dsposv.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Solver for Linear equation with DSPOSV */ 4 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | 10 | #include "lapacke.h" 11 | #include "cblas.h" 12 | 13 | int main() 14 | { 15 | lapack_int i, j, dim, itimes; 16 | lapack_int inc_vec_x, inc_vec_b; 17 | lapack_int info; 18 | 19 | double *mat_a, *vec_b, *vec_x, *vec_x_iter; 20 | double alpha, beta; 21 | double running_time; 22 | 23 | // input dimension of linear equation to be solved 24 | printf("Dim = "); scanf("%d", &dim); 25 | 26 | if(dim <= 0) 27 | { 28 | printf("Illegal dimension! (dim = %d)\n", dim); 29 | return EXIT_FAILURE; 30 | } 31 | 32 | // initialize a matrix and vectors 33 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 34 | vec_x = (double *)calloc(dim, sizeof(double)); 35 | vec_x_iter = (double *)calloc(dim, sizeof(double)); 36 | vec_b = (double *)calloc(dim, sizeof(double)); 37 | 38 | // input mat_a and vec_x 39 | for(i = 0; i < dim; i++) 40 | { 41 | for(j = 0; j < dim; j++) 42 | { 43 | //mat_a[i * dim + j] = (double)rand() / (double)RAND_MAX; 44 | mat_a[i * dim + j] = 1.0 / (double)(i + j + 1); 45 | if((i + j + 1) % 2 != 0) 46 | mat_a[i * dim + j] = -mat_a[i * dim + j]; 47 | } 48 | mat_a[i * dim + i] += 2.0; 49 | vec_x[i] = 1.0 / (double)(i + 1); 50 | } 51 | 52 | // size(vec_x) == size(vec_b) 53 | inc_vec_x = inc_vec_b = 1; 54 | 55 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 56 | alpha = 1.0; 57 | beta = 0.0; 58 | cblas_dsymv(CblasRowMajor, CblasUpper, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 59 | 60 | // print 61 | for(i = 0; i < dim; i++) 62 | { 63 | printf("["); 64 | for(j = 0; j < dim; j++) 65 | printf("%10.3f ", mat_a[i * dim + j]); 66 | printf("] %10.3f = %10.3f\n", vec_x[i], vec_b[i]); 67 | } 68 | 69 | // solve A * X = C -> C := X 70 | info = LAPACKE_dsposv(LAPACK_ROW_MAJOR, 'U', dim, 1, mat_a, dim, vec_b, 1, vec_x_iter, 1, &itimes); 71 | printf("DSPOSV info = %d, Iterative Times = %d\n", info, itimes); 72 | 73 | // print 74 | printf("calculated x = \n"); 75 | for(i = 0; i < dim; i++) 76 | { 77 | //printf("%3d -> %3d: ", i, pivot[i]); 78 | printf("%25.17e ", vec_x_iter[i]); 79 | printf("\n"); 80 | } 81 | 82 | // diff 83 | printf("x - calculated x = \n"); 84 | for(i = 0; i < dim; i++) 85 | { 86 | printf("%3d: ", i); 87 | printf("%10.2e ", fabs((vec_x[i] - vec_x_iter[i]) / vec_x[i])); 88 | printf("\n"); 89 | } 90 | 91 | // free 92 | free(mat_a); 93 | free(vec_x); 94 | free(vec_x_iter); 95 | free(vec_b); 96 | 97 | return EXIT_SUCCESS; 98 | } 99 | -------------------------------------------------------------------------------- /linear_eq_dsysv.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/linear_eq_dsysv.c -------------------------------------------------------------------------------- /linear_eq_magma.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Solver for Linear equation with MAGMA */ 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | 10 | // BLAS on CPU 11 | #include "cblas.h" 12 | 13 | // CUDA & MAGMA 14 | #include "cuda.h" 15 | #include "magma.h" 16 | 17 | int main() 18 | { 19 | int i, j, dim; 20 | int inc_vec_x, inc_vec_b; 21 | int *pivot, info; 22 | 23 | double *mat_a, *vec_b, *vec_x; // CPU 24 | double alpha, beta; 25 | 26 | // input dimension 27 | printf("Dim = "); scanf("%d", &dim); 28 | 29 | if(dim <= 0) 30 | { 31 | printf("Illegal dimension! (dim = %d)\n", dim); 32 | return EXIT_FAILURE; 33 | } 34 | 35 | // initialize a matrix and vectors on CPU 36 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 37 | vec_x = (double *)calloc(dim, sizeof(double)); 38 | vec_b = (double *)calloc(dim, sizeof(double)); 39 | 40 | // input mat_a and vec_x on CPU 41 | for(j = 0; j < dim; j++) 42 | { 43 | // Column-major 44 | for(i = 0; i < dim; i++) 45 | { 46 | mat_a[i + j * dim] = (double)rand() / (double)RAND_MAX; 47 | if(rand() % 2 != 0) 48 | mat_a[i + j * dim] = -mat_a[i + j * dim]; 49 | } 50 | vec_x[j] = 1.0 / (double)(j + 1); 51 | } 52 | 53 | // size(vec_x) == size(vec_b) 54 | inc_vec_x = inc_vec_b = 1; 55 | 56 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 57 | alpha = 1.0; 58 | beta = 0.0; 59 | cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 60 | 61 | // print 62 | /* for(i = 0; i < dim; i++) 63 | { 64 | printf("["); 65 | for(j = 0; j < dim; j++) 66 | printf("%10.3f ", mat_a[i + j * dim]); 67 | printf("] %10.3f = %10.3f\n", vec_x[i], vec_b[i]); 68 | } 69 | */ 70 | // start MAGMA 71 | magma_init(); 72 | 73 | // initialize pivot area 74 | pivot = (int *)calloc(sizeof(int), dim); 75 | 76 | // solve A * X = C -> C := X 77 | magma_dgesv(dim, 1, mat_a, dim, pivot, vec_b, dim, &info); 78 | 79 | printf("info = %d\n", info); 80 | 81 | // print 82 | /* printf("calculated x = \n"); 83 | for(i = 0; i < dim; i++) 84 | { 85 | printf("%3d -> %3d: ", i, pivot[i]); 86 | printf("%25.17e ", vec_b[i]); 87 | printf("\n"); 88 | } 89 | 90 | // diff 91 | printf("x - calculated x = \n"); 92 | for(i = 0; i < dim; i++) 93 | { 94 | printf("%3d: ", i); 95 | printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i])); 96 | printf("\n"); 97 | } 98 | */ 99 | // norm relative error: ||vec_b - vec_x||_2 / ||vec_x||_2 100 | cblas_daxpy(dim, -1.0, vec_x, 1, vec_b, 1); 101 | printf("||x - calculated x||_2 = %10.7e\n", cblas_dnrm2(dim, vec_b, 1) / cblas_dnrm2(dim, vec_x, 1)); 102 | 103 | // free 104 | free(mat_a); 105 | free(vec_x); 106 | free(vec_b); 107 | free(pivot); 108 | 109 | // finalize MAGMA 110 | magma_finalize(); 111 | 112 | return EXIT_SUCCESS; 113 | } 114 | -------------------------------------------------------------------------------- /matvec_mul.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/matvec_mul.c -------------------------------------------------------------------------------- /matvec_mul_cublas.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Mutiplication of matrix and vector */ 4 | /* with cuBLAS */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | // BLAS on CPU 12 | #include "cblas.h" 13 | 14 | // define mycuda_calloc and mycuda_free functions 15 | #include "mycuda.h" // CUDA 16 | 17 | #include "cublas_v2.h" // cuBLAS 18 | 19 | int main() 20 | { 21 | int i, j, dim; 22 | int inc_vec_x, inc_vec_b; 23 | 24 | double *mat_a, *vec_b, *vec_x, *vec_b_gpu; // CPU 25 | double *dev_mat_a, *dev_vec_b, *dev_vec_x; // GPU 26 | double alpha, beta; 27 | 28 | // variables for cuBLAS 29 | cublasStatus_t status; 30 | cublasHandle_t handle; 31 | 32 | // input dimension 33 | printf("Dim = "); scanf("%d", &dim); 34 | 35 | if(dim <= 0) 36 | { 37 | printf("Illegal dimension! (dim = %d)\n", dim); 38 | return EXIT_FAILURE; 39 | } 40 | 41 | // initialize a matrix and vectors on CPU 42 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 43 | vec_x = (double *)calloc(dim, sizeof(double)); 44 | vec_b = (double *)calloc(dim, sizeof(double)); 45 | 46 | // input mat_a and vec_x 47 | for(j = 0; j < dim; j++) 48 | { 49 | for(i = 0; i < dim; i++) 50 | { 51 | // column-major 52 | mat_a[i + j * dim] = (double)rand() / (double)RAND_MAX; 53 | if(rand() % 2 != 0) 54 | mat_a[i + j * dim] *= -1.0; 55 | } 56 | vec_x[j] = 1.0 / (double)(j + 1); 57 | } 58 | 59 | // size(vec_x) == size(vec_b) 60 | inc_vec_x = inc_vec_b = 1; 61 | 62 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 63 | alpha = 1.0; 64 | beta = 0.0; 65 | cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 66 | 67 | // print 68 | /* for(i = 0; i < dim; i++) 69 | { 70 | printf("["); 71 | for(j = 0; j < dim; j++) 72 | printf("%10.3lf ", mat_a[i + j * dim]); 73 | printf("] %10.3lf = %10.3lf\n", vec_x[i], vec_b[i]); 74 | } 75 | */ 76 | // GPU 77 | 78 | // initialize a matrix and vectors on GPU 79 | dev_mat_a = (double *)mycuda_calloc(dim * dim, sizeof(double)); 80 | dev_vec_x = (double *)mycuda_calloc(dim, sizeof(double)); 81 | dev_vec_b = (double *)mycuda_calloc(dim, sizeof(double)); 82 | 83 | // dev_vec_x on GPU -> vec_x_gpu on CPU 84 | vec_b_gpu = (double *)calloc(dim, sizeof(double)); 85 | 86 | // CPU(Host) -> GPU(device) 87 | 88 | // start cuBLAS 89 | status = cublasCreate(&handle); 90 | 91 | if(status != CUBLAS_STATUS_SUCCESS) 92 | { 93 | printf("Fail to initialize cuBLAS!\n"); 94 | 95 | mycuda_free(dev_mat_a); 96 | mycuda_free(dev_vec_b); 97 | mycuda_free(dev_vec_x); 98 | cublasDestroy(handle); 99 | 100 | return 0; 101 | } 102 | 103 | // mat_a -> dev_mat_a 104 | status = cublasSetMatrix(dim, dim, sizeof(double), mat_a, dim, dev_mat_a, dim); 105 | if(status != CUBLAS_STATUS_SUCCESS) 106 | printf("mat_a -> dev_mat_a: cublasSetMatrix failed.\n"); 107 | 108 | // size(vec_x) == size(vec_b) 109 | inc_vec_x = inc_vec_b = 1; 110 | 111 | // vec_x -> dev_vec_x 112 | status = cublasSetVector(dim, sizeof(double), vec_x, inc_vec_x, dev_vec_x, inc_vec_x); 113 | if(status != CUBLAS_STATUS_SUCCESS) 114 | printf("vec_x -> dev_vec_x: cublasSetVector failed.\n"); 115 | 116 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 117 | alpha = 1.0; 118 | beta = 0.0; 119 | status = cublasDgemv(handle, CUBLAS_OP_N, dim, dim, &alpha, dev_mat_a, dim, dev_vec_x, inc_vec_x, &beta, dev_vec_b, inc_vec_b); 120 | 121 | if(status != CUBLAS_STATUS_SUCCESS) 122 | printf("cublasDgemv failed.\n"); 123 | 124 | // synchronize 125 | cudaDeviceSynchronize(handle); 126 | 127 | // dev_vec_b -> vec_b_gpu 128 | status = cublasGetVector(dim, sizeof(double), dev_vec_b, inc_vec_b, vec_b_gpu, inc_vec_b); 129 | if(status != CUBLAS_STATUS_SUCCESS) 130 | printf("dev_vec_b -> vec_b_gpu: cublasGetVector failed.\n"); 131 | 132 | // print 133 | /* for(i = 0; i < dim; i++) 134 | { 135 | printf("["); 136 | for(j = 0; j < dim; j++) 137 | printf("%10.3lf ", mat_a[i + j * dim]); 138 | printf("] %10.3lf = %10.3lf\n", vec_x[i], vec_b_gpu[i]); 139 | } 140 | */ 141 | // relative difference: ||vec_b_gpu - vec_b||_2 / ||vec_b||_2 142 | cblas_daxpy(dim, -1.0, vec_b, inc_vec_b, vec_b_gpu, inc_vec_b); 143 | printf("||vec_b_gpu - vec_b||_2 / ||vec_b||_2 = %15.7e\n", cblas_dnrm2(dim, vec_b_gpu, 1) / cblas_dnrm2(dim, vec_b, 1)); 144 | 145 | // free 146 | free(mat_a); 147 | free(vec_x); 148 | free(vec_b); 149 | free(vec_b_gpu); 150 | 151 | mycuda_free(dev_mat_a); 152 | mycuda_free(dev_vec_b); 153 | mycuda_free(dev_vec_x); 154 | 155 | return EXIT_SUCCESS; 156 | } 157 | -------------------------------------------------------------------------------- /matvec_mul_magma.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/matvec_mul_magma.c -------------------------------------------------------------------------------- /matvec_mul_magma_pure.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Mutiplication of matrix and vector */ 4 | /* with magmablas */ 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | // BLAS on CPU 12 | #include "cblas.h" 13 | 14 | // MAGMA 15 | #include "magma.h" 16 | 17 | int main() 18 | { 19 | int i, j, dim; 20 | int inc_vec_x, inc_vec_b; 21 | 22 | double *mat_a, *vec_b, *vec_x, *vec_b_gpu; // CPU 23 | double *dev_mat_a, *dev_vec_b, *dev_vec_x; // GPU 24 | double alpha, beta; 25 | 26 | // input dimension 27 | printf("Dim = "); scanf("%d", &dim); 28 | 29 | if(dim <= 0) 30 | { 31 | printf("Illegal dimension! (dim = %d)\n", dim); 32 | return EXIT_FAILURE; 33 | } 34 | 35 | // start MAGMA 36 | magma_init(); 37 | 38 | // initialize a matrix and vectors on CPU 39 | magma_dmalloc_cpu(&mat_a, dim * dim); 40 | magma_dmalloc_cpu(&vec_b, dim); 41 | magma_dmalloc_cpu(&vec_x, dim); 42 | 43 | // input mat_a and vec_x 44 | for(j = 0; j < dim; j++) 45 | { 46 | for(i = 0; i < dim; i++) 47 | { 48 | // column-major 49 | mat_a[i + j * dim] = (double)rand() / (double)RAND_MAX; 50 | if(rand() % 2 != 0) 51 | mat_a[i + j * dim] *= -1.0; 52 | } 53 | vec_x[j] = 1.0 / (double)(j + 1); 54 | } 55 | 56 | // size(vec_x) == size(vec_b) 57 | inc_vec_x = inc_vec_b = 1; 58 | 59 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 60 | alpha = 1.0; 61 | beta = 0.0; 62 | cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b); 63 | 64 | // print 65 | for(i = 0; i < dim; i++) 66 | { 67 | printf("["); 68 | for(j = 0; j < dim; j++) 69 | printf("%10.3lf ", mat_a[i + j * dim]); 70 | printf("] %10.3lf = %10.3lf\n", vec_x[i], vec_b[i]); 71 | } 72 | 73 | // GPU 74 | 75 | // initialize a matrix and vectors on GPU (MAGMA) 76 | magma_dmalloc(&dev_mat_a, dim * dim); 77 | magma_dmalloc(&dev_vec_x, dim); 78 | magma_dmalloc(&dev_vec_b, dim); 79 | 80 | // dev_vec_x on GPU -> vec_x_gpu on CPU 81 | magma_dmalloc_cpu(&vec_b_gpu, dim); 82 | 83 | // CPU(Host) -> GPU(device) 84 | 85 | // mat_a -> dev_mat_a 86 | magma_dsetmatrix(dim, dim, mat_a, dim, dev_mat_a, dim); 87 | 88 | // size(vec_x) == size(vec_b) 89 | inc_vec_x = inc_vec_b = 1; 90 | 91 | // vec_x -> dev_vec_x 92 | magma_dsetvector(dim, vec_x, inc_vec_x, dev_vec_x, inc_vec_x); 93 | 94 | // vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b 95 | alpha = 1.0; 96 | beta = 0.0; 97 | magmablas_dgemv(MagmaNoTrans, dim, dim, alpha, dev_mat_a, dim, dev_vec_x, inc_vec_x, beta, dev_vec_b, inc_vec_b); 98 | 99 | // dev_vec_b -> vec_b_gpu 100 | magma_dgetvector(dim, dev_vec_b, inc_vec_b, vec_b_gpu, inc_vec_b); 101 | 102 | // print 103 | for(i = 0; i < dim; i++) 104 | { 105 | printf("["); 106 | for(j = 0; j < dim; j++) 107 | printf("%10.3lf ", mat_a[i + j * dim]); 108 | printf("] %10.3lf = %10.3lf\n", vec_x[i], vec_b_gpu[i]); 109 | } 110 | 111 | // relative difference: ||vec_b_gpu - vec_b||_2 / ||vec_b||_2 112 | cblas_daxpy(dim, -1.0, vec_b, inc_vec_b, vec_b_gpu, inc_vec_b); 113 | printf("||vec_b_gpu - vec_b||_2 / ||vec_b||_2 = %15.7e\n", cblas_dnrm2(dim, vec_b_gpu, 1) / cblas_dnrm2(dim, vec_b, 1)); 114 | 115 | // free 116 | magma_free_cpu(mat_a); 117 | magma_free_cpu(vec_x); 118 | magma_free_cpu(vec_b); 119 | magma_free_cpu(vec_b_gpu); 120 | 121 | magma_free(dev_mat_a); 122 | magma_free(dev_vec_b); 123 | magma_free(dev_vec_x); 124 | 125 | // finalize MAGMA 126 | magma_finalize(); 127 | 128 | return EXIT_SUCCESS; 129 | } 130 | -------------------------------------------------------------------------------- /mm/diagtest.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | %------------------------------------------------------------------------------- 3 | % Test Sparse Matrix 4 | % [ 5 0 0 0 0 ] 5 | % [ 0 6 0 0 0 ] 6 | % [ 0 0 7 0 0 ] 7 | % [ 0 0 0 8 0 ] 8 | % [ 0 0 0 0 9 ] 9 | %------------------------------------------------------------------------------- 10 | 5 5 5 11 | 1 1 5 12 | 2 2 6 13 | 3 3 7 14 | 4 4 8 15 | 5 5 9 16 | -------------------------------------------------------------------------------- /mm/matrix_market_io.h: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* I/O functions for MatrixMarcket format */ 3 | /* */ 4 | /* Version 0.1: 2015-04-15(Wed) */ 5 | /* */ 6 | /* Original code: */ 7 | /* http://math.nist.gov/MatrixMarket/mmio-c.html */ 8 | /* *************** Public Domain *************** */ 9 | /*************************************************/ 10 | 11 | #ifndef __MATRIX_MARKET_IO_H__ 12 | #define __MATRIX_MARKET_IO_H__ 13 | 14 | #include 15 | #include 16 | #include 17 | 18 | // Max. length of string per one line 19 | #define MM_MAX_LINE_LEN 4096 20 | 21 | // Max. length of token 22 | #define MM_MAX_TOKEN_LEN 128 23 | 24 | // Bunner of Matrix Market format 25 | #define MM_BANNER "%%MatrixMarket" 26 | 27 | // return codes 28 | #define MM_ERROR (-1) 29 | #define MM_SUCCESS (0) 30 | 31 | // boolean values 32 | #define MM_TRUE (1) 33 | #define MM_FALSE (0) 34 | 35 | // The number of kinds of Matrix Market typecode 36 | #define MM_NUM_TYPECODE 4 37 | 38 | // Matrix Type code 39 | typedef unsigned char MM_typecode[MM_NUM_TYPECODE]; 40 | 41 | /******************************************************/ 42 | /* Matrix Market Internal definitions of Matrix types */ 43 | /* type[4] ... 4-character sequence */ 44 | /* */ 45 | /* Object Sparse Storage */ 46 | /* or Dense Data Type Scheme */ 47 | /* */ 48 | /* type[0] type[1] type[2] type[3] */ 49 | /* */ 50 | /* M(atrix) C(oord) R(real) G(eneral) */ 51 | /* A(rray) C(omplex) H(ermitian) */ 52 | /* P(attern) S(ymmetric) */ 53 | /* I(nteger) (s)K(ew-symmetric) */ 54 | /******************************************************/ 55 | 56 | #define MM_MATRIX_STR "matrix" 57 | #define mm_is_matrix(typecode) ((typecode)[0] == 'M') 58 | #define mm_set_matrix(typecode) ((*typecode)[0] = 'M') 59 | 60 | #define MM_SPARSE_STR "coordinate" 61 | #define mm_is_sparse(typecode) ((typecode)[1] == 'C') 62 | #define mm_set_sparse(typecode) ((*typecode)[1] = 'C') 63 | 64 | #define MM_COODINATE_STR "coordinate" 65 | #define mm_is_coodinate(typecode) ((typecode)[1] == 'C') 66 | #define mm_set_coodinate(typecode) ((*typecode)[1] = 'C') 67 | 68 | #define MM_DENSE_STR "array" 69 | #define mm_is_dense(typecode) ((typecode)[1] == 'A') 70 | #define mm_set_dense(typecode) ((*typecode)[1] = 'A') 71 | 72 | #define MM_ARRAY_STR "array" 73 | #define mm_is_array(typecode) ((typecode)[1] == 'A') 74 | #define mm_set_array(typecode) ((*typecode)[1] = 'A') 75 | 76 | #define MM_REAL_STR "real" 77 | #define mm_is_real(typecode) ((typecode)[2] == 'R') 78 | #define mm_set_real(typecode) ((*typecode)[2] = 'R') 79 | 80 | #define MM_COMPLEX_STR "complex" 81 | #define mm_is_complex(typecode) ((typecode)[2] == 'C') 82 | #define mm_set_complex(typecode) ((*typecode)[2] = 'C') 83 | 84 | #define MM_PATTERN_STR "pattern" 85 | #define mm_is_pattern(typecode) ((typecode)[2] == 'P') 86 | #define mm_set_pattern(typecode) ((*typecode)[2] = 'P') 87 | 88 | #define MM_INTEGER_STR "integer" 89 | #define mm_is_integer(typecode) ((typecode)[2] == 'I') 90 | #define mm_set_integer(typecode) ((*typecode)[2] = 'I') 91 | 92 | #define MM_GENERAL_STR "general" 93 | #define mm_is_general(typecode) ((typecode)[3] == 'G') 94 | #define mm_set_general(typecode) ((*typecode)[3] = 'G') 95 | 96 | #define MM_HERMITIAN_STR "hermitian" 97 | #define mm_is_hermitian(typecode) ((typecode)[3] == 'H') 98 | #define mm_set_hermitian(typecode) ((*typecode)[3] = 'H') 99 | 100 | #define MM_SYMMETRIC_STR "symmetric" 101 | #define mm_is_symmetric(typecode) ((typecode)[3] == 'S') 102 | #define mm_set_symmetric(typecode) ((*typecode)[3] = 'S') 103 | 104 | #define MM_SKEW_STR "skew-symmetric" 105 | #define mm_is_skew(typecode) ((typecode)[3] == 'K') 106 | #define mm_set_skew(typecode) ((*typecode)[3] = 'K') 107 | 108 | /* clear type codes */ 109 | #define mm_clear_typecode(typecode) ((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G') 110 | #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode) 111 | 112 | /* error codes */ 113 | #define MM_COULD_NOT_READ_FILE 11 114 | #define MM_COULD_NOT_READ_FILE_STR "MM_ERROR: cannot read the file!" 115 | 116 | #define MM_COULD_NOT_WRITE_FILE 12 117 | #define MM_COULD_NOT_WRITE_FILE_STR "MM_ERROR: cannot write the file!" 118 | 119 | #define MM_NOT_MATRIX 13 120 | #define MM_NOT_MATRIX_STR "MM_ERROR: not matrix!" 121 | 122 | #define MM_NO_HEADER 14 123 | #define MM_NO_HEADER_STR "MM_ERROR: cannot find MM header!" 124 | 125 | #define MM_UNSUPPORTED_TYPE 15 126 | #define MM_UNSUPPORTED_TYPE_STR "MM_ERROR: is unsupported type!" 127 | 128 | #define MM_LINE_TOO_LONG 16 129 | #define MM_LINE_TOO_LONG_STR "MM_ERROR: length of line exceeded!" 130 | 131 | #define MM_PREMATURE_EOF 17 132 | #define MM_PREMATURE_EOF_STR "MM_ERROR: premature EOF!" 133 | 134 | #define MM_UNDEFINED_ERROR 18 135 | #define MM_UNDEFINED_ERROR_STR "MM_ERROR: cannot specify the king of errors!" 136 | 137 | /******************************************/ 138 | /* Functions defined in mmio */ 139 | /* */ 140 | /* mm_is_valid */ 141 | /* mm_typecode_to_str */ 142 | /* mm_read_banner */ 143 | /* mm_read_mtx_crd_size */ 144 | /* mm_write_crd_size */ 145 | /* mm_read_unsymmetric_sparse (obsolete?) */ 146 | /* mm_read_mtx_array_size */ 147 | /* mm_write_mtx_array_size */ 148 | /* mm_read_mtx_crd_data */ 149 | /* mm_read_mtx_crd_entry */ 150 | /* mm_read_mtx_crd */ 151 | /* mm_write_mtx_crd */ 152 | /******************************************/ 153 | #ifdef __cplusplus 154 | extern "C" { 155 | #endif // __cplusplus 156 | 157 | // NULL or print str 158 | unsigned char *mm_put_str(unsigned char *str); 159 | 160 | /* print error messages */ 161 | int mm_print_error(unsigned char *str, int error_code); 162 | 163 | // convert type codes to strings 164 | unsigned char *mm_typecode_to_str(MM_typecode matcode); 165 | 166 | // check matcode 167 | int mm_is_valid(MM_typecode matcode); 168 | 169 | // read banner 170 | int mm_read_banner(FILE *fp, MM_typecode *matcode); 171 | 172 | // write banner 173 | int mm_write_banner(FILE *fp, MM_typecode matcode); 174 | 175 | // read coodinate matrix size 176 | int mm_read_mtx_crd_size(FILE *fp, int *row_dim, int *col_dim, int *num_nonzeros); 177 | 178 | // write coodinate matrix size 179 | int mm_write_crd_size(FILE *fp, int row_dim, int col_dim, int num_nonzeros); 180 | 181 | // read unsymmetric double precision sparse matrix 182 | // I cannot understand thre reason why this function is nessesary. For practical training ? 183 | int mm_read_unsymmetric_sparse(const char *fname, int *row_dim, int *col_dim, int *num_nonzeros, double **val, int **row_index, int **col_index); 184 | 185 | // read mtx array size 186 | int mm_read_mtx_array_size(FILE *fp, int *row_dim, int *col_dim); 187 | 188 | // write mtx array size 189 | int mm_write_mtx_array_size(FILE *fp, int row_dim, int col_dim); 190 | 191 | // read coodinate matrix when row_index, col_index, val are already allocated 192 | int mm_read_mtx_crd_data(FILE *fp, int row_dim, int col_dim, int num_nonzeros, int row_index[], int col_index[], double val[], MM_typecode matcode); 193 | 194 | // read one line(entry) of coodinate matrix 195 | int mm_read_mtx_crd_entry(FILE *fp, int *row_index, int *col_index, double *real, double *imag, MM_typecode matcode); 196 | 197 | // read any kinds of coodinate matrix 198 | // After finishing to read the whole data, mm_read_mtx_crd fills 199 | // row_dim, col_dim, num_nonzeros, row_index, col_index and array of values, 200 | // and return typecode such as "MCRS" which means "Matrix, Coordinate, Real, Symmetric". 201 | int mm_read_mtx_crd(const char *fname, int *row_dim, int *col_dim, int *num_nonzeros, int **row_index, int **col_index, double **val, MM_typecode *matcode); 202 | 203 | // write mtx format file 204 | int mm_write_mtx_crd(unsigned char *fname, int row_dim, int col_dim, int num_nonzeros, int row_index[], int col_index[], double val[], MM_typecode matcode); 205 | 206 | /******************************************/ 207 | /* Functions newly defined */ 208 | /* */ 209 | /* mm_read_mtx_array_data */ 210 | /* mm_read_mtx_array_entry */ 211 | /* mm_read_mtx_array */ 212 | /* mm_write_mtx_array */ 213 | /* mm_print_header_mtx_crd */ 214 | /* mm_print_header_mtx_array */ 215 | /******************************************/ 216 | 217 | // read array type matrix when val is already allocated 218 | int mm_read_mtx_array_data(FILE *fp, int row_dim, int col_dim, double val[], MM_typecode matcode); 219 | 220 | // read one line(entry) of coodinate matrix 221 | int mm_read_mtx_array_entry(FILE *fp, double *real, double *imag, MM_typecode matcode); 222 | 223 | // read general real or complex array matrix 224 | // After finishing to read the whole data, mm_read_mtx_crd fills 225 | // row_dim, col_dim and array of values, 226 | // and return typecode such as "MARG" which means "Matrix, Array, Real, General". 227 | int mm_read_mtx_array(const char *fname, int *row_dim, int *col_dim, double **val, MM_typecode *matcode); 228 | 229 | // write mtx format file 230 | int mm_write_mtx_array(unsigned char *fname, int row_dim, int col_dim, double val[], MM_typecode matcode); 231 | 232 | // read any kinds of coodinate matrix within specified number of lines 233 | int mm_print_header_mtx_crd(const char *fname, int num_lines_print); 234 | 235 | // read any kinds of coodinate matrix within specified number of lines 236 | int mm_print_header_mtx_array(const char *fname, int num_lines_print); 237 | 238 | #ifdef __cplusplus 239 | } // extern "C" 240 | #endif // __cplusplus 241 | 242 | #endif // __MATRIX_MARKET_IO_H__ 243 | -------------------------------------------------------------------------------- /mm/sptest.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | %------------------------------------------------------------------------------- 3 | % Test Sparse Matrix 4 | % [ 5 0 1 ] 5 | % [ 0 6 0 ] 6 | % [ 0 1 7 ] 7 | %------------------------------------------------------------------------------- 8 | 3 3 5 9 | 1 1 5.0 10 | 2 2 6.0 11 | 3 2 1.0 12 | 1 3 1.0 13 | 3 3 7.0 14 | -------------------------------------------------------------------------------- /mm/sptest_array.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix array real general 2 | %------------------------------------------------------------------------------- 3 | % Test Sparse Matrix 4 | % [ 5 0 1 ] 5 | % [ 0 6 0 ] 6 | % [ 0 1 7 ] 7 | %------------------------------------------------------------------------------- 8 | 3 3 9 | 5.0 10 | 0 11 | 0 12 | 0.0 13 | 6.0 14 | 1.0 15 | 1.0 16 | 0.0 17 | 7.0 18 | -------------------------------------------------------------------------------- /mm/sptest_big.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate real general 2 | %------------------------------------------------------------------------------- 3 | % Test Sparse Matrix 4 | % [ 5 0 1 0 0 ] 5 | % [ 0 6 0 0 -1 ] 6 | % [ 0 1 7 0 0 ] 7 | % [ 0 1 0 8 0 ] 8 | % [ 0 0 1 0 9 ] 9 | %------------------------------------------------------------------------------- 10 | 5 5 10 11 | 1 1 5 12 | 1 3 1 13 | 2 2 6 14 | 2 5 -1 15 | 3 2 1 16 | 3 3 7 17 | 4 2 1 18 | 4 4 8 19 | 5 3 1 20 | 5 5 9 21 | -------------------------------------------------------------------------------- /mm/sptest_pattern.mtx: -------------------------------------------------------------------------------- 1 | %%MatrixMarket matrix coordinate pattern general 2 | %------------------------------------------------------------------------------- 3 | % Test Sparse Matrix 4 | % [ 1 0 1 ] 5 | % [ 0 1 0 ] 6 | % [ 0 1 1 ] 7 | %------------------------------------------------------------------------------- 8 | 3 3 5 9 | 1 1 10 | 2 2 11 | 3 2 12 | 1 3 13 | 3 3 14 | -------------------------------------------------------------------------------- /my_linear_eq.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Solver for Linear equation */ 4 | /* with original routines */ 5 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | // rowwise only 12 | // my_matvec_mul: vec_b := mat_a * vec_x 13 | void my_matvec_mul(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x) 14 | { 15 | int i, j, row_index; 16 | 17 | // main loop 18 | for(i = 0; i < row_dim; i++) 19 | { 20 | vec_b[i] = 0.0; 21 | row_index = row_dim * i; 22 | 23 | for(j = 0; j < col_dim; j++) 24 | vec_b[i] += mat_a[row_index + j] * vec_x[j]; 25 | } 26 | } 27 | 28 | // rowwise only 29 | // my_linear_eq_solve: solve mat_a * x = vec_b in x -> vec_b := x 30 | int my_linear_eq_solve(double *mat_a, int dim, int *pivot, double *vec_b) 31 | { 32 | int i, j, k, row_index_j, row_index_i, max_j, tmp_index; 33 | double absmax_aji, abs_aji, pivot_aii, vec_x; 34 | 35 | // initialize pivot vector 36 | for(i = 0; i < dim; i++) 37 | pivot[i] = i; 38 | 39 | // forward 40 | for(i = 0; i < dim; i++) 41 | { 42 | // partial pivoting 43 | absmax_aji = fabs(mat_a[pivot[i] * dim + i]); 44 | max_j = i; 45 | for(j = i + 1; j < dim; j++) 46 | { 47 | abs_aji = mat_a[pivot[j] * dim + i]; 48 | if(absmax_aji < abs_aji) 49 | { 50 | max_j = j; 51 | absmax_aji = abs_aji; 52 | } 53 | } 54 | if(max_j != i) 55 | { 56 | tmp_index = pivot[max_j]; 57 | pivot[max_j] = pivot[i]; 58 | pivot[i] = tmp_index; 59 | } 60 | 61 | // select pivoted column 62 | row_index_i = pivot[i] * dim; 63 | pivot_aii = mat_a[row_index_i + i]; 64 | 65 | // error 66 | if(fabs(pivot_aii) <= 0.0) 67 | return -1; 68 | 69 | for(j = i + 1; j < dim; j++) 70 | { 71 | row_index_j = pivot[j] * dim; 72 | mat_a[row_index_j + i] /= pivot_aii; 73 | 74 | for(k = i + 1; k < dim; k++) 75 | mat_a[row_index_j + k] -= mat_a[row_index_j + i] * mat_a[row_index_i + k]; 76 | } 77 | } 78 | 79 | // forward substitution 80 | for(j = 0; j < dim; j++) 81 | { 82 | vec_x = vec_b[pivot[j]]; 83 | for(i = j + 1; i < dim; i++) 84 | vec_b[pivot[i]] -= mat_a[pivot[i] * dim + j] * vec_x; 85 | } 86 | 87 | // backward substitution 88 | for(i = dim - 1; i >= 0; i--) 89 | { 90 | vec_x = vec_b[pivot[i]]; 91 | row_index_i = pivot[i] * dim; 92 | for(j = i + 1; j < dim; j++) 93 | vec_x -= mat_a[row_index_i + j] * vec_b[pivot[j]]; 94 | 95 | vec_b[pivot[i]] = vec_x / mat_a[row_index_i + i]; 96 | } 97 | 98 | // reordering 99 | for(i = 0; i < dim; i++) 100 | { 101 | if(pivot[i] != i) 102 | { 103 | for(j = i + 1; j < dim; j++) 104 | { 105 | if(pivot[j] == i) 106 | { 107 | vec_x = vec_b[pivot[i]]; 108 | vec_b[pivot[i]] = vec_b[i]; 109 | vec_b[i] = vec_x; 110 | pivot[j] = pivot[i]; 111 | pivot[i] = i; 112 | } 113 | } 114 | } 115 | } 116 | 117 | return 0; 118 | } 119 | 120 | int main() 121 | { 122 | int i, j, dim; 123 | int *pivot, info; 124 | 125 | double *mat_a, *vec_b, *vec_x; 126 | double alpha, beta; 127 | double running_time; 128 | 129 | // input dimension of linear equation to be solved 130 | printf("Dim = "); scanf("%d", &dim); 131 | 132 | if(dim <= 0) 133 | { 134 | printf("Illegal dimension! (dim = %d)\n", dim); 135 | return EXIT_FAILURE; 136 | } 137 | 138 | // initialize a matrix and vectors 139 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 140 | vec_x = (double *)calloc(dim, sizeof(double)); 141 | vec_b = (double *)calloc(dim, sizeof(double)); 142 | 143 | // input mat_a and vec_x 144 | for(i = 0; i < dim; i++) 145 | { 146 | for(j = 0; j < dim; j++) 147 | { 148 | mat_a[i * dim + j] = 1.0 / (double)(i + j + 1); 149 | if((i + j + 1) % 2 != 0) 150 | mat_a[i * dim + j] = -mat_a[i * dim + j]; 151 | } 152 | mat_a[i * dim + i] += 2.0; 153 | vec_x[i] = 1.0 / (double)(i + 1); 154 | } 155 | 156 | // vec_b := mat_a * vec_x 157 | my_matvec_mul(vec_b, mat_a, dim, dim, vec_x); 158 | 159 | // print 160 | for(i = 0; i < dim; i++) 161 | { 162 | printf("["); 163 | for(j = 0; j < dim; j++) 164 | printf("%10.3f ", mat_a[i * dim + j]); 165 | printf("] %10.3f = %10.3f\n", vec_x[i], vec_b[i]); 166 | } 167 | 168 | // initialize pivot vector 169 | pivot = (int *)calloc(sizeof(int), dim); 170 | 171 | // solve A * x = b -> b := x 172 | info = my_linear_eq_solve(mat_a, dim, pivot, vec_b); 173 | 174 | printf("info = %d\n", info); 175 | 176 | // print 177 | printf("calculated x = \n"); 178 | for(i = 0; i < dim; i++) 179 | { 180 | printf("%3d -> %3d: ", i, pivot[i]); 181 | printf("%25.17e ", vec_b[pivot[i]]); 182 | printf("\n"); 183 | } 184 | 185 | // diff 186 | printf("x - calculated x = \n"); 187 | for(i = 0; i < dim; i++) 188 | { 189 | printf("%3d: ", i); 190 | printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i])); 191 | printf("\n"); 192 | } 193 | 194 | // free 195 | free(mat_a); 196 | free(vec_x); 197 | free(vec_b); 198 | free(pivot); 199 | 200 | return EXIT_SUCCESS; 201 | } 202 | -------------------------------------------------------------------------------- /my_linear_eq_omp.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/my_linear_eq_omp.c -------------------------------------------------------------------------------- /my_matvec_mul.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Multiplication of matrix and vector */ 4 | /* with OpenMP */ 5 | /* Last Update: 2016-11-30 (Wed) T.Kouya */ 6 | /*************************************************/ 7 | #include 8 | #include 9 | #include 10 | 11 | // rowwise only 12 | // my_matvec_mul: vec_b := mat_a * vec_x 13 | #ifdef _OPENMP 14 | void my_matvec_mul_omp(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x) 15 | { 16 | int i, j, row_index; 17 | 18 | // parallelized main loop 19 | #pragma omp parallel for private(j) 20 | for(i = 0; i < row_dim; i++) 21 | { 22 | //printf("Thread No. %d:\n", omp_get_thread_num()); 23 | vec_b[i] = 0.0; 24 | row_index = row_dim * i; 25 | 26 | for(j = 0; j < col_dim; j++) 27 | vec_b[i] += mat_a[row_index + j] * vec_x[j]; 28 | } 29 | } 30 | #endif // _OPENMP 31 | 32 | // rowwise only 33 | // my_matvec_mul: vec_b := mat_a * vec_x 34 | void my_matvec_mul(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x) 35 | { 36 | int i, j, row_index; 37 | 38 | // serial main loop 39 | for(i = 0; i < row_dim; i++) 40 | { 41 | vec_b[i] = 0.0; 42 | row_index = row_dim * i; 43 | 44 | for(j = 0; j < col_dim; j++) 45 | vec_b[i] += mat_a[row_index + j] * vec_x[j]; 46 | } 47 | } 48 | 49 | int main() 50 | { 51 | int i, j, dim; 52 | 53 | double *mat_a, *vec_b, *vec_x; 54 | 55 | // input dimension of square matrix and vector 56 | printf("Dim = "); scanf("%d", &dim); 57 | 58 | // input maximum number of threads 59 | #ifdef _OPENMP 60 | int num_threads; 61 | 62 | printf("Max.Num.threads = "); scanf("%d", &num_threads); 63 | omp_set_num_threads(num_threads); 64 | printf("#threads = %d\n", omp_get_max_threads()); 65 | #endif // _OPENMP 66 | 67 | if(dim <= 0) 68 | { 69 | printf("Illegal dimension! (dim = %d)\n", dim); 70 | return EXIT_FAILURE; 71 | } 72 | 73 | // initialize a matrix and vectors 74 | mat_a = (double *)calloc(dim * dim, sizeof(double)); 75 | vec_x = (double *)calloc(dim, sizeof(double)); 76 | vec_b = (double *)calloc(dim, sizeof(double)); 77 | 78 | // input mat_a and vec_x 79 | for(i = 0; i < dim; i++) 80 | { 81 | for(j = 0; j < dim; j++) 82 | { 83 | mat_a[i * dim + j] = (double)(i + j + 1); 84 | if((i + j + 1) % 2 != 0) 85 | mat_a[i * dim + j] *= -1.0; 86 | } 87 | vec_x[i] = 1.0 / (double)(i + 1); 88 | } 89 | 90 | // vec_b := mat_a * vec_x 91 | #ifdef _OPENMP 92 | my_matvec_mul_omp(vec_b, mat_a, dim, dim, vec_x); 93 | #else 94 | my_matvec_mul(vec_b, mat_a, dim, dim, vec_x); 95 | #endif // _OPENMP 96 | 97 | // print 98 | for(i = 0; i < dim; i++) 99 | { 100 | printf("["); 101 | for(j = 0; j < dim; j++) 102 | printf("%10.3lf ", mat_a[i * dim + j]); 103 | printf("] %10.3lf = %10.3lf\n", vec_x[i], vec_b[i]); 104 | } 105 | 106 | // free 107 | free(mat_a); 108 | free(vec_x); 109 | free(vec_b); 110 | 111 | return EXIT_SUCCESS; 112 | } 113 | -------------------------------------------------------------------------------- /my_matvec_mul_pt.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/my_matvec_mul_pt.c -------------------------------------------------------------------------------- /mycuda.c: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/mycuda.c -------------------------------------------------------------------------------- /mycuda.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/mycuda.h -------------------------------------------------------------------------------- /mycuda_daxpy.cu: -------------------------------------------------------------------------------- 1 | /********************************************************************************/ 2 | /* mycuda_daxpy : Original DAXPY based on CUDA */ 3 | /* Copyright (C) 2015 Tomonori Kouya */ 4 | /* */ 5 | /* This program is free software: you can redistribute it and/or modify it */ 6 | /* under the terms of the GNU Lesser General Public License as published by the */ 7 | /* Free Software Foundation, either version 3 of the License or any later */ 8 | /* version. */ 9 | /* */ 10 | /* This program is distributed in the hope that it will be useful, but WITHOUT */ 11 | /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or */ 12 | /* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License */ 13 | /* for more details. */ 14 | /* */ 15 | /* You should have received a copy of the GNU Lesser General Public License */ 16 | /* along with this program. If not, see . */ 17 | /* */ 18 | /********************************************************************************/ 19 | #include 20 | 21 | #include "cblas.h" 22 | #include "mycuda.h" 23 | #include "tkaux.h" 24 | 25 | // y := alpha * x + y 26 | __global__ void mycuda_daxpy_kernel (int dim, double *ptr_alpha, double x[], int x_step_dim, double y[], int y_step_dim) 27 | { 28 | int k = blockIdx.x * blockDim.x + threadIdx.x; 29 | 30 | int x_index = k * x_step_dim; 31 | int y_index = k * y_step_dim; 32 | 33 | if ((x_index < dim) && (y_index < dim)) 34 | { 35 | y[y_index] = *ptr_alpha * x[x_index] + y[y_index]; 36 | } 37 | } 38 | 39 | // Maximum threads per a block 40 | #define MAX_NUM_THREADS_CUDA 8 41 | 42 | // c := a * b 43 | void mycuda_daxpy(int dim, double *dev_alpha, double dev_x[], int x_step_dim, double dev_y[], int y_step_dim) 44 | { 45 | dim3 threads(MAX_NUM_THREADS_CUDA); 46 | dim3 blocks(1); 47 | 48 | threads.x = (dim > MAX_NUM_THREADS_CUDA) ? MAX_NUM_THREADS_CUDA : dim; 49 | 50 | blocks.x = (dim > MAX_NUM_THREADS_CUDA) ? (dim / MAX_NUM_THREADS_CUDA) + 1 : 1; 51 | 52 | printf("Threads (x): %d\n", threads.x); 53 | printf("Blocks (x): %d\n", blocks.x); 54 | 55 | mycuda_daxpy_kernel<<>>(dim, dev_alpha, dev_x, x_step_dim, dev_y, y_step_dim); 56 | } 57 | 58 | int main() 59 | { 60 | int i, dim; 61 | double host_alpha, *host_x, *host_y; // on CPU 62 | double *dev_alpha, *dev_x, *dev_y; // on GPU 63 | 64 | printf("dim = "); scanf("%d", &dim); 65 | 66 | host_x = (double *)calloc(dim, sizeof(double)); 67 | host_y = (double *)calloc(dim, sizeof(double)); 68 | 69 | // alpha = sqrt(2) 70 | // x[i] = sqrt(2) * i 71 | // y[i] = sqrt(3) * (dim - i) 72 | 73 | host_alpha = sqrt(2.0); 74 | for(i = 0; i < dim; i++) 75 | { 76 | host_x[i] = sqrt(2.0) * (double)(i + 1); 77 | host_y[i] = sqrt(3.0) * (double)(dim - i); 78 | } 79 | 80 | // host to device 81 | dev_alpha = (double *)mycuda_calloc(1, sizeof(double)); 82 | dev_x = (double *)mycuda_calloc(dim, sizeof(double)); 83 | dev_y = (double *)mycuda_calloc(dim, sizeof(double)); 84 | 85 | // host_x -> dev_x 86 | // host_y -> dev_y 87 | cudaMemcpy((void *)dev_alpha, (void *)&host_alpha, sizeof(double), cudaMemcpyHostToDevice); 88 | cudaMemcpy((void *)dev_x, (void *)host_x, dim * sizeof(double), cudaMemcpyHostToDevice); 89 | cudaMemcpy((void *)dev_y, (void *)host_y, dim * sizeof(double), cudaMemcpyHostToDevice); 90 | 91 | // y := alpha * x + y on CPU 92 | cblas_daxpy(dim, host_alpha, host_x, 1, host_y, 1); 93 | 94 | printf_dvector("%d %25.17e\n", host_y, dim, 1); 95 | 96 | // y := alpha * x + y on GPU 97 | mycuda_daxpy(dim, dev_alpha, dev_x, 1, dev_y, 1); 98 | 99 | // dev_y -> host_x 100 | cudaMemcpy((void *)host_x, (void *)dev_y, dim * sizeof(double), cudaMemcpyDeviceToHost); 101 | 102 | printf_dvector("%d %25.17e\n", host_x, dim, 1); 103 | 104 | mycuda_free(dev_x); 105 | mycuda_free(dev_y); 106 | 107 | free(host_x); 108 | free(host_y); 109 | 110 | return 0; 111 | } 112 | -------------------------------------------------------------------------------- /power_eig.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Power Method for max eigenvalue & eigenvetor */ 4 | /* Last Update: 2015-02-20 (Fri) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | #include "cblas.h" 10 | 11 | // relative difference 12 | double reldiff_dvector(double *vec1, double *vec2, int dim) 13 | { 14 | double *tmp_vec; 15 | double ret, norm; 16 | 17 | tmp_vec = (double *)calloc(dim, sizeof(double)); 18 | 19 | cblas_dcopy(dim, vec1, 1, tmp_vec, 1); 20 | cblas_daxpy(dim, -1.0, vec2, 1, tmp_vec, 1); 21 | ret = cblas_dnrm2(dim, tmp_vec, 1); 22 | norm = cblas_dnrm2(dim, vec1, 1); 23 | 24 | if(norm != 0.0) 25 | ret /= norm; 26 | 27 | return ret; 28 | } 29 | 30 | int main() 31 | { 32 | int i, j, dim, itimes; 33 | double *ma, *md, *vy, *vx, *vx_true, *vx_new, *vb, eig, eig_old, y_norm; 34 | double reps, aeps; 35 | 36 | // input dimension 37 | printf("Dim = "); scanf("%d", &dim); 38 | 39 | if(dim <= 0) 40 | { 41 | printf("Illegal dimension! (dim = %d)\n", dim); 42 | return EXIT_FAILURE; 43 | } 44 | 45 | // Initialize 46 | ma = (double *)calloc(dim * dim, sizeof(double)); 47 | vy = (double *)calloc(dim, sizeof(double)); 48 | vx = (double *)calloc(dim, sizeof(double)); 49 | 50 | // input ma 51 | for(i = 0; i < dim; i++) 52 | { 53 | for(j = 0; j < dim; j++) 54 | ma[i * dim + j] = sqrt(2.0) * (double)(i + j + 1); 55 | ma[i * dim + i] = sqrt(2.0) * dim * dim; 56 | } 57 | 58 | reps = 1.0e-10; 59 | aeps = 0.0; 60 | 61 | 62 | // vx := [1 1 ... 1] / sqrt(dim) 63 | for(i = 0; i < dim; i++) 64 | vx[i] = 1.0 / sqrt((double)dim); 65 | 66 | eig_old = eig = 0.0; 67 | 68 | // Power Method 69 | for(itimes = 0; itimes < dim * 10; itimes++) 70 | { 71 | // y := A * x 72 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, ma, dim, vx, 1, 0.0, vy, 1); 73 | //for(i = 0; i < dim; i++) 74 | // printf("%3d %15.7e %15.7e\n", i, vx[i], vy[i]); 75 | //printf("\n"); 76 | 77 | // eig = (A * x, x) / (x, x) = (A * x, x) 78 | eig = cblas_ddot(dim, vy, 1, vx, 1); 79 | 80 | // |eig_old - eig | <= reps * |eig| + aeps 81 | if(fabs(eig_old - eig) <= reps * fabs(eig) + aeps) 82 | break; 83 | 84 | printf("%3d %25.17e\n", itimes, eig); 85 | 86 | eig_old = eig; 87 | 88 | // x := y / ||y|| 89 | y_norm = 1.0 / cblas_dnrm2(dim, vy, 1); 90 | cblas_dscal(dim, y_norm, vy, 1); 91 | cblas_dcopy(dim, vy, 1, vx, 1); 92 | } 93 | 94 | // eig * x == A * x? 95 | cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, ma, dim, vx, 1, 0.0, vy, 1); 96 | cblas_dscal(dim, eig, vx, 1); 97 | 98 | // print 99 | printf("Iterative Times = %d\n", itimes); 100 | printf("Max. Eigenvalue = %25.17e\n", eig); 101 | printf("Rel.Diff = %10.3e\n", reldiff_dvector(vx, vy, dim)); 102 | 103 | for(i = 0; i < dim; i++) 104 | { 105 | printf("%3d %25.17e %25.17e\n", i, eig * vx[i], vy[i]); 106 | } 107 | 108 | // free 109 | free(ma); 110 | free(vy); 111 | free(vx); 112 | 113 | 114 | return EXIT_SUCCESS; 115 | } 116 | -------------------------------------------------------------------------------- /readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/readme.txt -------------------------------------------------------------------------------- /row_column_major.c: -------------------------------------------------------------------------------- 1 | /*************************************************/ 2 | /* LAPACK/BLAS Tutorial */ 3 | /* Row-major and column-major matrices */ 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya */ 5 | /*************************************************/ 6 | #include 7 | #include 8 | #include 9 | 10 | #include "cblas.h" 11 | 12 | int main() 13 | { 14 | int i, j, row_dim, col_dim; 15 | 16 | double *mat_a; 17 | 18 | // input dimension 19 | printf("Row Dim = "); scanf("%d", &row_dim); 20 | printf("Column Dim = "); scanf("%d", &col_dim); 21 | 22 | if((row_dim <= 0) || (col_dim <= 0)) 23 | { 24 | printf("Illegal dimension! (row_dim = %d, col_dim = %d)\n", row_dim, col_dim); 25 | return EXIT_FAILURE; 26 | } 27 | 28 | // initialize matrix area 29 | mat_a = (double *)calloc(row_dim * col_dim, sizeof(double)); 30 | 31 | printf("Row Major: %d x %d\n", row_dim, col_dim); 32 | 33 | // Row Major 34 | // mat_a = A 35 | // A = [1 2 ....... n] 36 | // [n+1 n+2 ...... 2n] 37 | // [.................] 38 | // [(m-1)n+1 ..... mn] 39 | for(i = 0; i < row_dim; i++) 40 | { 41 | for(j = 0; j < col_dim; j++) 42 | mat_a[i * col_dim + j] = (double)(i * col_dim + j + 1); 43 | } 44 | 45 | // print (1) 46 | printf("1 dimensional: \n"); 47 | printf("["); 48 | for(i = 0; i < row_dim * col_dim; i++) 49 | printf(" %6.3lf ", mat_a[i]); 50 | printf("]\n"); 51 | 52 | // print (2) 53 | printf("2 dimensional: \n"); 54 | for(i = 0; i < row_dim; i++) 55 | { 56 | printf("["); 57 | for(j = 0; j < col_dim; j++) 58 | printf(" %6.3lf ", mat_a[i * col_dim + j]); 59 | printf("]\n"); 60 | } 61 | 62 | printf("Column Major: %d x %d\n", row_dim, col_dim); 63 | 64 | // Column Major 65 | // mat_a = A 66 | // A = [1 2 ....... n] 67 | // [n+1 n+2 ...... 2n] 68 | // [.................] 69 | // [(m-1)n+1 ..... mn] 70 | for(j = 0; j < col_dim; j++) 71 | { 72 | for(i = 0; i < row_dim; i++) 73 | mat_a[i + row_dim * j] = (double)(i * col_dim + j + 1); 74 | } 75 | 76 | // print (1) 77 | printf("1 dimension: \n"); 78 | printf("["); 79 | for(i = 0; i < row_dim * col_dim; i++) 80 | printf(" %6.3lf ", mat_a[i]); 81 | printf("]\n"); 82 | 83 | // print (2) 84 | printf("2 dimension: \n"); 85 | for(i = 0; i < row_dim; i++) 86 | { 87 | printf("["); 88 | for(j = 0; j < col_dim; j++) 89 | printf(" %6.3lf ", mat_a[i + row_dim * j]); 90 | printf("]\n"); 91 | } 92 | 93 | // free 94 | free(mat_a); 95 | 96 | return EXIT_SUCCESS; 97 | } 98 | -------------------------------------------------------------------------------- /tkaux.c: -------------------------------------------------------------------------------- 1 | /********************************************************************************/ 2 | /* tkaux.c : Auxiliary Routines for textbook "LAPACK/BLAS Tutorial" */ 3 | /* Copyright (C) 2015 Tomonori Kouya */ 4 | /* */ 5 | /* This program is free software: you can redistribute it and/or modify it */ 6 | /* under the terms of the GNU Lesser General Public License as published by the */ 7 | /* Free Software Foundation, either version 3 of the License or any later */ 8 | /* version. */ 9 | /* */ 10 | /* This program is distributed in the hope that it will be useful, but WITHOUT */ 11 | /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or */ 12 | /* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License */ 13 | /* for more details. */ 14 | /* */ 15 | /* You should have received a copy of the GNU Lesser General Public License */ 16 | /* along with this program. If not, see . */ 17 | /* */ 18 | /********************************************************************************/ 19 | #include 20 | #include 21 | #include 22 | 23 | #include "tkaux.h" 24 | 25 | // printf_dvector -> printf(format, index, dvec[index]) 26 | void printf_dvector(const char *format, double *dvec, int dim_dvec, int inc_dvec) 27 | { 28 | int index; 29 | 30 | for(index = 0; index < dim_dvec; index += inc_dvec) 31 | printf(format, index, dvec[index]); 32 | } 33 | 34 | // printf_dvecto2r -> printf(format, index, dvec1[index], dvec2[index]) 35 | void printf_dvector2(const char *format, double *dvec1, double *dvec2, int dim_dvec, int inc_dvec) 36 | { 37 | int index; 38 | 39 | for(index = 0; index < dim_dvec; index += inc_dvec) 40 | printf(format, index, dvec1[index], dvec2[index]); 41 | } 42 | 43 | // zero clear 44 | void set0_dvector(double *dvec, int dim_dvec, int inc_dvec) 45 | { 46 | int i; 47 | 48 | for(i = 0; i < dim_dvec; i += inc_dvec) 49 | dvec[i] = 0.0; 50 | } 51 | 52 | // zero clear 53 | void set0_dmatrix(double *dmat, int row_dim, int col_dim) 54 | { 55 | int index; 56 | 57 | for(index = 0; index < row_dim * col_dim; index++) 58 | dmat[index] = 0.0; 59 | } 60 | 61 | // zero clear 62 | void set0_zmatrix(double complex *zmat, int row_dim, int col_dim) 63 | { 64 | int index; 65 | 66 | for(index = 0; index < row_dim * col_dim; index++) 67 | zmat[index] = 0.0 + 0.0 * I; 68 | } 69 | 70 | // Column Major <- Row major 71 | // colm_mat <- rowm_mat 72 | void row2col_dmatrix(double *colm_mat, int colm_row_dim, int colm_col_dim, double *rowm_mat) 73 | { 74 | int i, j; 75 | 76 | for(j = 0; j < colm_col_dim; j++) 77 | { 78 | for(i = 0; i < colm_row_dim; i++) 79 | colm_mat[i + j * colm_row_dim] = rowm_mat[i * colm_col_dim + j]; 80 | //printf("%d <= %d\n", i + j * colm_row_dim, i * colm_col_dim + j); 81 | } 82 | } 83 | 84 | // Row Major <- Column major 85 | // colm_mat <- rowm_mat 86 | void col2row_dmatrix(double *rowm_mat, int rowm_row_dim, int rowm_col_dim, double *colm_mat) 87 | { 88 | int i, j; 89 | 90 | for(i = 0; i < rowm_row_dim; i++) 91 | { 92 | for(j = 0; j < rowm_col_dim; j++) 93 | rowm_mat[i * rowm_col_dim + j] = colm_mat[i + j * rowm_row_dim]; 94 | // printf("%d <= %d\n", i * rowm_col_dim + j, i + rowm_row_dim * j); 95 | } 96 | } 97 | 98 | // norm elative error 99 | // || dvec_err - dvec_true ||_2 / || dvec_true ||_2 100 | double dreldiff_dvector(double *dvec_err, int dim_dvec, int inc_dvec_err, double *dvec_true, int inc_dvec_true) 101 | { 102 | double reldiff, norm_dvec_true, *diff_vec; 103 | 104 | diff_vec = (double *)calloc(dim_dvec, sizeof(double)); 105 | 106 | // diff_vec := -dvec_err + dvec_true 107 | cblas_dcopy(dim_dvec, dvec_err, 1, diff_vec, 1); 108 | cblas_daxpy(dim_dvec, -1.0, dvec_true, inc_dvec_true, diff_vec, 1); 109 | reldiff = cblas_dnrm2(dim_dvec, diff_vec, 1); 110 | norm_dvec_true = cblas_dnrm2(dim_dvec, dvec_true, 1); 111 | if(norm_dvec_true > 0.0) 112 | reldiff /= norm_dvec_true; 113 | 114 | free(diff_vec); 115 | 116 | return reldiff; 117 | } 118 | -------------------------------------------------------------------------------- /tkaux.h: -------------------------------------------------------------------------------- 1 | /********************************************************************************/ 2 | /* tkaux.h : Auxiliary Routines for textbook "LAPACK/BLAS Tutorial" */ 3 | /* Copyright (C) 2015 Tomonori Kouya */ 4 | /* */ 5 | /* This program is free software: you can redistribute it and/or modify it */ 6 | /* under the terms of the GNU Lesser General Public License as published by the */ 7 | /* Free Software Foundation, either version 3 of the License or any later */ 8 | /* version. */ 9 | /* */ 10 | /* This program is distributed in the hope that it will be useful, but WITHOUT */ 11 | /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or */ 12 | /* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License */ 13 | /* for more details. */ 14 | /* */ 15 | /* You should have received a copy of the GNU Lesser General Public License */ 16 | /* along with this program. If not, see . */ 17 | /* */ 18 | /********************************************************************************/ 19 | #include 20 | #include 21 | #include 22 | 23 | #ifndef USE_IMKL 24 | #include "cblas.h" 25 | #endif // USE_IMKL 26 | 27 | #ifndef __TKAUX_H__ 28 | 29 | #define __TKAUX_H__ 30 | 31 | // macros 32 | #define DMAX(a, b) (((a) > (b)) ? (a) : (b)) 33 | #define DMIN(a, b) (((a) < (b)) ? (a) : (b)) 34 | 35 | #ifdef __cplusplus 36 | extern "C" { 37 | #endif 38 | 39 | // printf_dvector(format, dvec, dim, interval -> printf(format, index, dvec[index + interval]) 40 | void printf_dvector(const char *, double *, int, int); 41 | 42 | // printf_dvecto2r -> printf(format, index, dvec1[index], dvec2[index]) 43 | void printf_dvector2(const char *, double *, double *, int, int); 44 | 45 | // zero clear 46 | void set0_dvector(double *, int, int); 47 | 48 | // zero clear 49 | void set0_dmatrix(double *, int, int); 50 | 51 | // zero clear 52 | void set0_zmatrix(double complex *, int, int); 53 | 54 | // Column Major <- Row major 55 | // colm_mat <- rowm_mat 56 | void row2col_dmatrix(double *, int, int, double *); 57 | 58 | // Row Major <- Column major 59 | // colm_mat <- rowm_mat 60 | void col2row_dmatrix(double *, int, int, double *); 61 | 62 | // norm relative error (2-norm) 63 | // || dvec_err - dvec_true ||_2 / || dvec_true ||_2 64 | double dreldiff_dvector(double *, int, int, double *, int); 65 | 66 | #ifdef __cplusplus 67 | } 68 | #endif 69 | 70 | #endif // __TKAUX_H__ 71 | --------------------------------------------------------------------------------