├── LICENSE
├── Makefile.unix
├── Makefile.win_intel
├── README.ja.md
├── README.md
├── bicgstab_csr_cusparse.c
├── bicgstab_csr_magma.c
├── bicgstab_csr_mkl.c
├── bicgstab_mkl.c
├── blas1.c
├── blas1_norm.c
├── blas2.c
├── blas3.c
├── complex_first.c
├── complex_first_cpp.cc
├── complex_matvec_mul.c
├── complex_row_column_major.c
├── cublas_dgemv.c
├── dgemv_bench_mt.c
├── first.c
├── get_sec.c
├── get_secv.h
├── integral_eq
    ├── Makefile.unix
    ├── Makefile.win_intel
    ├── ex1.c
    ├── gauss_integral.c
    ├── gauss_integral.h
    ├── iteration.c
    ├── prob1.c
    └── prob2.c
├── invpower_eig.c
├── jacobi_iteration.c
├── jacobi_iteration_csr_mkl.c
├── jacobi_iteration_mkl.c
├── jacobi_iteration_spblas.c
├── lapack_complex_row_column_major.c
├── lapack_complex_row_column_major_cpp.cc
├── lapack_dgecon.c
├── lapack_dgeev.c
├── lapack_dgeev_magma.c
├── lapack_dsyev.c
├── lapack_gcc.inc
├── lapack_icc.inc
├── lapack_icx.inc
├── lapack_lamch.c
├── lapack_ssyev.c
├── lapack_win_intel.inc
├── linear_eq.c
├── linear_eq_dgbsv.c
├── linear_eq_dgetrf.c
├── linear_eq_dsgesv.c
├── linear_eq_dsposv.c
├── linear_eq_dsysv.c
├── linear_eq_magma.c
├── matvec_mul.c
├── matvec_mul_cublas.c
├── matvec_mul_magma.c
├── matvec_mul_magma_pure.c
├── mm
    ├── diagtest.mtx
    ├── matrix_market_io.c
    ├── matrix_market_io.h
    ├── sptest.mtx
    ├── sptest_array.mtx
    ├── sptest_big.mtx
    └── sptest_pattern.mtx
├── my_linear_eq.c
├── my_linear_eq_omp.c
├── my_matvec_mul.c
├── my_matvec_mul_pt.c
├── mycuda.c
├── mycuda.h
├── mycuda_daxpy.cu
├── power_eig.c
├── readme.txt
├── row_column_major.c
├── tkaux.c
└── tkaux.h


/Makefile.unix:
--------------------------------------------------------------------------------
  1 | #*************************************************#
  2 | # LAPACK/BLAS Tutorial                            #
  3 | # Makefile for Linux gcc or icc environment       #
  4 | # Last Update: 2016-12-02 (Fri) T.Kouya           #
  5 | #*************************************************#
  6 | # ------------------------ start of configuration --------------------------
  7 | # Intel C compiler (Old)
  8 | #include lapack_icc.inc
  9 | 
 10 | # Intel OneAPI (New)
 11 | include lapack_icx.inc
 12 | 
 13 | # GNU Compiler Collection
 14 | #include lapack_gcc.inc
 15 | 
 16 | # CUDA environment
 17 | #NVCC=nvcc
 18 | NVCC=nvcc -arch=sm_30
 19 | CUDA_INC = -I/usr/local/cuda/include
 20 | #CUDA_LIB = -L/usr/local/cuda/lib64 -lcublas -lcudart -lcusparse
 21 | CUDA_LIB = -L/usr/local/cuda/lib64 -lcusparse -lcublas -lcudart -lpthread -ldl
 22 | 
 23 | # MAGMA 1.5.0 to 1.6.2
 24 | MAGMA_INC = -I/usr/local/magma/include -I/usr/local/magma/include/sparse-iter/include -O3 -fPIC -DADD_ -Wall -openmp -DMAGMA_SETAFFINITY -DMAGMA_WITH_MKL
 25 | MAGMA_LIB = -L/usr/local/magma/lib -lmagma_sparse -lmagma -lstdc++ -lgomp
 26 | 
 27 | # ------------------------ end of configuration --------------------------
 28 | 
 29 | all: first matvec_mul linear_eq blas row_column_major eig dgecon openmp pthread
 30 | #all: first matvec_mul linear_eq blas row_column_major gpu eig dgecon openmp pthread spblas_mkl
 31 | 
 32 | get_sec: get_secv.h get_sec.c
 33 | 	$(CC) -c get_sec.c -o get_sec.o
 34 | 
 35 | tkaux: tkaux.h tkaux.c
 36 | 	$(CC) -c tkaux.c -o tkaux.o
 37 | 
 38 | first: first.c complex_first.c complex_first_cpp.cc
 39 | 	$(CC) first.c -o first $(LIB)
 40 | 	$(CC) complex_first.c -o complex_first $(LIB)
 41 | 	$(CPP) complex_first_cpp.cc -o complex_first_cpp $(LIB)
 42 | 
 43 | matvec_mul: matvec_mul.c my_matvec_mul.c
 44 | 	$(CC) $(LAPACKE_INC) matvec_mul.c -o matvec_mul $(LAPACKE_LIB)
 45 | 	$(CC) my_matvec_mul.c -o my_matvec_mul
 46 | 	$(CC) $(LAPACKE_INC) complex_matvec_mul.c -o complex_matvec_mul $(LAPACKE_LIB)
 47 | 
 48 | linear_eq: linear_eq.c my_linear_eq.c linear_eq_dgetrf.c linear_eq_dsgesv.c linear_eq_dsposv.c linear_eq_dsysv.c linear_eq_dgbsv.c invpower_eig.c
 49 | 	$(CC) $(LAPACKE_INC) linear_eq.c -o linear_eq $(LAPACKE_LIB)
 50 | 	$(CC) $(LAPACKE_INC) linear_eq_dgetrf.c -o linear_eq_dgetrf $(LAPACKE_LIB)
 51 | 	$(CC) $(LAPACKE_INC) linear_eq_dsgesv.c -o linear_eq_dsgesv $(LAPACKE_LIB)
 52 | 	$(CC) $(LAPACKE_INC) linear_eq_dsposv.c -o linear_eq_dsposv $(LAPACKE_LIB)
 53 | 	$(CC) $(LAPACKE_INC) linear_eq_dsysv.c -o linear_eq_dsysv $(LAPACKE_LIB)
 54 | 	$(CC) $(LAPACKE_INC) linear_eq_dgbsv.c -o linear_eq_dgbsv $(LAPACKE_LIB)
 55 | 	$(CC) my_linear_eq.c -o my_linear_eq
 56 | 	$(CC) $(LAPACKE_INC) invpower_eig.c -o invpower_eig $(LAPACKE_LIB)
 57 | 
 58 | blas: blas1.c blas2.c blas3.c blas1_norm.c jacobi_iteration.c power_eig.c
 59 | 	$(CC) $(CBLAS_INC) blas1.c -o blas1 $(CBLAS_LIB)
 60 | 	$(CC) $(CBLAS_INC) blas1_norm.c -o blas1_norm $(CBLAS_LIB)
 61 | 	$(CC) $(CBLAS_INC) blas2.c -o blas2 $(CBLAS_LIB)
 62 | 	$(CC) $(CBLAS_INC) blas3.c -o blas3 $(CBLAS_LIB)
 63 | 	$(CC) $(CBLAS_INC) jacobi_iteration.c -o jacobi_iteration $(CBLAS_LIB)
 64 | 	$(CC) $(CBLAS_INC) power_eig.c -o power_eig $(CBLAS_LIB)
 65 | 
 66 | mycuda: mycuda.h mycuda.c
 67 | 	$(CC) $(CUDA_INC) -c mycuda.c -o mycuda.o
 68 | 
 69 | gpu: matvec_mul_cublas.c matvec_mul_magma.c linear_eq_magma.c lapack_dgeev_magma.c bicgstab_csr_cusparse.c mycuda get_sec tkaux
 70 | 	$(NVCC) $(CBLAS_INC) -c mycuda_daxpy.cu
 71 | 	$(CC) mycuda_daxpy.o mycuda.o tkaux.o -o mycuda_daxpy $(CBLAS_LIB) $(CUDA_LIB)
 72 | 	$(CC) $(LAPACKE_INC) $(CUDA_INC) matvec_mul_cublas.c -o matvec_mul_cublas mycuda.o $(LAPACKE_LIB) $(CUDA_LIB)
 73 | 	$(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) matvec_mul_magma.c -o matvec_mul_magma mycuda.o $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB)
 74 | 	$(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) matvec_mul_magma_pure.c -o matvec_mul_magma_pure $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB)
 75 | 	$(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) linear_eq_magma.c -o linear_eq_magma $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB)
 76 | 	$(CC) $(LAPACKE_INC) $(CUDA_INC) $(MAGMA_INC) -DUSE_MAGMA lapack_dgeev_magma.c tkaux.o -o lapack_dgeev_magma $(LAPACKE_LIB) $(MAGMA_LIB) $(CUDA_LIB) $(IMKL_LIB)
 77 | 	$(CC) $(CUDA_INC) $(CBLAS_INC) bicgstab_csr_cusparse.c mm/matrix_market_io.c tkaux.o get_sec.o mycuda.o -o bicgstab_csr_cusparse $(CUDA_LIB) $(CBLAS_LIB)
 78 | 
 79 | row_column_major: row_column_major.c complex_row_column_major.c lapack_complex_row_column_major.c
 80 | 	$(CC) $(INC) row_column_major.c -o row_column_major $(LAPACKE_LIB)
 81 | 	$(CC) $(INC) complex_row_column_major.c -o complex_row_column_major $(LAPACKE_LIB)
 82 | 	$(CC) $(INC) $(LAPACKE_INC) lapack_complex_row_column_major.c -o lapack_complex_row_column_major $(LAPACKE_LIB)
 83 | 
 84 | dgecon: lapack_dgecon.c lapack_lamch.c
 85 | 	$(CC) $(INC) $(LAPACKE_INC) lapack_dgecon.c -o lapack_dgecon $(LAPACKE_LIB) $(CBLAS_LIB)
 86 | 	$(CC) $(INC) $(LAPACKE_INC) lapack_lamch.c -o lapack_lamch $(LAPACKE_LIB) $(CBLAS_LIB)
 87 | 
 88 | eig: lapack_dgeev.c lapack_dsyev.c lapack_ssyev.c tkaux
 89 | 	$(CC) $(LAPACKE_INC) lapack_dgeev.c tkaux.o -o lapack_dgeev $(LAPACKE_LIB)
 90 | 	$(CC) $(LAPACKE_INC) lapack_dsyev.c -o lapack_dsyev $(LAPACKE_LIB)
 91 | 	$(CC) $(LAPACKE_INC) lapack_ssyev.c -o lapack_ssyev $(LAPACKE_LIB)
 92 | 
 93 | # need IMKL!
 94 | spblas_mkl: jacobi_iteration_mkl.c jacobi_iteration_csr_mkl.c bicgstab_mkl.c bicgstab_csr_mkl.c tkaux
 95 | 	$(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_mkl.c mm/matrix_market_io.c -o jacobi_iteration_mkl tkaux.o $(IMKL_LIB)
 96 | 	$(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_csr_mkl.c mm/matrix_market_io.c -o jacobi_iteration_csr_mkl tkaux.o $(IMKL_LIB)
 97 | 	$(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_mkl.c mm/matrix_market_io.c -o bicgstab_mkl tkaux.o $(IMKL_LIB)
 98 | 	$(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_csr_mkl.c mm/matrix_market_io.c -o bicgstab_csr_mkl tkaux.o $(IMKL_LIB)
 99 | 
100 | openmp: my_matvec_mul.c my_linear_eq_omp.c
101 | 	$(CC) $(OPENMP) $(CBLAS_INC) my_matvec_mul.c -o my_matvec_mul_omp $(CBLAS_LIB)
102 | 	$(CC) $(OPENMP) $(CBLAS_INC) my_linear_eq_omp.c -o my_linear_eq_omp $(CBLAS_LIB)
103 | 
104 | pthread: my_matvec_mul_pt.c
105 | 	$(CC) $(CBLAS_INC) my_matvec_mul_pt.c -o my_matvec_mul_pt $(CBLAS_LIB) -lpthread
106 | 
107 | clean:
108 | 	-rm *.o
109 | 	-rm first
110 | 	-rm complex_first
111 | 	-rm complex_first_cpp
112 | 	-rm matvec_mul
113 | 	-rm my_matvec_mul
114 | 	-rm complex_matvec_mul
115 | 	-rm linear_eq
116 | 	-rm linear_eq_dgetrf
117 | 	-rm linear_eq_dgbsv
118 | 	-rm linear_eq_dsysv
119 | 	-rm linear_eq_dsgesv
120 | 	-rm linear_eq_dsposv
121 | 	-rm my_linear_eq
122 | 	-rm invpower_eig
123 | 	-rm blas1
124 | 	-rm blas1_norm
125 | 	-rm blas2
126 | 	-rm blas3
127 | 	-rm jacobi_iteration
128 | 	-rm power_eig
129 | 	-rm mycuda_daxpy
130 | 	-rm matvec_mul_cublas
131 | 	-rm matvec_mul_magma
132 | 	-rm matvec_mul_magma_pure
133 | 	-rm linear_eq_magma
134 | 	-rm lapack_dgeev_magma
135 | 	-rm bicgstab_csr_cusparse
136 | 	-rm row_column_major
137 | 	-rm complex_row_column_major
138 | 	-rm lapack_complex_row_column_major
139 | 	-rm lapack_dgecon
140 | 	-rm lapack_lamch
141 | 	-rm lapack_dgeev
142 | 	-rm lapack_dsyev
143 | 	-rm lapack_ssyev
144 | 	-rm jacobi_iteration_mkl
145 | 	-rm jacobi_iteration_csr_mkl
146 | 	-rm bicgstab_mkl
147 | 	-rm bicgstab_csr_mkl
148 | 	-rm my_matvec_mul_omp
149 | 	-rm my_linear_eq_omp
150 | 	-rm my_matvec_mul_pt
151 | 
152 | tarzip:
153 | 	-rm ../lapack-dist.tar.gz
154 | 	(cd ../; tar zcvf lapack-dist.tar.gz lapack-dist/*.txt lapack-dist/*.h lapack-dist/*.c lapack-dist/*.cc lapack-dist/Makefile lapack-dist/*.cu lapack-dist/*.inc lapack-dist/mm/*.h lapack-dist/mm/*.c lapack-dist/mm/*.mtx lapack-dist/integral_eq/*.c lapack-dist/integral_eq/Makefile lapack-dist/integral_eq/*.h; cd lapack-dist;)
155 | 
156 | zip:
157 | 	-rm ../lapack-dist.zip
158 | 	(cd ../; zip -ll -v lapack-dist.zip lapack-dist/*.txt lapack-dist/*.h lapack-dist/*.c lapack-dist/*.cc lapack-dist/Makefile.* lapack-dist/*.cu lapack-dist/*.inc lapack-dist/mm/*.h lapack-dist/mm/*.c lapack-dist/mm/*.mtx lapack-dist/integral_eq/*.c lapack-dist/integral_eq/Makefile.* lapack-dist/integral_eq/*.h; cd ./lapack-dist)
159 | 
160 | 


--------------------------------------------------------------------------------
/Makefile.win_intel:
--------------------------------------------------------------------------------
 1 | #*************************************************#
 2 | # LAPACK/BLAS Tutorial                            #
 3 | # Makefile for Intel C compiler on Windows        #
 4 | # Last Update: 2016-12-02 (Fri) T.Kouya           #
 5 | #*************************************************#
 6 | # ------------------------ start of configuration --------------------------
 7 | include ./lapack_win_intel.inc
 8 | INC = /I.\windows
 9 | # ------------------------ end of configuration --------------------------
10 | 
11 | all: first matvec_mul linear_eq blas row_column_major eig spblas dgecon openmp
12 | 
13 | get_sec: get_secv.h get_sec.c
14 | 	$(CC) -c get_sec.c
15 | 
16 | first: first.c complex_first.c complex_first_cpp.cc
17 | 	$(CC) first.c
18 | 	$(CC) complex_first.c
19 | 	$(CPP) complex_first_cpp.cc
20 | 
21 | matvec_mul: matvec_mul.c my_matvec_mul.c
22 | 	$(CC) $(LAPACKE_INC) matvec_mul.c -o matvec_mul $(LAPACKE_LIB)
23 | 	$(CC) my_matvec_mul.c -o my_matvec_mul
24 | 	$(CC) $(LAPACKE_INC) complex_matvec_mul.c -o complex_matvec_mul $(LAPACKE_LIB)
25 | 
26 | linear_eq: linear_eq.c my_linear_eq.c linear_eq_dgetrf.c linear_eq_dsgesv.c linear_eq_dsposv.c linear_eq_dsysv.c linear_eq_dgbsv.c invpower_eig.c
27 | 	$(CC) $(LAPACKE_INC) linear_eq.c $(LAPACKE_LIB)
28 | 	$(CC) $(LAPACKE_INC) linear_eq_dgetrf.c $(LAPACKE_LIB)
29 | 	$(CC) $(LAPACKE_INC) linear_eq_dsgesv.c $(LAPACKE_LIB)
30 | 	$(CC) $(LAPACKE_INC) linear_eq_dsposv.c $(LAPACKE_LIB)
31 | 	$(CC) $(LAPACKE_INC) linear_eq_dsysv.c $(LAPACKE_LIB)
32 | 	$(CC) $(LAPACKE_INC) linear_eq_dgbsv.c $(LAPACKE_LIB)
33 | 	$(CC) my_linear_eq.c 
34 | 	$(CC) $(LAPACKE_INC) invpower_eig.c $(LAPACKE_LIB)
35 | 
36 | blas: blas1.c blas2.c blas3.c blas1_norm.c jacobi_iteration.c power_eig.c
37 | 	$(CC) $(CBLAS_INC) blas1.c $(CBLAS_LIB)
38 | 	$(CC) $(CBLAS_INC) blas1_norm.c $(CBLAS_LIB)
39 | 	$(CC) $(CBLAS_INC) blas2.c $(CBLAS_LIB)
40 | 	$(CC) $(CBLAS_INC) blas3.c $(CBLAS_LIB)
41 | 	$(CC) $(CBLAS_INC) jacobi_iteration.c $(CBLAS_LIB)
42 | 	$(CC) $(CBLAS_INC) power_eig.c $(CBLAS_LIB)
43 | 
44 | row_column_major: row_column_major.c complex_row_column_major.c lapack_complex_row_column_major.c
45 | 	$(CC) $(INC) row_column_major.c -o row_column_major $(LAPACKE_LIB)
46 | 	$(CC) $(INC) complex_row_column_major.c -o complex_row_column_major $(LAPACKE_LIB)
47 | #	$(CC) $(INC) $(LAPACKE_INC) lapack_complex_row_column_major.c -o lapack_complex_row_column_major $(LAPACKE_LIB)
48 | 
49 | dgecon: lapack_dgecon.c lapack_lamch.c
50 | 	$(CC) $(INC) $(LAPACKE_INC) lapack_dgecon.c $(LAPACKE_LIB) $(CBLAS_LIB)
51 | 	$(CC) $(INC) $(LAPACKE_INC) lapack_lamch.c $(LAPACKE_LIB) $(CBLAS_LIB)
52 | 
53 | eig: lapack_dgeev.c lapack_dsyev.c lapack_ssyev.c
54 | 	$(CC) $(LAPACKE_INC) lapack_dgeev.c tkaux.c $(LAPACKE_LIB)
55 | 	$(CC) $(LAPACKE_INC) lapack_dsyev.c $(LAPACKE_LIB)
56 | 	$(CC) $(LAPACKE_INC) lapack_ssyev.c $(LAPACKE_LIB)
57 | 
58 | spblas: jacobi_iteration_mkl.c jacobi_iteration_csr_mkl.c bicgstab_mkl.c bicgstab_csr_mkl.c
59 | 	$(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB)
60 | 	$(CC) $(IMKL_INC) -DUSE_IMKL jacobi_iteration_csr_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB)
61 | 	$(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB)
62 | 	$(CC) $(IMKL_INC) -DUSE_IMKL bicgstab_csr_mkl.c mm/matrix_market_io.c tkaux.c $(IMKL_LIB)
63 | 
64 | openmp: my_matvec_mul.c my_linear_eq_omp.c
65 | 	$(CC) $(OPENMP) $(CBLAS_INC) my_matvec_mul.c -o my_matvec_mul_omp $(CBLAS_LIB)
66 | 	$(CC) $(OPENMP) $(CBLAS_INC) my_linear_eq_omp.c -o my_linear_eq_omp $(CBLAS_LIB)
67 | 
68 | clean:
69 | 	-del *.exe
70 | 	-del *.obj
71 | 
72 | zip:
73 | 	-del ..\lapack-dist.zip
74 | 	cd ../
75 | 	zip -ll -v lapack-dist.zip lapack-dist/*.txt lapack-dist/*.h lapack-dist/*.c lapack-dist/*.cc lapack-dist/Makefile.* lapack-dist/*.cu lapack-dist/*.inc lapack-dist/mm/*.h lapack-dist/mm/*.c lapack-dist/mm/*.mtx lapack-dist/integral_eq/*.c lapack-dist/integral_eq/Makefile.* lapack-dist/integral_eq/*.h lapack-dist/windows/
76 | 	cd ./lapack-dist
77 | 
78 | 


--------------------------------------------------------------------------------
/README.ja.md:
--------------------------------------------------------------------------------
  1 | LAPACK/BLAS 入門：サンプルプログラム
  2 | ============================================================
  3 | 
  4 | 2016-12-02 (Fri) 幸谷 智紀
  5 | ---------------------------------
  6 | 
  7 | 　以下のファイルは「LAPACK/BLAS入門」(森北出版)で解説しているプログラムです。詳細については本文を参照して下さい。
  8 | 
  9 | 　本文及び演習問題で使われているプログラムはLinux環境下で実行を確認したものです。Windows環境下ではCygwinを使うことで同等の環境を整えることができます。Windows環境下での，Visual C++&Intel C++ compilerを使ったBLAS, LAPACK用プログラムのコンパイルも可能です。
 10 | 
 11 | ☆サンプルプログラム一覧
 12 | -----------------------------
 13 | 
 14 | ### 共通  
 15 | >	lapack_gcc.inc ... GCC用設定ファイル (Linux)  
 16 | >	lapack_icc.inc ... Intel C compiler用設定ファイル (Linux)  
 17 | >	lapack_win_intel.inc ... Intel C compiler用設定ファイル (Windows)  
 18 | >	Makefile.unix ... Linux用メイクファイル → GCCかIntel C compilerかを選び，設定ファイルを読み込ませて，"make -f Makefile.unix"で生成  
 19 | >	Makefile.win_intel ... Windows用メイクファイル → Intel C compilerの設定ファイルを読み込ませて，"make -f Makefile.win_intel"で生成  
 20 | >	windows\ ... Windows環境下でCBLAS, LAPACKEのインクルードファイルを置いておくフォルダ  
 21 | 
 22 | ### 第1章
 23 | >	first.c ... 単精度，倍精度基本演算と相対誤差の導出  
 24 | >	complex_first.c ... 複素数演算(C言語用)  
 25 | >	complex_first_cpp.c ... 複素数演算(C++用)  
 26 | 
 27 | ### 第2章
 28 | >	my_matvec_mul.c ... 行列・ベクトル積  
 29 | >	matvec_mul.c ... DGEMVを用いた実行列・ベクトル積  
 30 | >	complex_matvec_mul.c ... ZGEMVを用いた複素行列・ベクトル積  
 31 | >	my_linear_eq.c ... 連立一次方程式の求解  
 32 | >	linear_eq.c ... DGESVを用いた連立一次方程式の求解  
 33 | >	row_column_major.c ... 行優先，列優先行列格納形式  
 34 | >	complex_row_column_major.c ... 複素数行優先，列優先行列格納形式  
 35 | >	lapack_complex_row_column_major.c ... LAPACK関数を用いた複素数行優先，列優先行列格納形式  
 36 | >	lapack_complex_row_column_major.cc ... LAPACK関数を用いた複素数行優先，列優先行列格納形式(C++)  
 37 | 
 38 | ### 第3章
 39 | >	blas1.c ... BLAS1関数サンプル  
 40 | >	blas2.c ... BLAS2関数サンプル  
 41 | >	blas3.c ... BLAS3関数サンプル  
 42 | >	jacobi_iteration.c ... ヤコビ反復法  
 43 | >	power_eig.c ... べき乗法  
 44 | 
 45 | ### 第4章
 46 | >	linear_eq_dgetrf.c ... LU分解，前進代入・後退代入  
 47 | >	linear_eq_dsgesv.c ... 混合精度反復改良法  
 48 | >	linear_eq_dsposv.c ... 実対称行列用の混合精度反復改良法  
 49 | >	lapack_dgecon.c ... 条件数の計算  
 50 | >	lapack_lamch.c ... マシンイプシロン等の導出  
 51 | >	invpower_eig.c ... 逆べき乗法  
 52 | >	lapack_dgeev.c ... 実非対称行列用固有値・固有ベクトル計算  
 53 | >	lapack_dsyev.c ... 実対称行列用固有値・固有ベクトル計算  
 54 | >	lapack_ssyev.c ... 実対称行列用固有値・固有ベクトル計算(単精度)  
 55 | 
 56 | ### 第5章
 57 | >	my_matvec_mul_pt.c ... Pthreadで並列化した行列・ベクトル積計算  
 58 | >	my_matvec_mul_omp.c ... OpenMPで並列化した行列・ベクトル積計算  
 59 | >	my_linear_eq_omp.c ... OpenMPで並列化したLU分解，前進代入・後退代入計算  
 60 | 
 61 | ### 第6章
 62 | >	jacobi_iteration_mkl.c ... COO形式疎行列用のJacobi反復法  
 63 | >	jacobi_iteration_csr_mkl.c... CSR形式疎行列用のJacobi反復法  
 64 | >	bicgstab_mkl.c ... COO形式疎行列用のBiCGSTAB法  
 65 | >	bicgstab_csr_mkl.c ... CSR形式疎行列用のBiCGSTAB法  
 66 | >	mm/matrix_market_io.h  ... MatrixMarketフォーマット用関数定義  
 67 | >	mm/matrix_market_io.c  ... MatrixMarketフォーマット用関数群  
 68 | 
 69 | ### 第7章 (Windows環境下の実行はサポートしていません)
 70 | >	mycuda_daxpy.cu ... CUDAサンプルプログラム  
 71 | >	matvec_mul_cublas.c ... CUBLASを用いた行列・ベクトル積  
 72 | >	matvec_mul_magma.c ... MAGMAとCUBLASを用いた行列・ベクトル積  
 73 | >	matvec_mul_magma_pure.c ... MAGMAだけを用いた行列・ベクトル積  
 74 | >	linear_eq_magma.c ... MAGMAを用いた連立一次方程式の求解  
 75 | >	lapack_dgeev_magma.c ... MAGMAを用いた実非対称行列用固有値・固有ベクトル計算  
 76 | >	bicgstab_csr_cusparse.c ... cuSPARSEを用いたBiCGSTAB法  
 77 | 
 78 | ### 第8章
 79 | >	integral_eq/Makefile.unix ... 積分方程式求解プログラムのコンパイル(Linux)  
 80 | >	integral_eq/Makefile.win_intel ... 積分方程式求解プログラムのコンパイル(Windows)  
 81 | >	integral_eq/gauss_integral.h ... ガウス積分公式導出のためのヘッダファイル  
 82 | >	integral_eq/gauss_integral.c ... ガウス積分公式の導出  
 83 | >	integral_eq/iteration.c ... 割線法とデリバティブフリー解法  
 84 | 
 85 | ☆コンパイル条件
 86 | -----------------------------
 87 | 
 88 | 　本プログラムはLinux, Windowsソフトウェア開発環境下でコンパイル＆実行可能であることを下記の環境で確認しております。
 89 | 
 90 | ・Linux   ... GCC 4.4.7, Intel C/C++/Fortran compiler 13.1.3, Intel Math Kernel Library 11.0.5, LAPACK 3.6.0, MAGMA 1.6.0, CUDA 7.5 on CentOS 6.5 x86_64  
 91 |           ... GCC 4.4.7, Iitel C/C++/Fortran compiler 14.0.2, Intel Math Kernel Library 11.1.2, LAPACK 3.6.0, MAGMA 1.6.1, CUDA 7.0 on CentOS 6.3 x86_64  
 92 | ・Windows ... Intel C++ compiler 16.0.1.146, Intel Math Kernel Library 11.3.2 on Windows 8.1 x64  
 93 | 
 94 | 　Linux, Windows環境下での本プログラムのコンパイル＆実行は，上記のソフトウェア環境が整っているCUIで行って下さい。それ以外の環境下での諸問題については確認ができませんので，お答えすることも不可能です。
 95 | 
 96 | 
 97 | ☆コンパイル方法 ... Linux
 98 | -----------------------------
 99 | 
100 | 0. Intel Math Kernel, CUDA, MAGMA, LAPACKE/CBLASをインストールし，インストール先のディレクトリをMakefile.unix内の適切なマクロ名に設定  
101 | 1. Intel C/C++ Compilerの場合はlapack_icc.incを，GCCの場合はlapack_gcc.incを環境に合わせて修正し，それぞれのコンパイラが適切に動作するよう環境設定を行い，Makefile.unixが読み込むファイルを設定  
102 | 2. make -f ./Makefile.unix でコンパイル  
103 | 3. make -f ./Makefile.unix clean でobjectファイル，実行ファイルが消去される  
104 | 
105 | ☆コンパイル方法 ... Windows
106 | -----------------------------
107 | 
108 | 0. Intel Math Kernelをインストールし，インストール先のディレクトリをMakefile.win_intel内の適切なマクロ名に設定。また，LAPACKEとCBLASのインクルードファイルをwindowsフォルダにコピーしておく（デフォルト設定の場合）。  
109 | 1. Intel C/C++ CompilerとVisual C++が適切に動作するよう環境設定を行う  
110 | 2. nmake -f ./Makefile.win_intel でコンパイル  
111 | 3. nmake -f ./Makefile.win_intel clean でobjectファイル，実行ファイルが消去される  
112 | 
113 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | LAPACK/BLAS Tutorial: List of sample programs
 2 | ============================================================
 3 | 
 4 | 2016-12-02 (Fri) Tomonori Kouya
 5 | ---------------------------------
 6 | 
 7 | The following programs are published for all readers of "LAPACK/BLAS Tutorial" written in JAPANESE.
 8 | You can read "README.ja.md"(Japanese!) if you want to get detailed description about them.
 9 | 
10 | *Common*  
11 | 	lapack_gcc.inc  
12 | 	lapack_icc.inc  
13 | 	lapack_win_intel.inc  
14 | 	Makefile.unix  
15 | 	Makefile.win_intel  
16 | 	windows\  
17 | 
18 | *Chapter 1*  
19 | 	first.c  
20 | 	complex_first.c  
21 | 	complex_first_cpp.c  
22 | 
23 | *Chapter 2*  
24 | 	my_matvec_mul.c  
25 | 	matvec_mul.c  
26 | 	complex_matvec_mul.c  
27 | 	my_linear_eq.c  
28 | 	linear_eq.c  
29 | 	row_column_major.c  
30 | 	complex_row_column_major.c  
31 | 	lapack_complex_row_column_major.c  
32 | 	lapack_complex_row_column_major.cc  
33 | 
34 | *Chapter 3*  
35 | 	blas1.c  
36 | 	blas2.c  
37 | 	blas3.c  
38 | 	jacobi_iteration.c  
39 | 	power_eig.c  
40 | 
41 | *Chapter 4*  
42 | 	linear_eq_dgetrf.c  
43 | 	linear_eq_dsgesv.c  
44 | 	linear_eq_dsposv.c  
45 | 	lapack_dgecon.c  
46 | 	lapack_lamch.c  
47 | 	invpower_eig.c  
48 | 	lapack_dgeev.c  
49 | 	lapack_dsyev.c  
50 | 	lapack_ssyev.c  
51 | 
52 | *Chapter 5*  
53 | 	my_matvec_mul_pt.c  
54 | 	my_matvec_mul_omp.c  
55 | 	my_linear_eq_omp.c  
56 | 
57 | *Chapter 6*  
58 | 	jacobi_iteration_mkl.c  
59 | 	jacobi_iteration_csr_mkl.c  
60 | 	bicgstab_mkl.c  
61 | 	bicgstab_csr_mkl.c  
62 | 	mm/matrix_market_io.h  
63 | 	mm/matrix_market_io.c  
64 | 
65 | *Chapter 7* (Caution: These programs cannot be compiled on Windows!)  
66 | 	mycuda_daxpy.cu  
67 | 	matvec_mul_cublas.c  
68 | 	matvec_mul_magma.c  
69 | 	matvec_mul_magma_pure.c  
70 | 	linear_eq_magma.c  
71 | 	lapack_dgeev_magma.c  
72 | 	bicgstab_csr_cusparse.c  
73 | 
74 | *Chapter 8*  
75 | 	integral_eq/Makefile.unix  
76 | 	integral_eq/Makefile.win_intel  
77 | 	integral_eq/gauss_integral.h  
78 | 	integral_eq/gauss_integral.c  
79 | 	integral_eq/iteration.c  
80 | 
81 | 


--------------------------------------------------------------------------------
/bicgstab_csr_cusparse.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_csr_cusparse.c


--------------------------------------------------------------------------------
/bicgstab_csr_magma.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_csr_magma.c


--------------------------------------------------------------------------------
/bicgstab_csr_mkl.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_csr_mkl.c


--------------------------------------------------------------------------------
/bicgstab_mkl.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/bicgstab_mkl.c


--------------------------------------------------------------------------------
/blas1.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample Program of BLAS Level1                 */
 4 | /*                        with Intel Math Kernel */
 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya         */
 6 | /*************************************************/
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <math.h>
10 | #include "cblas.h"
11 | 
12 | int main()
13 | {
14 | 	int i, dim;
15 | 	int inc_vb, inc_vc, inc_va;
16 | 	double *va, *vb, *vc;
17 | 	double alpha;
18 | 
19 | 	// input dimension
20 | 	printf("Dim = "); scanf("%d", &dim);
21 | 
22 | 	if(dim <= 0)
23 | 	{
24 | 		printf("Illegal dimension! (dim = %d)\n", dim);
25 | 		return EXIT_FAILURE;
26 | 	}
27 | 
28 | 	// Initialize
29 | 	va = (double *)calloc(dim, sizeof(double));
30 | 	vb = (double *)calloc(dim, sizeof(double));
31 | 	vc = (double *)calloc(dim, sizeof(double));
32 | 
33 | 	// input va and vb
34 | 	for(i = 0; i < dim; i++)
35 | 	{
36 | 		va[i] = i + 1;
37 | 		vb[i] = dim - (i + 1);
38 | 	}
39 | 
40 | 	//vc := vb
41 | 	inc_vb = inc_vc = inc_va = 1;
42 | 	cblas_dcopy(dim, vb, inc_vb, vc, inc_vc);
43 | 
44 | 	// vc := 1.0 * va + vb
45 | 	alpha = 1.0;
46 | 	cblas_daxpy(dim, alpha, va, inc_va, vc, inc_vc);
47 | 
48 | 	// print
49 | 	for(i = 0; i < dim; i++)
50 | 		printf("%10.3f + %10.3f = %10.3f\n", *(va + i), *(vb + i), *(vc + i));
51 | 
52 | 	// free
53 | 	free(va);
54 | 	free(vb);
55 | 	free(vc);
56 | 
57 | 	return EXIT_SUCCESS;
58 | }
59 | 


--------------------------------------------------------------------------------
/blas1_norm.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample Program of BLAS Level1                 */
 4 | /*                        with Intel Math Kernel */
 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya         */
 6 | /*************************************************/
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <math.h>
10 | #include "cblas.h"
11 | 
12 | int main()
13 | {
14 | 	int i, dim;
15 | 	int inc_vb, inc_vc, inc_va;
16 | 	double *va, *vb, *vc;
17 | 	double alpha, norm1, norm2, normi;
18 | 
19 | 	// input dimension
20 | 	printf("Dim = "); scanf("%d", &dim);
21 | 
22 | 	if(dim <= 0)
23 | 	{
24 | 		printf("Illegal dimension! (dim = %d)\n", dim);
25 | 		return EXIT_FAILURE;
26 | 	}
27 | 
28 | 	// Initialize
29 | 	va = (double *)calloc(dim, sizeof(double));
30 | 	vb = (double *)calloc(dim, sizeof(double));
31 | 	vc = (double *)calloc(dim, sizeof(double));
32 | 
33 | 	// input va and vb
34 | 	for(i = 0; i < dim; i++)
35 | 	{
36 | 		va[i] = i + 1;
37 | 		vb[i] = dim - (i + 1);
38 | 	}
39 | 
40 | 	//vc := vb
41 | 	inc_vb = inc_vc = inc_va = 1;
42 | 	cblas_dcopy(dim, vb, inc_vb, vc, inc_vc);
43 | 
44 | 	// vc := 1.0 * va + vb
45 | 	alpha = 1.0;
46 | 	cblas_daxpy(dim, alpha, va, inc_va, vc, inc_vc);
47 | 
48 | 	// print
49 | 	for(i = 0; i < dim; i++)
50 | 		printf("%10.3f + %10.3f = %10.3f\n", *(va + i), *(vb + i), *(vc + i));
51 | 
52 | 	// norm_1, 2, i
53 | 	norm1 = cblas_dasum(dim, va, 1);
54 | 	norm2 = cblas_dnrm2(dim, va, 1);
55 | 	normi = fabs(vc[cblas_idamax(dim, va, 1)]);
56 | 
57 | 	printf("norm1, norm2, normi = %10.3e, %10.3e, %10.3e\n", norm1, norm2, normi);
58 | 
59 | 	// free
60 | 	free(va);
61 | 	free(vb);
62 | 	free(vc);
63 | 
64 | 	return EXIT_SUCCESS;
65 | }
66 | 


--------------------------------------------------------------------------------
/blas2.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample Program of BLAS Level2                 */
 4 | /*                        with Intel Math Kernel */
 5 | /* Last Update: 2013-08-05 (Mon) T.Kouya         */
 6 | /*************************************************/
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <math.h>
10 | #include "cblas.h"
11 | 
12 | int main()
13 | {
14 | 	int i, j, dim;
15 | 	int inc_vb, inc_vc;
16 | 	double *ma, *vb, *vc;
17 | 	double alpha, beta;
18 | 
19 | 	// input dimension
20 | 	printf("Dim = "); scanf("%d", &dim);
21 | 
22 | 	if(dim <= 0)
23 | 	{
24 | 		printf("Illegal dimension! (dim = %d)\n", dim);
25 | 		return EXIT_FAILURE;
26 | 	}
27 | 
28 | 	// Initialize
29 | 	ma = (double *)calloc(dim * dim, sizeof(double));
30 | 	vb = (double *)calloc(dim, sizeof(double));
31 | 	vc = (double *)calloc(dim, sizeof(double));
32 | 
33 | 	// input ma and vb
34 | 	for(i = 0; i < dim; i++)
35 | 	{
36 | 		for(j = 0; j < dim; j++)
37 | 			ma[i * dim + j] = sqrt(2.0) * (double)(dim - (i + j + 1));
38 | 		vb[i] = sqrt(2.0) * (double)(i + 1);
39 | 	}
40 | 
41 | 	//vc := vb
42 | 	inc_vb = inc_vc = 1;
43 | 
44 | 	// vc := 1.0 * ma * vb
45 | 	alpha = 1.0;
46 | 	beta = 0.0;
47 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, alpha, ma, dim, vb, inc_vb, beta, vc, inc_vc);
48 | 
49 | 	// print
50 | 	for(i = 0; i < dim; i++)
51 | 	{
52 | 		printf("[");
53 | 		for(j = 0; j < dim; j++)
54 | 			printf("%10.3f ", ma[i * dim + j]);
55 | 		printf("]  %10.3f = %10.3f\n", vb[i], vc[i]);
56 | 	}
57 | 
58 | 	// free
59 | 	free(ma);
60 | 	free(vb);
61 | 	free(vc);
62 | 
63 | 	return EXIT_SUCCESS;
64 | }
65 | 


--------------------------------------------------------------------------------
/blas3.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample Program of BLAS Level3                 */
 4 | /*                        with Intel Math Kernel */
 5 | /* Last Update: 2011-06-10 (Fri) T.Kouya         */
 6 | /*************************************************/
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <math.h>
10 | 
11 | #include "cblas.h"
12 | 
13 | int main()
14 | {
15 | 	int i, j, dim;
16 | 	double *ma, *mb, *mc;
17 | 	double alpha, beta;
18 | 
19 | 	// input dimension
20 | 	printf("Dim = "); scanf("%d", &dim);
21 | 
22 | 	if(dim <= 0)
23 | 	{
24 | 		printf("Illegal dimension! (dim = %d)\n", dim);
25 | 		return EXIT_FAILURE;
26 | 	}
27 | 
28 | 	// Initialize
29 | 	ma = (double *)calloc(dim * dim, sizeof(double));
30 | 	mb = (double *)calloc(dim * dim, sizeof(double));
31 | 	mc = (double *)calloc(dim * dim, sizeof(double));
32 | 
33 | 	// input va and vb
34 | 	for(i = 0; i < dim; i++)
35 | 	{
36 | 		for(j = 0; j < dim; j++)
37 | 		{
38 | 			ma[i * dim + j] = sqrt(2.0) * (i + j + 1);
39 | 			mb[i * dim + j] = sqrt(2.0) * (dim * 2 - (i + j + 1));
40 | 		}
41 | 	}
42 | 
43 | 	// print
44 | 	for(i = 0; i < dim; i++)
45 | 	{
46 | 		printf("[");
47 | 		for(j = 0; j < dim; j++)
48 | 			printf("%10.3f ", ma[i * dim + j]);
49 | 		printf("] [");
50 | 		for(j = 0; j < dim; j++)
51 | 			printf("%10.3f ", mb[i * dim + j]);
52 | 		printf("]\n");
53 | 	}
54 | 
55 | 	// mc := 1.0 * ma * mb + 0.0 * mc
56 | 	alpha = 1.0;
57 | 	beta = 0.0;
58 | 	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, mb, dim, beta, mc, dim);
59 | 
60 | 	// print
61 | 	printf(" = \n");
62 | 	for(i = 0; i < dim; i++)
63 | 	{
64 | 		printf("%3d: [", i);
65 | 		for(j = 0; j < dim; j++)
66 | 			printf("%10.3f ", mc[i * dim + j]);
67 | 		printf("]\n");
68 | 	}
69 | 
70 | 	return EXIT_SUCCESS;
71 | }
72 | 


--------------------------------------------------------------------------------
/complex_first.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample program with C99 complex type          */
 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
 5 | /*************************************************/
 6 | #include <stdio.h>
 7 | #include <math.h>
 8 | #include <complex.h> // C99 Complex data type
 9 | 
10 | int main()
11 | {
12 | 	float  complex cc = 0.0, ca = -2.0 + 2.0 * I, cb = 3.0 - 3.0 * I;
13 | 	double complex zc = 0.0, za = -2.0 + 2.0 * I, zb = 3.0 - 3.0 * I;
14 | 	double relerr;
15 | 
16 | 	// basic arithmetic: float complex
17 | 	printf("--- float data type(single precsion floating-point number) ---\n");
18 | 	cc = ca + cb;
19 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca), crealf(cb), cimagf(cb));
20 | 	cc = ca - cb;
21 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca), crealf(cb), cimagf(cb));
22 | 	cc = ca * cb;
23 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca), crealf(cb), cimagf(cb));
24 | 	cc = ca / cb;
25 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(ca), cimag(ca), creal(cb), cimagf(cb));
26 | 
27 | 	// absolute value and square root: float
28 | 	cc = cabsf(ca);
29 | 	printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", crealf(cc), cimagf(cc), crealf(ca), cimagf(ca));
30 | 	cc = csqrtf(cb);
31 | 	printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", crealf(cc), cimagf(cc), crealf(cb), cimagf(cb));
32 | 
33 | 	// basic arithmetic: double
34 | 	printf("--- double data type(double precsion floating-point number) ---\n");
35 | 	zc = za + zb;
36 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb));
37 | 	zc = za - zb;
38 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb));
39 | 	zc = za * zb;
40 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb));
41 | 	zc = za / zb;
42 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(za), cimag(za), creal(zb), cimag(zb));
43 | 
44 | 	// absolute value and square root: double
45 | 	zc = cabs(za);
46 | 	printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", creal(zc), cimag(zc), creal(za), cimag(za));
47 | 	zc = csqrt(zb);
48 | 	printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", creal(zc), cimag(zc), creal(zb), cimag(zb));
49 | 
50 | 	// relative error of float square root
51 | 	relerr = cabs(cc - zc);
52 | 	if(cabs(zc) > 0.0)
53 | 		relerr /= cabs(zc);
54 | 
55 | 	printf("Single Prec.  : %25.17e + %25.17e * I\n", creal(cc), cimag(cc));
56 | 	printf("Double Prec.  : %25.17e + %25.17e * I\n", creal(zc), cimag(zc));
57 | 	printf("Relative Error: %10.3e\n", relerr);
58 | 
59 | 	// real part
60 | 	relerr = fabs(creal(cc) - creal(zc));
61 | 	if(fabs(creal(zc)) > 0.0)
62 | 		relerr /= fabs(creal(zc));
63 | 
64 | 	printf("Relative Error(real): %10.3e\n", relerr);
65 | 
66 | 	// imaginary part
67 | 	relerr = fabs(cimag(cc) - cimag(zc));
68 | 	if(fabs(cimag(zc)) > 0.0)
69 | 		relerr /= fabs(cimag(zc));
70 | 
71 | 	printf("Relative Error(imag): %10.3e\n", relerr);
72 | 
73 | 	return 0;
74 | }
75 | 


--------------------------------------------------------------------------------
/complex_first_cpp.cc:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample program with C++ complex type          */
 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
 5 | /*************************************************/
 6 | #include <iostream>
 7 | #include <complex>
 8 | #include <cstdio>
 9 | 
10 | using namespace std;
11 | 
12 | int main()
13 | {
14 | 	complex<float> cc = 0.0, ca = complex<float>(-2.0, 2.0), cb = complex<float>(3.0, -3.0);
15 | 	complex<double> zc = 0.0, za = complex<double>(-2.0, 2.0), zb = complex<double>(3.0,  -3.0);
16 | 	double relerr;
17 | 
18 | 	// basic arithmetic: float complex
19 | 	cout << "--- float data type(single precsion floating-point number) ---" << endl;
20 | 	cc = ca + cb;
21 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag());
22 | 	cc = ca - cb;
23 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag());
24 | 	cc = ca * cb;
25 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag());
26 | 	cc = ca / cb;
27 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), ca.real(), ca.imag(), cb.real(), cb.imag());
28 | 
29 | 	// absolute value and square root: float
30 | 	cc = abs(ca);
31 | 	printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", cc.real(), cc.imag(), ca.real(), ca.imag());
32 | 	cc = sqrt(cb);
33 | 	printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", cc.real(), cc.imag(), cb.real(), cb.imag());
34 | 
35 | 	// basic arithmetic: double
36 | 	printf("--- double data type(double precsion floating-point number) ---\n");
37 | 	zc = za + zb;
38 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) + (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag());
39 | 	zc = za - zb;
40 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) - (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag());
41 | 	zc = za * zb;
42 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) * (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag());
43 | 	zc = za / zb;
44 | 	printf("%25.17e %+-25.17e * I := (%25.17e %+-25.17e * I) / (%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), za.real(), za.imag(), zb.real(), zb.imag());
45 | 
46 | 	// absolute value and square root: double
47 | 	zc = abs(za);
48 | 	printf("%25.17e %+-25.17e * I := |%25.17e %+-25.17e * I|\n", zc.real(), zc.imag(), za.real(), za.imag());
49 | 	zc = sqrt(zb);
50 | 	printf("%25.17e %+-25.17e * I:= sqrt(%25.17e %+-25.17e * I)\n", zc.real(), zc.imag(), zb.real(), zb.imag());
51 | 
52 | 	// relative error of float square root
53 | 	relerr = abs((complex<double>)cc - zc);
54 | 	if(abs(zc) > 0.0)
55 | 		relerr /= abs(zc);
56 | 
57 | 	printf("Single Prec.  : %25.17e %+-25.17e * I\n", cc.real(), cc.imag());
58 | 	printf("Double Prec.  : %25.17e %+-25.17e * I\n", zc.real(), zc.imag());
59 | 	printf("Relative Error: %10.3e\n", relerr);
60 | 
61 | 	// real part
62 | 	relerr = abs(cc.real() - zc.real());
63 | 	if(abs(zc.real()) > 0.0)
64 | 		relerr /= abs(zc.real());
65 | 
66 | 	printf("Relative Error(real): %10.3e\n", relerr);
67 | 
68 | 	// imaginary part
69 | 	relerr = abs(cc.imag() - zc.imag());
70 | 	if(abs(zc.imag()) > 0.0)
71 | 		relerr /= abs(zc.imag());
72 | 
73 | 	printf("Relative Error(imag): %10.3e\n", relerr);
74 | 
75 | 	return 0;
76 | }
77 | 


--------------------------------------------------------------------------------
/complex_matvec_mul.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Multiplication of C99 complex matrix          */
 4 | /*                                    and vector */
 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
 6 | /*************************************************/
 7 | #include <stdio.h>
 8 | #include <stdlib.h>
 9 | #include <math.h>
10 | #include <complex.h> // C99 complex data type
11 | 
12 | #include "cblas.h"
13 | 
14 | int main()
15 | {
16 | 	int i, j, dim;
17 | 	int inc_vec_x, inc_vec_b;
18 | 
19 | 	double complex *mat_a, *vec_b, *vec_x;
20 | 	double complex alpha, beta;
21 | 
22 | 	// input dimension
23 | 	printf("Dim = "); scanf("%d", &dim);
24 | 
25 | 	if(dim <= 0)
26 | 	{
27 | 		printf("Illegal dimension! (dim = %d)\n", dim);
28 | 		return EXIT_FAILURE;
29 | 	}
30 | 
31 | 	// initialize a matrix and vectors
32 | 	mat_a = (double complex *)calloc(dim * dim, sizeof(double complex));
33 | 	vec_x = (double complex *)calloc(dim, sizeof(double complex));
34 | 	vec_b = (double complex *)calloc(dim, sizeof(double complex));
35 | 
36 | 	// input mat_a and vec_x
37 | 	for(i = 0; i < dim; i++)
38 | 	{
39 | 		for(j = 0; j < dim; j++)
40 | 		{
41 | 			mat_a[i * dim + j] = (double)(i + j + 1) - (double)(i + j + 1) * I;
42 | 			if((i + j + 1) % 2 != 0)
43 | 				mat_a[i * dim + j] *= -1.0;
44 | 		}
45 | 		vec_x[i] = 1.0 / (double)(i + 1) + 1.0 / (double)(i + 1) * I;
46 | 	}
47 | 
48 | 	// size(vec_x) == size(vec_b)
49 | 	inc_vec_x = inc_vec_b = 1;
50 | 
51 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
52 | 	alpha = 1.0;
53 | 	beta = 0.0;
54 | 	cblas_zgemv(CblasRowMajor, CblasNoTrans, dim, dim, (void *)&alpha, mat_a, dim, vec_x, inc_vec_x, (void *)&beta, vec_b, inc_vec_b);
55 | 
56 | 	// print
57 | 	for(i = 0; i < dim; i++)
58 | 	{
59 | 		printf("[");
60 | 		for(j = 0; j < dim; j++)
61 | 			printf("%6.3lf %+-6.3lf * I ", creal(mat_a[i * dim + j]), cimag(mat_a[i * dim + j]));
62 | 		printf("]  %6.3lf %+-6.3lf * I = %6.3lf %+-6.3lf * I\n", creal(vec_x[i]), cimag(vec_x[i]), creal(vec_b[i]), cimag(vec_b[i]));
63 | 	}
64 | 
65 | 	// free
66 | 	free(mat_a);
67 | 	free(vec_x);
68 | 	free(vec_b);
69 | 
70 | 	return EXIT_SUCCESS;
71 | }
72 | 


--------------------------------------------------------------------------------
/complex_row_column_major.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample probram with C99 complex row-major     */
  4 | /*                     and column major matrices */
  5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | #include <complex.h> // C99 complex data type
 11 | 
 12 | #include "cblas.h"
 13 | 
 14 | int main()
 15 | {
 16 | 	int i, j, row_dim, col_dim;
 17 | 
 18 | 	double complex *mat_a;
 19 | 
 20 | 	// input dimension
 21 | 	printf("Row    Dim = "); scanf("%d", &row_dim);
 22 | 	printf("Column Dim = "); scanf("%d", &col_dim);
 23 | 
 24 | 	if((row_dim <= 0) || (col_dim <= 0))
 25 | 	{
 26 | 		printf("Illegal dimension! (row_dim = %d, col_dim = %d)\n", row_dim, col_dim);
 27 | 		return EXIT_FAILURE;
 28 | 	}
 29 | 
 30 | 	// initialize matrix area
 31 | 	mat_a = (double complex *)calloc(row_dim * col_dim, sizeof(double complex));
 32 | 
 33 | 	printf("Row Major: %d x %d\n", row_dim, col_dim);
 34 | 
 35 | 	// Row Major 
 36 | 	// mat_a = A
 37 | 	// A = [1+i 2+2i ....... n+ni]
 38 | 	//     [n+i n+2i ...... 2n+ni]
 39 | 	//     [.....................]
 40 | 	//     [(m-1)n+1+i ..... mn+ni]
 41 | 	for(i = 0; i < row_dim; i++)
 42 | 	{
 43 | 		for(j = 0; j < col_dim; j++)
 44 | 			mat_a[i * col_dim + j] = (double)(i * col_dim + j + 1) + (double)(j + 1) * I;
 45 | 	}
 46 | 
 47 | 	// print (1)
 48 | 	printf("1 dimension: \n");
 49 | 	printf("[");
 50 | 	for(i = 0; i < row_dim * col_dim; i++)
 51 | 		printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i]), cimag(mat_a[i]));
 52 | 	printf("]\n");
 53 | 
 54 | 	// print(2)
 55 | 	printf("2 dimension: \n");
 56 | 	for(i = 0; i < row_dim; i++)
 57 | 	{
 58 | 		printf("[");
 59 | 		for(j = 0; j < col_dim; j++)
 60 | 			printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i * col_dim + j]), cimag(mat_a[i * col_dim + j]));
 61 | 		printf("]\n");
 62 | 	}
 63 | 
 64 | 	printf("Column Major: %d x %d\n", row_dim, col_dim);
 65 | 
 66 | 	// Column Major
 67 | 	// mat_a = A
 68 | 	// A = [1+i 2+2i ....... n+ni]
 69 | 	//     [n+i n+2i ...... 2n+ni]
 70 | 	//     [.....................]
 71 | 	//     [(m-1)n+1+i ..... mn+ni]
 72 | 	for(j = 0; j < col_dim; j++)
 73 | 	{
 74 | 		for(i = 0; i < row_dim; i++)
 75 | 			mat_a[i + row_dim * j] = (double)(i * col_dim + j + 1) + (double)(j + 1) * I;
 76 | 	}
 77 | 
 78 | 	// print (1)
 79 | 	printf("1 dimension: \n");
 80 | 	printf("[");
 81 | 	for(i = 0; i < row_dim * col_dim; i++)
 82 | 		printf(" %6.3lf  %+-6.3lf * i ", creal(mat_a[i]), cimag(mat_a[i]));
 83 | 	printf("]\n");
 84 | 
 85 | 	// print (2)
 86 | 	printf("2 dimension: \n");
 87 | 	for(i = 0; i < row_dim; i++)
 88 | 	{
 89 | 		printf("[");
 90 | 		for(j = 0; j < col_dim; j++)
 91 | 			printf(" %6.3lf %+-6.3lf * i ", creal(mat_a[i + row_dim * j]), cimag(mat_a[i + row_dim * j]));
 92 | 		printf("]\n");
 93 | 	}
 94 | 
 95 | 	// free
 96 | 	free(mat_a);
 97 | 
 98 | 	return EXIT_SUCCESS;
 99 | }
100 | 


--------------------------------------------------------------------------------
/cublas_dgemv.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample Program of BLAS Level3                 */
  4 | /*                        with Intel Math Kernel */
  5 | /* Last Update: 2011-06-10 (Fri) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include "get_secv.h"
 10 | #ifdef USE_IMKL
 11 | 	#include "mkl.h"
 12 | 	#include "mkl_cblas.h" // for Intel Math Kernel Library
 13 | #else
 14 | 	#include "cblas.h"
 15 | #endif
 16 | 
 17 | #ifdef USE_CUDA
 18 | 	#include "cuda.h"
 19 | 	#include "cublas_v2.h"
 20 | #ifdef USE_MAGMA
 21 | 	#include "magma.h"
 22 | #endif
 23 | #endif
 24 | 
 25 | // normal 3 loops version
 26 | void handmade_dmatmul(double *c, double *a, double *b, int dim)
 27 | {
 28 | 	int i, j, k;
 29 | 
 30 | 	for(i = 0; i < dim; i++)
 31 | 	{
 32 | 		for(j = 0; j < dim; j++)
 33 | 		{
 34 | 			*(c + i * dim + j) = 0.0;
 35 | 			for(k = 0; k < dim; k++)
 36 | 				*(c + i * dim + j) += *(a + i * dim + k) * *(b + k * dim + j);
 37 | 		}
 38 | 	}
 39 | }
 40 | 
 41 | // flag = 0 ... use DGEMM in BLAS
 42 | // flag = 1 ... use handmade matmul 
 43 | double get_gflops_dgemm(int flag, int dim)
 44 | {
 45 | #ifdef USE_ATLAS
 46 | 	int i, j;
 47 | #else
 48 | 	MKL_INT i, j; // dimension of vectors
 49 | #endif
 50 | 	double *ma, *mb, *mc;
 51 | 	double alpha, beta;
 52 | 	double running_time;
 53 | 	double gflops;
 54 | 	int itimes, max_itimes = 10000;
 55 | 	double tmp;
 56 | 
 57 | 	// Initialize
 58 | 	ma = (double *)calloc(dim * dim, sizeof(double));
 59 | 	mb = (double *)calloc(dim * dim, sizeof(double));
 60 | 	mc = (double *)calloc(dim * dim, sizeof(double));
 61 | 
 62 | 	// column major
 63 | 	for(i = 0; i < dim; i++)
 64 | 	{
 65 | 		for(j = 0; j < dim; j++)
 66 | 		{
 67 | 			*(ma + i + j * dim) = sqrt(2.0) * (i + j + 1);
 68 | 			*(mb + i + j * dim) = sqrt(2.0) * (dim * 2 - (i + j + 1));
 69 | 		}
 70 | 	}
 71 | 
 72 | 	if(flag == 0)
 73 | 	{
 74 | 		// mc := 1.0 * ma * mb + 0.0 * mc
 75 | 		alpha = 1.0;
 76 | 		beta = 0.0;
 77 | 		tmp = 0.0;
 78 | 		for(itimes = 0; itimes < max_itimes; itimes++)
 79 | 		{
 80 | 			running_time = get_real_secv();
 81 | 			cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, mb, dim, beta, mc, dim);
 82 | 			tmp += get_real_secv() - running_time; // end
 83 | 			if(tmp > 0.0)
 84 | 			{
 85 | 				running_time = tmp / (itimes + 1);
 86 | 				break;
 87 | 			}	
 88 | 		}
 89 | 	}
 90 | 	else if(flag == 1)
 91 | 	{
 92 | 		tmp = 0.0;
 93 | 		for(itimes = 0; itimes < max_itimes; itimes++)
 94 | 		{
 95 | 			running_time = get_real_secv();
 96 | 			handmade_dmatmul(mc, ma, mb, dim);
 97 | 			tmp += get_real_secv() - running_time; // end
 98 | 			if(tmp > 0.0)
 99 | 			{
100 | 				running_time = tmp / (itimes + 1);
101 | 				break;
102 | 			}
103 | 		}
104 | 	
105 | 	}
106 | 
107 | 	// print
108 | /*	for(i = 0; i < dim; i++)
109 | 	{
110 | 		printf("%3d: ", i);
111 | 		for(j = 0; j < dim; j++)
112 | 			printf("%10f ", *(mc + i * dim + j));
113 | 		printf("\n");
114 | 	}
115 | */
116 | //	printf("dim = %d, Running Time(sec) = %f\n", dim, running_time);
117 | 	gflops = 2.0 * (double)dim * (double)dim * (double)dim / running_time / 1024.0 / 1024.0 / 1024.0;
118 | //	printf("%f Gflops\n", gflops);
119 | 
120 | 	free(ma);
121 | 	free(mb);
122 | 	free(mc);
123 | 
124 | 	return gflops;
125 | }
126 | 
127 | // GPGPU
128 | #ifdef USE_CUDA
129 | 
130 | // GPU上に行列格納領域を確保
131 | void *mycuda_calloc(int num_elements, size_t size_element)
132 | {
133 | 	cudaError_t cuda_error;
134 | 	void *ret = NULL;
135 | 
136 | 	cuda_error = cudaMalloc((void **)&ret, num_elements * size_element);
137 | 
138 | 	if(cuda_error != cudaSuccess)
139 | 	{
140 | 		printf("device memory allocation failed!(num_elements = %d, size = %d)\n", num_elements, size_element);
141 | 		return NULL;
142 | 	}
143 | 
144 | 	return ret;
145 | }
146 | 
147 | // GPU上のメモリ領域を解放
148 | void mycuda_free(void *mem)
149 | {
150 | 	cudaFree(mem);
151 | }
152 | 
153 | // flag = 0 ... use DGEMM in cuBLAS
154 | // flag = 1 ... use magma_dgemm in MAGMA
155 | double get_gflops_cuda_dgemm(int flag, int dim, double *total_running_time)
156 | {
157 | 	int i, j; // dimension of vectors
158 | 	double *ma, *mb, *mc, *mc_host; // on CPU
159 | 	double *dev_ma, *dev_mb, *dev_mc; // on GPU
160 | 	double alpha, beta;
161 | 	double running_time, tmp_time, tmp_total_time;
162 | 	double gflops;
163 | 	int itimes, max_itimes = 10000;
164 | 	cublasStatus_t status;
165 | 	cublasHandle_t handle;
166 | 
167 | 	// Initialize on CPU
168 | 	ma = (double *)calloc(dim * dim, sizeof(double));
169 | 	mb = (double *)calloc(dim * dim, sizeof(double));
170 | 	mc = (double *)calloc(dim * dim, sizeof(double));
171 | 	mc_host = (double *)calloc(dim * dim, sizeof(double));
172 | 
173 | 	// Initialize on GPU
174 | 	dev_ma = (double *)mycuda_calloc(dim * dim, sizeof(double));
175 | 	dev_mb = (double *)mycuda_calloc(dim * dim, sizeof(double));
176 | 	dev_mc = (double *)mycuda_calloc(dim * dim, sizeof(double));
177 | 
178 | 	// input ma and mb
179 | 	for(j = 0; j < dim; j++)
180 | 	{
181 | 		for(i = 0; i < dim; i++)
182 | 		{
183 | 			// column major
184 | 			ma[i + j * dim] = sqrt(2.0) * (i + j + 1);
185 | 			mb[i + j * dim] = sqrt(2.0) * (dim * 2 - (i + j + 1));
186 | 			mc[i + j * dim] = 0.0;
187 | 			mc_host[i + j * dim] = 0.0;
188 | 		}
189 | 	}
190 | 
191 | 	// set matrix
192 | 	status = cublasCreate(&handle);
193 | 	if(status != CUBLAS_STATUS_SUCCESS)
194 | 	{
195 | 		printf("cuBLASの初期化に失敗しました。\n");
196 | 
197 | 		mycuda_free(dev_ma);
198 | 		mycuda_free(dev_mb);
199 | 		mycuda_free(dev_mc);
200 | 		cublasDestroy(handle);
201 | 
202 | 		return 0;
203 | 	}
204 | 
205 | /*	status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim);
206 | 	if(status != CUBLAS_STATUS_SUCCESS)
207 | 		printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n");
208 | 	status = cublasSetMatrix(dim, dim, sizeof(double), mb, dim, dev_mb, dim);
209 | 	if(status != CUBLAS_STATUS_SUCCESS)
210 | 		printf("mb -> dev_mb: cublasSetMatrix失敗しました。\n");
211 | */
212 | 
213 | 	// cublasDgemm
214 | 	if(flag == 0)
215 | 	{
216 | 		// mc := 1.0 * ma * mb + 0.0 * mc
217 | 		alpha = 1.0; // on CPU
218 | 		beta = 0.0; // on CPU
219 | 		tmp_time = 0.0;
220 | 		tmp_total_time = 0.0;
221 | 		for(itimes = 0; itimes < max_itimes; itimes++)
222 | 		{
223 | 			tmp_total_time = get_real_secv();
224 | 
225 | 			status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim);
226 | 			if(status != CUBLAS_STATUS_SUCCESS)
227 | 				printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n");
228 | 			status = cublasSetMatrix(dim, dim, sizeof(double), mb, dim, dev_mb, dim);
229 | 			if(status != CUBLAS_STATUS_SUCCESS)
230 | 				printf("mb -> dev_mb: cublasSetMatrix失敗しました。\n");
231 | 
232 | 			tmp_time = get_real_secv();
233 | 			status = cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, dim, dim, dim, &alpha, dev_ma, dim, dev_mb, dim, &beta, dev_mc, dim);
234 | 			//magma_dgemm(MagmaNoTrans, MagmaNoTrans, dim, dim, dim, alpha, dev_ma, dim, dev_mb, dim, beta, dev_mc, dim);
235 | 			cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須！
236 | 			running_time += get_real_secv() - tmp_time; // end
237 | 			if(status != CUBLAS_STATUS_SUCCESS)
238 | 				printf("cublasDgemm失敗しました。\n");
239 | 
240 | 			//printf("%f \n", get_real_secv() - running_time);
241 | 
242 | 			status = cublasGetMatrix(dim, dim, sizeof(double), dev_mc, dim, mc, dim);
243 | 			if(status != CUBLAS_STATUS_SUCCESS)
244 | 				printf("dev_mc -> mc: cublasGetMatrix失敗しました。\n");
245 | 
246 | 			*total_running_time += get_real_secv() - tmp_total_time;
247 | 
248 | 			if(*total_running_time > 1.0)
249 | 				break;
250 | 		}
251 | 	}
252 | #ifdef USE_MAGMA
253 | 	// magmablas_dgemm
254 | 	else if(flag == 1)
255 | 	{
256 | 		// mc := 1.0 * ma * mb + 0.0 * mc
257 | 		alpha = 1.0; // on CPU
258 | 		beta = 0.0; // on CPU
259 | 		tmp_time = 0.0;
260 | 		tmp_total_time = 0.0;
261 | 		for(itimes = 0; itimes < max_itimes; itimes++)
262 | 		{
263 | 			tmp_total_time = get_real_secv();
264 | 
265 | 			status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim);
266 | 			if(status != CUBLAS_STATUS_SUCCESS)
267 | 				printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n");
268 | 			status = cublasSetMatrix(dim, dim, sizeof(double), mb, dim, dev_mb, dim);
269 | 			if(status != CUBLAS_STATUS_SUCCESS)
270 | 				printf("mb -> dev_mb: cublasSetMatrix失敗しました。\n");
271 | 
272 | 			tmp_time = get_real_secv();
273 | 			magma_dgemm(MagmaNoTrans, MagmaNoTrans, dim, dim, dim, alpha, dev_ma, dim, dev_mb, dim, beta, dev_mc, dim);
274 | 			cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須！
275 | 			running_time += get_real_secv() - tmp_time; // end
276 | 			if(status != CUBLAS_STATUS_SUCCESS)
277 | 				printf("cublasDgemm失敗しました。\n");
278 | 
279 | 			//printf("%f \n", get_real_secv() - running_time);
280 | 
281 | 			status = cublasGetMatrix(dim, dim, sizeof(double), dev_mc, dim, mc, dim);
282 | 			if(status != CUBLAS_STATUS_SUCCESS)
283 | 				printf("dev_mc -> mc: cublasGetMatrix失敗しました。\n");
284 | 
285 | 			*total_running_time += get_real_secv() - tmp_total_time;
286 | 
287 | 			if(*total_running_time > 1.0)
288 | 				break;
289 | 		}
290 | 	}
291 | #endif // USE_MAGMA
292 | 
293 | 	running_time /= (itimes + 1);
294 | 	*total_running_time /= (itimes + 1);
295 | 	//printf("itimes = %d, total_time, running_time = %f, %f \n", itimes, *total_running_time, running_time);
296 | 
297 | 	// on CPU
298 | 	cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, mb, dim, beta, mc_host, dim);
299 | 	// ||mc_host - mc||_F
300 | 	cblas_daxpy(dim * dim, -1.0, mc, 1, mc_host, 1);
301 | 	//printf("||mc - mc_host||_F = %25.17e\n", cblas_dnrm2(dim * dim, mc_host, 1) / cblas_dnrm2(dim * dim, mc, 1));
302 | 
303 | 	// print
304 | /*	for(i = 0; i < dim; i++)
305 | 	{
306 | 		printf("%3d: ", i);
307 | 		for(j = 0; j < dim; j++)
308 | 			printf("%10f ", *(mc + i * dim + j));
309 | 		printf("\n");
310 | 	}
311 | */
312 | //	printf("dim = %d, Running Time(sec) = %lf (%lf ms)\n", dim, running_time, running_time * 1000);
313 | 	gflops = 2.0 * (double)dim * (double)dim * (double)dim / running_time / 1024.0 / 1024.0 / 1024.0;
314 | 	//printf("%f Gflops\n", gflops);
315 | 
316 | 	// free on CPU
317 | 	free(ma);
318 | 	free(mb);
319 | 	free(mc);
320 | 
321 | 	// free on GPU
322 | 	mycuda_free(dev_ma);
323 | 	mycuda_free(dev_mb);
324 | 	mycuda_free(dev_mc);
325 | 
326 | 	cublasDestroy(handle);
327 | 
328 | 	return gflops;
329 | }
330 | #endif
331 | 
332 | int main()
333 | {
334 | 	int start_dim, end_dim, step_dim, dim;
335 | 	char str_mkl_version[1024];
336 | 	int max_num_threads;
337 | 	double total_time[2] = {0.0, 0.0};
338 | 
339 | 	// print MKL version
340 | #ifndef USE_IMKL
341 | 	MKL_Get_Version_String(str_mkl_version, 1024);
342 | 	printf("%s\n", str_mkl_version);
343 | 
344 | 	max_num_threads = mkl_get_max_threads();
345 | 	printf("Max Number of Threads: %d\n", max_num_threads);
346 | 	mkl_set_num_threads(max_num_threads);
347 | #endif
348 | 
349 | 	// input dimension
350 | 	printf("Start Dim = "); scanf("%d", &start_dim);
351 | 	printf("End   Dim = "); scanf("%d", &end_dim);
352 | 	printf("Step  Dim = "); scanf("%d", &step_dim);
353 | 
354 | 	printf("  DIM  DGEMM(GFlops)");
355 | #ifdef USE_CUDA
356 | 	printf(" cuBLAS(GFlops)");
357 | #endif
358 | #ifdef USE_MAGMA
359 | 	printf(" magmablas(GFlops)");
360 | 	printf(" Total time(sec)");
361 | #endif
362 | 	printf("\n");
363 | 
364 | 	for(dim = start_dim; dim <= end_dim; dim += step_dim)
365 | 	{
366 | 		printf("%5d %10.3f", dim, get_gflops_dgemm(0, dim));
367 | #ifdef USE_CUDA
368 | 		printf(" %10.3f", get_gflops_cuda_dgemm(0, dim, &total_time[0]));
369 | #endif
370 | #ifdef USE_MAGMA
371 | 		printf(" %10.3f", get_gflops_cuda_dgemm(1, dim, &total_time[1]));
372 | #endif
373 | 		if(total_time[0] > 0.0)
374 | 			printf(" %10.3f", total_time[0]);
375 | 		if(total_time[1] > 0.0)
376 | 			printf(" %10.3f", total_time[1]);
377 | 		printf("\n");
378 | 	}
379 | 
380 | 	return EXIT_SUCCESS;
381 | }
382 | 


--------------------------------------------------------------------------------
/dgemv_bench_mt.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample Program of BLAS Level 2                */
  4 | /*                       with Intel Math Kernel, */
  5 | /*                              cuBLAS and MAGMA */
  6 | /* Last Update: 2015-03-27 (Fri) T.Kouya         */
  7 | /*************************************************/
  8 | #include <stdio.h>
  9 | #include <stdlib.h>
 10 | #include "get_secv.h"
 11 | #ifdef USE_IMKL
 12 | 	#include "mkl.h"
 13 | 	#include "mkl_cblas.h" // for Intel Math Kernel Library
 14 | #else
 15 | 	#include "cblas.h"
 16 | #endif
 17 | 
 18 | #ifdef USE_CUDA
 19 | 	// mycuda_calloc, mycuda_free関数
 20 | 	#include "mycuda.h"
 21 | 	#include "cublas_v2.h"
 22 | #ifdef USE_MAGMA
 23 | 	#include "magma.h"
 24 | #endif
 25 | #endif
 26 | 
 27 | // column major only
 28 | // my_matvec_mul: vec_b := mat_a * vec_x
 29 | void my_matvec_mul_col(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x)
 30 | {
 31 | 	int i, j;
 32 | 
 33 | 	// メインループ
 34 | 	for(i = 0; i < row_dim; i++)
 35 | 	{
 36 | 		vec_b[i] = 0.0;
 37 | 		for(j = 0; j < col_dim; j++)
 38 | 			vec_b[i] += mat_a[i + j * row_dim] * vec_x[j];
 39 | 	}
 40 | }
 41 | 
 42 | // flag = 0 ... use DGEMV in BLAS
 43 | // flag = 1 ... use handmade mavec_mul
 44 | double get_gflops_dgemv(int flag, int dim, double *total_running_time, int *iterative_times)
 45 | {
 46 | #ifdef USE_ATLAS
 47 | 	int i, j;
 48 | #else
 49 | 	MKL_INT i, j; // dimension of vectors
 50 | #endif
 51 | 	double *ma, *vb, *vc;
 52 | 	double alpha, beta;
 53 | 	double running_time;
 54 | 	double gflops;
 55 | 	int inc_vb, inc_vc;
 56 | 	int itimes, max_itimes = 200000;
 57 | 	double tmp_time;
 58 | 
 59 | 	// Initialize
 60 | 	ma = (double *)calloc(dim * dim, sizeof(double));
 61 | 	vb = (double *)calloc(dim, sizeof(double));
 62 | 	vc = (double *)calloc(dim, sizeof(double));
 63 | 
 64 | 	// row major
 65 | 	for(i = 0; i < dim; i++)
 66 | 	{
 67 | 		for(j = 0; j < dim; j++)
 68 | 			ma[i + j * dim] = sqrtf(2.0) * (i + j + 1);
 69 | 
 70 | 		vb[i] = sqrtf(2.0) * (dim * 2 - i);
 71 | 	}
 72 | 
 73 | 	if(flag == 0)
 74 | 	{
 75 | 		// vc := 1.0 * ma * vb + 0.0 * vc
 76 | 		alpha = 1.0;
 77 | 		beta = 0.0;
 78 | 		tmp_time = 0.0;
 79 | 		inc_vb = 1;
 80 | 		inc_vc = 1;
 81 | 		for(itimes = 0; itimes < max_itimes; itimes++)
 82 | 		{
 83 | 			running_time = get_real_secv();
 84 | 			cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, ma, dim, vb, inc_vb, beta, vc, inc_vc);
 85 | 			tmp_time += get_real_secv() - running_time; // end
 86 | 			if((tmp_time > 1.0) && (itimes > 5))
 87 | 			{
 88 | 				running_time = tmp_time / (itimes + 1);
 89 | 				break;
 90 | 			}	
 91 | 		}
 92 | 		*total_running_time = tmp_time;
 93 | 		*iterative_times = itimes;
 94 | 	}
 95 | 	else if(flag == 1)
 96 | 	{
 97 | 		tmp_time = 0.0;
 98 | 		for(itimes = 0; itimes < max_itimes; itimes++)
 99 | 		{
100 | 			running_time = get_real_secv();
101 | 			my_matvec_mul_col(vc, ma, dim, dim, vb);
102 | 			tmp_time += get_real_secv() - running_time; // end
103 | 			if((tmp_time > 1.0) && (itimes > 5))
104 | 			{
105 | 				running_time = tmp_time / (itimes + 1);
106 | 				break;
107 | 			}
108 | 		}
109 | 		*total_running_time = tmp_time;
110 | 		*iterative_times = itimes;
111 | 	}
112 | 
113 | 	// print
114 | /*	for(i = 0; i < dim; i++)
115 | 	{
116 | 		printf("%3d: ", i);
117 | 		printf("%10f ", vc[i]);
118 | 		printf("\n");
119 | 	}
120 | */
121 | //	printf("dim = %d, Running Time(sec) = %f\n", dim, running_time);
122 | 	gflops = (2.0 * (double)dim * (double)dim - (double)dim) / running_time / 1024.0 / 1024.0 / 1024.0;
123 | //	printf("%f Gflops\n", gflops);
124 | 
125 | 	free(ma);
126 | 	free(vb);
127 | 	free(vc);
128 | 
129 | 	return gflops;
130 | }
131 | 
132 | // GPGPU
133 | #ifdef USE_CUDA
134 | 
135 | // flag = 0 ... use DGEMV in cuBLAS
136 | // flag = 1 ... use magma_dgemv in MAGMA
137 | double get_gflops_cuda_dgemv(int flag, int dim, double *total_running_time, int *iterative_times)
138 | {
139 | 	int i, j; // dimension of vectors
140 | 	double *ma, *vb, *vc, *vc_host; // on CPU
141 | 	double *dev_ma, *dev_vb, *dev_vc; // on GPU
142 | 	double alpha, beta;
143 | 	double running_time, tmp_time, tmp_total_time;
144 | 	double gflops;
145 | 	int inc_vb, inc_vc;
146 | 	int itimes, max_itimes = 10000;
147 | 	cublasStatus_t status;
148 | 	cublasHandle_t handle;
149 | 
150 | 	// Initialize on CPU
151 | 	ma = (double *)calloc(dim * dim, sizeof(double));
152 | 	vb = (double *)calloc(dim, sizeof(double));
153 | 	vc = (double *)calloc(dim, sizeof(double));
154 | 	vc_host = (double *)calloc(dim, sizeof(double));
155 | 
156 | 	// Initialize on GPU
157 | 	dev_ma = (double *)mycuda_calloc(dim * dim, sizeof(double));
158 | 	dev_vb = (double *)mycuda_calloc(dim, sizeof(double));
159 | 	dev_vc = (double *)mycuda_calloc(dim, sizeof(double));
160 | 
161 | 	// input ma and mb
162 | 	for(j = 0; j < dim; j++)
163 | 	{
164 | 		for(i = 0; i < dim; i++)
165 | 		{
166 | 			// column major
167 | 			ma[i + j * dim] = sqrt(2.0) * (i + j + 1);
168 | 		}
169 | 		vb[i] = sqrt(2.0) * (dim * 2 - i);
170 | 		vc[i] = 0.0;
171 | 		vc_host[i] = 0.0;
172 | 	}
173 | 
174 | 	// set matrix
175 | 	status = cublasCreate(&handle);
176 | 	if(status != CUBLAS_STATUS_SUCCESS)
177 | 	{
178 | 		printf("cuBLASの初期化に失敗しました。\n");
179 | 
180 | 		mycuda_free(dev_ma);
181 | 		mycuda_free(dev_vb);
182 | 		mycuda_free(dev_vc);
183 | 		cublasDestroy(handle);
184 | 
185 | 		return 0;
186 | 	}
187 | 
188 | 	inc_vb = 1;
189 | 	inc_vc = 1;
190 | 
191 | 	// cublasDgemm
192 | 	if(flag == 0)
193 | 	{
194 | 		// vc := 1.0 * ma * vb + 0.0 * vc
195 | 		alpha = 1.0; // on CPU
196 | 		beta = 0.0; // on CPU
197 | 		tmp_time = 0.0;
198 | 		tmp_total_time = 0.0;
199 | 		for(itimes = 0; itimes < max_itimes; itimes++)
200 | 		{
201 | 			tmp_total_time = get_real_secv();
202 | 
203 | 			status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim);
204 | 			if(status != CUBLAS_STATUS_SUCCESS)
205 | 				printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n");
206 | 			status = cublasSetVector(dim, sizeof(double), (void *)vb, inc_vb, (void *)dev_vb, inc_vb);
207 | 			if(status != CUBLAS_STATUS_SUCCESS)
208 | 				printf("vb -> dev_vb: cublasSetVector失敗しました。\n");
209 | 
210 | 			tmp_time = get_real_secv();
211 | 			status = cublasDgemv(handle, CUBLAS_OP_N, dim, dim, &alpha, dev_ma, dim, dev_vb, inc_vb, &beta, dev_vc, inc_vc);
212 | 			//magma_dgemv(MagmaNoTrans, dim, dim, alpha, dev_ma, dim, dev_vb, inv_vb, beta, dev_vc, inc_vc);
213 | 			cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須！
214 | 			running_time += get_real_secv() - tmp_time; // end
215 | 			if(status != CUBLAS_STATUS_SUCCESS)
216 | 				printf("cublasDgemv失敗しました。\n");
217 | 
218 | 			//printf("%f \n", get_real_secv() - running_time);
219 | 
220 | 			status = cublasGetVector(dim, sizeof(double), dev_vc, inc_vc, vc, inc_vc);
221 | 			if(status != CUBLAS_STATUS_SUCCESS)
222 | 				printf("dev_vc -> vc: cublasGetMatrix失敗しました。\n");
223 | 
224 | 			*total_running_time += get_real_secv() - tmp_total_time;
225 | 
226 | 			if((*total_running_time > 1.0) && (itimes > 5))
227 | 				break;
228 | 		}
229 | 
230 | 		*iterative_times = itimes;
231 | 	}
232 | #ifdef USE_MAGMA
233 | 	// magmablas_dgemm
234 | 	else if(flag == 1)
235 | 	{
236 | 		// initialize
237 | 		magma_init();
238 | 		
239 | 		// mc := 1.0 * ma * mb + 0.0 * mc
240 | 		alpha = 1.0; // on CPU
241 | 		beta = 0.0; // on CPU
242 | 		tmp_time = 0.0;
243 | 		tmp_total_time = 0.0;
244 | 		for(itimes = 0; itimes < max_itimes; itimes++)
245 | 		{
246 | 			tmp_total_time = get_real_secv();
247 | 
248 | 			status = cublasSetMatrix(dim, dim, sizeof(double), ma, dim, dev_ma, dim);
249 | 			if(status != CUBLAS_STATUS_SUCCESS)
250 | 				printf("ma -> dev_ma: cublasSetMatrix失敗しました。\n");
251 | 			status = cublasSetVector(dim, sizeof(double), vb, inc_vb, dev_vb, inc_vb);
252 | 			if(status != CUBLAS_STATUS_SUCCESS)
253 | 				printf("vb -> dev_vb: cublasSetVector失敗しました。\n");
254 | 
255 | 			tmp_time = get_real_secv();
256 | 			magma_dgemv(MagmaNoTrans, dim, dim, alpha, dev_ma, dim, dev_vb, inc_vb, beta, dev_vc, inc_vc);
257 | 			cudaDeviceSynchronize(); // <--CPUタイマーを使う際には必須！
258 | 			running_time += get_real_secv() - tmp_time; // end
259 | 
260 | 			//printf("%f \n", get_real_secv() - running_time);
261 | 
262 | 			status = cublasGetVector(dim, sizeof(double), dev_vc, inc_vc, vc, inc_vc);
263 | 			if(status != CUBLAS_STATUS_SUCCESS)
264 | 				printf("dev_vc -> vc: cublasGetVector失敗しました。\n");
265 | 
266 | 			*total_running_time += get_real_secv() - tmp_total_time;
267 | 
268 | 			if((*total_running_time > 1.0) && (itimes > 5))
269 | 				break;
270 | 		}
271 | 
272 | 		// finalize
273 | 		magma_finalize();
274 | 
275 | 		*iterative_times = itimes;
276 | 	}
277 | #endif // USE_MAGMA
278 | 
279 | 	running_time /= (itimes + 1);
280 | 	*total_running_time /= (itimes + 1);
281 | 	//printf("itimes = %d, total_time, running_time = %f, %f \n", itimes, *total_running_time, running_time);
282 | 
283 | 	// on CPU
284 | 	cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, ma, dim, vb, inc_vb, beta, vc_host, inc_vc);
285 | 	// ||vc_host - vc||_F
286 | 	cblas_daxpy(dim, -1.0, vc, 1, vc_host, 1);
287 | 	//printf("||vc - vc_host||_F = %25.17e\n", cblas_snrm2(dim, vc_host, 1) / cblas_snrm2(dim, vc, 1));
288 | 
289 | 	// print
290 | /*	for(i = 0; i < dim; i++)
291 | 	{
292 | 		printf("%3d: ", i);
293 | 		for(j = 0; j < dim; j++)
294 | 			printf("%10f ", *(mc + i * dim + j));
295 | 		printf("\n");
296 | 	}
297 | */
298 | //	printf("dim = %d, Running Time(sec) = %lf (%lf ms)\n", dim, running_time, running_time * 1000);
299 | 	gflops = (2.0 * (double)dim * (double)dim - (double)dim) / running_time / 1024.0 / 1024.0 / 1024.0;
300 | 	//printf("%f Gflops\n", gflops);
301 | 
302 | 	// free on CPU
303 | 	free(ma);
304 | 	free(vb);
305 | 	free(vc);
306 | 
307 | 	// free on GPU
308 | 	mycuda_free(dev_ma);
309 | 	mycuda_free(dev_vb);
310 | 	mycuda_free(dev_vc);
311 | 
312 | 	cublasDestroy(handle);
313 | 
314 | 	return gflops;
315 | }
316 | #endif
317 | 
318 | #ifdef USE_ATLAS
319 | 	#define ALT_NTHREADS 4
320 | #endif
321 | 
322 | int main()
323 | {
324 | 	int start_dim, end_dim, step_dim, dim;
325 | 	char str_mkl_version[1024];
326 | 	int max_num_threads;
327 | 	double total_time[3] = {0.0, 0.0, 0.0};
328 | 	int iterative_times[3] = {0, 0, 0};
329 | 
330 | 	// print MKL version
331 | #ifdef USE_IMKL
332 | 	MKL_Get_Version_String(str_mkl_version, 1024);
333 | 	printf("%s\n", str_mkl_version);
334 | 
335 | 	max_num_threads = mkl_get_max_threads();
336 | 	printf("Max Number of Threads: %d\n", max_num_threads);
337 | 	mkl_set_num_threads(max_num_threads);
338 | #endif
339 | 
340 | 	// input dimension
341 | 	printf("Start Dim = "); scanf("%d", &start_dim);
342 | 	printf("End   Dim = "); scanf("%d", &end_dim);
343 | 	printf("Step  Dim = "); scanf("%d", &step_dim);
344 | 
345 | 	printf("  DIM  DGEMV(GFlops)");
346 | #ifdef USE_CUDA
347 | 	printf(" cuBLAS(GFlops)");
348 | #endif
349 | #ifdef USE_MAGMA
350 | 	printf(" magmablas(GFlops)");
351 | 	printf(" Total time(sec)");
352 | #endif
353 | 	printf("\n");
354 | 
355 | 	for(dim = start_dim; dim <= end_dim; dim += step_dim)
356 | 	{
357 | 		printf("%5d %10.3lg", dim, get_gflops_dgemv(0, dim, &total_time[0], &iterative_times[0]));
358 | #ifdef USE_CUDA
359 | 		printf(" %10.3lg", get_gflops_cuda_dgemv(0, dim, &total_time[1], &iterative_times[1]));
360 | #endif
361 | #ifdef USE_MAGMA
362 | 		printf(" %10.3lg", get_gflops_cuda_dgemv(1, dim, &total_time[2], &iterative_times[2]));
363 | #endif
364 | 		if(total_time[0] > 0.0)
365 | 			printf(" %10.3lg(%6d)", total_time[0], iterative_times[0]);
366 | 		if(total_time[1] > 0.0)
367 | 			printf(" %10.3lg(%6d)", total_time[1], iterative_times[1]);
368 | 		if(total_time[2] > 0.0)
369 | 			printf(" %10.3lg(%6d)", total_time[2], iterative_times[2]);
370 | 		printf("\n");
371 | 	}
372 | 
373 | 	return EXIT_SUCCESS;
374 | }
375 | 


--------------------------------------------------------------------------------
/first.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* First programming with float & double types   */
 4 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
 5 | /*************************************************/
 6 | #include <stdio.h>
 7 | #include <math.h>
 8 | 
 9 | int main()
10 | {
11 | 	float sc = 0.0, sa = -2.0, sb = 3.0;
12 | 	double dc = 0.0, da = -2.0, db = 3.0;
13 | 	double relerr;
14 | 
15 | 	// basic arithmetic: float
16 | 	printf("--- float data type(single precsion floating-point number) ---\n");
17 | 	sc = sa + sb;
18 | 	printf("%25.17e := %25.17e + %25.17e\n", sc, sa, sb);
19 | 	sc = sa - sb;
20 | 	printf("%25.17e := %25.17e - %25.17e\n", sc, sa, sb);
21 | 	sc = sa * sb;
22 | 	printf("%25.17e := %25.17e * %25.17e\n", sc, sa, sb);
23 | 	sc = sa / sb;
24 | 	printf("%25.17e := %25.17e / %25.17e\n", sc, sa, sb);
25 | 
26 | 	// absolute value and square root: float
27 | 	sc = fabsf(sa);
28 | 	printf("%25.17e := |%25.17e|\n", sc, sa);
29 | 	sc = sqrtf(sb);
30 | 	printf("%25.17e := sqrt(%25.17e)\n", sc, sb);
31 | 
32 | 	// basic arithmetic: double
33 | 	printf("--- double data type(double precsion floating-point number) ---\n");
34 | 	dc = da + db;
35 | 	printf("%25.17e := %25.17e + %25.17e\n", dc, da, db);
36 | 	dc = da - db;
37 | 	printf("%25.17e := %25.17e - %25.17e\n", dc, da, db);
38 | 	dc = da * db;
39 | 	printf("%25.17e := %25.17e * %25.17e\n", dc, da, db);
40 | 	dc = da / db;
41 | 	printf("%25.17e := %25.17e / %25.17e\n", dc, da, db);
42 | 
43 | 	// absolute value and square root: double
44 | 	dc = fabs(da);
45 | 	printf("%25.17e := |%25.17e|\n", dc, da);
46 | 	dc = sqrt(db);
47 | 	printf("%25.17e := sqrt(%25.17e)\n", dc, db);
48 | 
49 | 	// relative error of single precision square root
50 | 	relerr = fabs(sc - dc);
51 | 	if(fabs(dc) > 0.0)
52 | 		relerr /= fabs(dc);
53 | 
54 | 	printf("Single Prec.  : %25.17e\n", sc);
55 | 	printf("Double Prec.  : %25.17e\n", dc);
56 | 	printf("Relative Error: %10.3e\n", relerr);
57 | 
58 | 	return 0;
59 | }
60 | 


--------------------------------------------------------------------------------
/get_sec.c:
--------------------------------------------------------------------------------
  1 | /**********************************************/
  2 | /* get_sec.c:                                 */
  3 | /* Copyright (C) 2003-2016 Tomonori Kouya     */
  4 | /*                                            */
  5 | /* This library is free software; you can re- */
  6 | /* distribute it and/or modify it under the   */
  7 | /* terms of the GNU Lesser General Public     */
  8 | /* License as published by the Free Software  */
  9 | /* Foundation; either version 2.1 of the      */
 10 | /* License, or (at your option) any later     */
 11 | /* version.                                   */
 12 | /*                                            */
 13 | /* This library is distributed in the hope    */
 14 | /* that it will be useful, but WITHOUT ANY    */
 15 | /* WARRANTY; without even the implied         */
 16 | /* warranty of MERCHANTABILITY or FITNESS FOR */
 17 | /* A PARTICULAR PURPOSE.  See the GNU Lesser  */
 18 | /* General Public License for more details.   */
 19 | /**********************************************/
 20 | #include <stdio.h>
 21 | #include <time.h>
 22 | 
 23 | #ifdef WIN32
 24 | #include <windows.h>
 25 | #else
 26 | #include <sys/times.h>
 27 | #include <sys/time.h>
 28 | #ifndef CLK_TCK
 29 | 	#include <unistd.h>
 30 | 	#define CLK_TCK (sysconf(_SC_CLK_TCK))
 31 | #endif
 32 | #endif
 33 | 
 34 | #include "get_secv.h"
 35 | 
 36 | #define DIVTIMES 2
 37 | 
 38 | //#define USE_CLOCK
 39 | 
 40 | /* flag == 0: No print */
 41 | double get_sec(int flag)
 42 | {
 43 | #ifdef WIN32
 44 | 	static int first = 1;
 45 | 	static LARGE_INTEGER _tstart;
 46 | 	static LARGE_INTEGER freq;
 47 | 
 48 | 	if(first) {
 49 | 		QueryPerformanceFrequency(&freq);
 50 | 		first = 0;
 51 | 	}
 52 | 	QueryPerformanceCounter(&_tstart);
 53 | 	return ((double)_tstart.QuadPart)/((double)freq.QuadPart);
 54 | #else
 55 | 	double ret;
 56 | 
 57 | 	struct tms tmp;
 58 | 
 59 | #ifdef USE_CLOCK
 60 | 	if(flag != 0)
 61 | 		printf("Time  : %d / %d\n", (int)clock(), CLOCKS_PER_SEC);
 62 | 	ret = (double)(clock()) / CLOCKS_PER_SEC;
 63 | #else
 64 | #ifdef USE_MPFRTIME
 65 | //int
 66 | //cputime ()
 67 | //{
 68 | #include <sys/types.h>
 69 | #include <sys/resource.h>
 70 | //
 71 |   struct rusage rus;
 72 | 
 73 |   getrusage (0, &rus);
 74 |   //return rus.ru_utime.tv_sec * 1000 + rus.ru_utime.tv_usec / 1000;
 75 |   ret =  rus.ru_utime.tv_sec + rus.ru_utime.tv_usec / 1000 / 1000;
 76 | //}
 77 | #else
 78 | 	times(&tmp);
 79 | 	if(flag != 0)
 80 | 	{
 81 | 		printf("User Time  : %ld / %ld\n", tmp.tms_utime, CLK_TCK);
 82 | 		printf("System Time: %ld / %ld\n", tmp.tms_stime, CLK_TCK);
 83 | 		printf("CUser Time  : %ld / %ld\n", tmp.tms_cutime, CLK_TCK);
 84 | 		printf("CSystem Time: %ld / %ld\n", tmp.tms_cstime, CLK_TCK);
 85 | 		printf("Ret(utime+stime)   : %g\n", (double)(tmp.tms_utime + tmp.tms_stime) / CLK_TCK);
 86 | 		printf("Ret(cutime+cstime) : %g\n", (double)(tmp.tms_cutime + tmp.tms_cstime) / CLK_TCK);
 87 | 	}
 88 | 	ret = (double)(tmp.tms_utime + tmp.tms_stime) / CLK_TCK;
 89 | #endif
 90 | #endif
 91 | 	return ret;
 92 | #endif
 93 | }
 94 | 
 95 | /* double get_secv(void) */
 96 | double get_secv(void)
 97 | {
 98 | 	return get_sec(0);
 99 | }
100 | 
101 | /* float fget_sec(int flag) */
102 | float fget_sec(int flag)
103 | {
104 | 	return (float)get_sec(flag);
105 | }
106 | 
107 | /* float fget_sec(void) */
108 | float fget_secv(void)
109 | {
110 | 	return (float)get_sec(0);
111 | }
112 | 
113 | /* get REAL time */
114 | /* for Multi-threads programming */
115 | double get_real_sec(int flag)
116 | {
117 | #ifdef WIN32
118 | 	static int first = 1;
119 | 	static LARGE_INTEGER _tstart;
120 | 	static LARGE_INTEGER freq;
121 | 
122 | 	if(first) {
123 | 		QueryPerformanceFrequency(&freq);
124 | 		first = 0;
125 | 	}
126 | 	QueryPerformanceCounter(&_tstart);
127 | 	return ((double)_tstart.QuadPart)/((double)freq.QuadPart);
128 | #else
129 | 	double ret;
130 | 	struct timeval tmp;
131 | 
132 | 	gettimeofday(&tmp, NULL);
133 | 	if(flag != 0)
134 | 	{
135 | 		printf("tv_sec : %ld\n", tmp.tv_sec);
136 | 		printf("tv_usec: %ld\n", tmp.tv_usec);
137 | 		printf("Ret    : %g\n", (double)tmp.tv_sec + (double)tmp.tv_usec / 1000.0 / 1000.0);
138 | 	}
139 | 	ret = (double)tmp.tv_sec + (double)tmp.tv_usec / 1000.0 / 1000.0;
140 | 
141 | 	return ret;
142 | #endif
143 | }
144 |  
145 | double get_real_secv(void)
146 | {
147 | 	return get_real_sec(0);
148 | }
149 | 
150 | float fget_real_sec(int flag)
151 | {
152 | 	return (float)get_real_sec(flag);
153 | }
154 | 
155 | float fget_real_secv(void)
156 | {
157 | 	return (float)get_real_sec(0);
158 | }
159 | 


--------------------------------------------------------------------------------
/get_secv.h:
--------------------------------------------------------------------------------
 1 | /**********************************************/
 2 | /* get_secv.h:                                */
 3 | /* Copyright (C) 2003-2016 Tomonori Kouya     */
 4 | /*                                            */
 5 | /* This library is free software; you can re- */
 6 | /* distribute it and/or modify it under the   */
 7 | /* terms of the GNU Lesser General Public     */
 8 | /* License as published by the Free Software  */
 9 | /* Foundation; either version 2.1 of the      */
10 | /* License, or (at your option) any later     */
11 | /* version.                                   */
12 | /*                                            */
13 | /* This library is distributed in the hope    */
14 | /* that it will be useful, but WITHOUT ANY    */
15 | /* WARRANTY; without even the implied         */
16 | /* warranty of MERCHANTABILITY or FITNESS FOR */
17 | /* A PARTICULAR PURPOSE.  See the GNU Lesser  */
18 | /* General Public License for more details.   */
19 | /**********************************************/
20 | #ifndef __TK_GET_SECV_H
21 | #define __TK_GET_SECV_H
22 | 
23 | double get_sec(int flag);
24 | 
25 | /* double get_secv(void) */
26 | double get_secv(void);
27 | 
28 | /* float fget_sec(int flag) */
29 | float fget_sec(int flag);
30 | 
31 | /* float fget_sec(void) */
32 | float fget_secv(void);
33 | 
34 | /* get REAL time */
35 | /* for Multi-threads programming */
36 | double get_real_sec(int flag);
37 |  
38 | double get_real_secv(void);
39 | 
40 | float fget_real_sec(int flag);
41 | 
42 | float fget_real_secv(void);
43 | 
44 | #endif // __TK_GET_SECV_H
45 | 


--------------------------------------------------------------------------------
/integral_eq/Makefile.unix:
--------------------------------------------------------------------------------
 1 | #*************************************************#
 2 | # LAPACK/BLAS Tutorial                            #
 3 | # Makefile for Linux gcc or icc environment       #
 4 | # Last Update: 2016-12-02 (Fri) T.Kouya           #
 5 | #*************************************************#
 6 | # Intel C compiler
 7 | include ../lapack_icc.inc
 8 | 
 9 | # GNU Compiler Collection
10 | #include ../lapack_gcc.inc
11 | 
12 | all: gauss_integral
13 | 
14 | get_secv: ../get_secv.h ../get_sec.c
15 | 	$(CC) $(LAPACKE_INC) -I../ -c ../get_sec.c -o get_sec.o
16 | 
17 | gauss_integral: get_secv gauss_integral.c iteration.c ../tkaux.c ../tkaux.h
18 | 	$(CC) -DDEBUG $(LAPACKE_INC) -I../ gauss_integral.c -o gauss_integral $(LAPACKE_LIB)
19 | 	$(CC) $(LAPACKE_INC) -I../ -c ../tkaux.c -o tkaux.o
20 | 	$(CC) $(LAPACKE_INC) -I../ -c gauss_integral.c -o gauss_integral.o
21 | 	$(CC) $(LAPACKE_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o secant $(LAPACKE_LIB)
22 | 	$(CC) -DUSE_NEW_SECANT $(LAPACKE_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o new_secant $(LAPACKE_LIB)
23 | 
24 | gauss_integral_imkl: get_secv gauss_integral.c iteration.c ../tkaux.c ../tkaux.h
25 | 	$(CC) $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o secant_imkl $(IMKL_LIB)
26 | 	$(CC) -DUSE_PARALLEL_DIFFMAT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o secant_imkl_pd $(IMKL_LIB)
27 | 	$(CC) -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o new_secant_imkl $(IMKL_LIB)
28 | 	$(CC) -DUSE_PARALLEL_DIFFMAT -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.o tkaux.o get_sec.o -o new_secant_imkl_pd $(IMKL_LIB)
29 | 
30 | clean:
31 | 	-rm *.o
32 | 	-rm gauss_integral
33 | 	-rm secant
34 | 	-rm secant_imkl
35 | 	-rm secant_imkl_pd
36 | 	-rm new_secant
37 | 	-rm new_secant_imkl
38 | 	-rm new_secant_imkl_pd
39 | 


--------------------------------------------------------------------------------
/integral_eq/Makefile.win_intel:
--------------------------------------------------------------------------------
 1 | include ../lapack_win_intel.inc
 2 | 
 3 | INC = /I..\windows
 4 | 
 5 | all: gauss_integral
 6 | 
 7 | get_secv: ../get_secv.h ../get_sec.c
 8 | 	$(CC) $(LAPACKE_INC) -I../ -c ../get_sec.c -o get_sec.o
 9 | 
10 | gauss_integral: get_secv gauss_integral.c iteration.c ../tkaux.c ../tkaux.h
11 | 	$(CC) -DDEBUG $(LAPACKE_INC) -I../ gauss_integral.c -o gauss_integral $(LAPACKE_LIB)
12 | 	$(CC) $(LAPACKE_INC) -I../ -c ../tkaux.c -o tkaux.obj
13 | 	$(CC) $(LAPACKE_INC) -I../ -c gauss_integral.c -o gauss_integral.obj
14 | 	$(CC) $(LAPACKE_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o secant $(LAPACKE_LIB)
15 | 	$(CC) $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o secant_imkl $(IMKL_LIB)
16 | 	$(CC) -DUSE_PARALLEL_DIFFMAT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o secant_imkl_pd $(IMKL_LIB)
17 | 	$(CC) -DUSE_NEW_SECANT $(LAPACKE_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o new_secant $(LAPACKE_LIB)
18 | 	$(CC) -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o new_secant_imkl $(IMKL_LIB)
19 | 	$(CC) -DUSE_PARALLEL_DIFFMAT -DUSE_NEW_SECANT $(OPENMP) -DUSE_IMKL $(IMKL_INC) -I../ iteration.c gauss_integral.obj tkaux.obj get_sec.obj -o new_secant_imkl_pd $(IMKL_LIB)
20 | 
21 | clean:
22 | 	-del *.obj
23 | 	-del *.exe
24 | 


--------------------------------------------------------------------------------
/integral_eq/ex1.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Hammerstein Integral equation                 */
  4 | /* Last Update: 2016-12-02 (Fri) T.Kouya         */
  5 | /*************************************************/
  6 | //
  7 | // Example 1: x(s) = 1 + 1/2 * int[0, 1] K(s, t) (|x(t)| + (x(t))^2) dt
  8 | //            where K(s, t) = (1 - s) * t (t \le s) or (1 - t) * s (s \le t)
  9 | //
 10 | double get_bij(int i, int j, double *abscissa, double *weight)
 11 | {
 12 | 	double ret;
 13 | 
 14 | 	// abscissa[i] = s_i
 15 | 	// abscissa[j] = t_j
 16 | 	if(j <= i)
 17 | 		ret = weight[j] * abscissa[j] * (1.0 - abscissa[i]);
 18 | 	else
 19 | 		ret = weight[j] * abscissa[i] * (1.0 - abscissa[j]);
 20 | 
 21 | 	return ret;
 22 | }
 23 | 
 24 | void get_bmatrix(double *B, int row_dim, int col_dim, double *abscissa, double *weight)
 25 | {
 26 | 	int i, j;
 27 | 
 28 | #ifdef _OPENMP
 29 | 	#pragma omp parallel for private(j)
 30 | #endif // _OPENMP
 31 | 	for(i = 0; i < row_dim; i++)
 32 | 		for(j = 0; j < col_dim; j++)
 33 | 			B[i * col_dim + j] = get_bij(i, j, abscissa, weight);
 34 | 
 35 | }
 36 | 
 37 | double *dmat_b;
 38 | double *dvec_abscissa, *dvec_weight, *dvec_xhat, *dvec_xsqr;
 39 | 
 40 | void init_derivative_free_iteration_dvector(int dim)
 41 | {
 42 | 	dvec_abscissa = (double *)calloc(dim, sizeof(double));
 43 | 	dvec_weight = (double *)calloc(dim, sizeof(double));
 44 | 	dvec_xhat = (double *)calloc(dim, sizeof(double));
 45 | 	dvec_xsqr = (double *)calloc(dim, sizeof(double));
 46 | 
 47 | 	gauss_integral_eig_d(dvec_abscissa, dvec_weight, dim, GAUSS_LEGENDRE);
 48 | 	dshifted_gauss_legendre(0.0, 1.0, dvec_abscissa, dvec_weight, dim);
 49 | 
 50 | 	dmat_b = (double *)calloc(dim * dim, sizeof(double));
 51 | 
 52 | 	get_bmatrix(dmat_b, dim, dim, dvec_abscissa, dvec_weight);
 53 | 
 54 | }
 55 | 
 56 | void free_derivative_free_iteration_dvector(void)
 57 | {
 58 | 	free(dvec_abscissa);
 59 | 	free(dvec_weight);
 60 | 	free(dvec_xhat);
 61 | 	free(dvec_xsqr);
 62 | 
 63 | 	free(dmat_b);
 64 | }
 65 | 
 66 | // ret = |x|
 67 | void abs_dvector(double *ret, double *x, int dim)
 68 | {
 69 | 	int i;
 70 | 
 71 | #ifdef _OPENMP
 72 | 	#pragma omp parallel for
 73 | #endif // _OPENMP
 74 | 	for(i = 0; i < dim; i++)
 75 | 		ret[i] = fabs(x[i]);
 76 | }
 77 | 
 78 | // ret = x^2
 79 | void sqr_dvector(double *ret, double *x, int dim)
 80 | {
 81 | 	int i;
 82 | 
 83 | #ifdef _OPENMP
 84 | 	#pragma omp parallel for
 85 | #endif // _OPENMP
 86 | 	for(i = 0; i < dim; i++)
 87 | 		ret[i] = x[i] * x[i];
 88 | }
 89 | 
 90 | // ret = 1
 91 | void allone_dvector(double *ret, int dim)
 92 | {
 93 | 	int i;
 94 | 
 95 | #ifdef _OPENMP
 96 | 	#pragma omp parallel for
 97 | #endif // _OPENMP
 98 | 	for(i = 0; i < dim; i++)
 99 | 		ret[i] = 1.0;
100 | }
101 | 
102 | // double vfunc_index(int, DVector)
103 | double vf_index(int index, double *x, int dim)
104 | {
105 | 	double ret, tmp;
106 | 	int j;
107 | 
108 | 	// 0.5 * B * (x_hat + x_sqr)
109 | 	tmp = 0.0;
110 | #ifdef _OPENMP
111 | 	#pragma omp parallel for reduction(+:tmp)
112 | #endif // _OPENMP
113 | 	for(j = 0; j < dim; j++)
114 | 		tmp += dmat_b[index * dim + j] * (fabs(x[j]) + x[j] * x[j]);
115 | 
116 | 	ret = x[index] - 1.0 - 0.5 * tmp;
117 | 
118 | 	return ret;
119 | }
120 | 
121 | 
122 | // void vfunc(DVector, DVector)
123 | void vf(double *ret_vec, double *x, int dim)
124 | {
125 | 	double ret;
126 | 	int i;
127 | 	double *tmp_vec;
128 | 
129 | 	tmp_vec = (double *)calloc(dim, sizeof(double));
130 | 
131 | 	// 0.5 * B * (x_hat + x_sqr)
132 | 	abs_dvector(dvec_xhat, x, dim);
133 | 	sqr_dvector(dvec_xsqr, x, dim);
134 | 	cblas_daxpy(dim, 1.0, dvec_xhat, 1, dvec_xsqr, 1);
135 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, dmat_b, dim, dvec_xsqr, 1, 0.0, ret_vec, 1);
136 | 	cblas_dscal(dim, 0.5, ret_vec, 1);
137 | 
138 | 	// x - 1 - 0.5 * B * (x_hat + x_sqr)
139 | 	allone_dvector(tmp_vec, dim);
140 | 	cblas_daxpy(dim, 1.0, ret_vec, 1, tmp_vec, 1);
141 | 	cblas_dcopy(dim, x, 1, ret_vec, 1);
142 | 	cblas_daxpy(dim, -1.0, tmp_vec, 1, ret_vec, 1);
143 | 
144 | 	free(tmp_vec);
145 | }
146 | 


--------------------------------------------------------------------------------
/integral_eq/gauss_integral.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Gaussian integration formulas                 */
  4 | /*                              with LAPACK/BLAS */
  5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | #ifdef USE_IMKL
 12 | 	#include "mkl.h" // for Intel Math Kernel Library
 13 | 	#include "mkl_cblas.h"
 14 | 	#include "mkl_lapacke.h"
 15 | #else // USE_IMKL
 16 | 	#include "cblas.h"
 17 | 	#include "lapacke.h"
 18 | #endif // USE_IMKL
 19 | 
 20 | #include "gauss_integral.h"
 21 | 
 22 | /* trimat:= [vec[1][0] vec[0][0  ]             0 ... 0     ] */
 23 | /*          [vec[2][0] vec[1][1  ] vec[0][1  ] 0 ... 0     ] */
 24 | /*          [        ..............................        ] */
 25 | /*          [0 ... 0   vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */
 26 | /*          [0 ... 0   0           vec[2][n-2] vec[1][n-1] ] */
 27 | /*                                                           */
 28 | int init_dmatrix_tri(double *trimat[3], int dim)
 29 | {
 30 | 	if((trimat[0] = (double *)calloc(sizeof(double), dim - 1)) == NULL)
 31 | 		return -1;
 32 | 
 33 | 	if((trimat[1] = (double *)calloc(sizeof(double), dim)) == NULL)
 34 | 	{
 35 | 		free(trimat[0]);
 36 | 		return -2;
 37 | 	}
 38 | 	if((trimat[2] = (double *)calloc(sizeof(double), dim - 1)) == NULL)
 39 | 	{
 40 | 		free(trimat[0]);
 41 | 		free(trimat[1]);
 42 | 		return -3;
 43 | 	}
 44 | 
 45 | 	return 0;
 46 | }
 47 | 
 48 | void free_dmatrix_tri(double *trimat[3])
 49 | {
 50 | 	free(trimat[0]);
 51 | 	free(trimat[1]);
 52 | 	free(trimat[2]);
 53 | }
 54 | 
 55 | void print_dmatrix_tri(double *trimat[3], int dim)
 56 | {
 57 | 	long int i;
 58 | 
 59 | 	printf("%3d:                          %25.17e %25.17e\n", 0, trimat[1][0], trimat[0][0]);
 60 | 	for(i = 1; i < dim - 1; i++)
 61 | 		printf("%3ld:%25.17e %25.17e %25.17e\n", i, trimat[2][i - 1], trimat[1][i], trimat[0][i]);
 62 | 	printf("%3ld:%25.17e %25.17e\n", i, trimat[2][dim - 2], trimat[1][dim - 1]);
 63 | }
 64 | 
 65 | /* coef[2] x^2 + coef[1] x + coef[0] = 0 */
 66 | /* Output: ans_re[0] + sqrt(-1)*ans_im[0], ans_re[1] + sqrt(-1) * ans_im[1] */
 67 | int dquadratic_eq(double ans_re[2], double ans_im[2], double coef[3])
 68 | {
 69 | 	double d, den, tmp;
 70 | 
 71 | //	printf("coef: %10.3e x^2 + %10.3e x + %10.3e = 0\n", coef[2], coef[1], coef[0]);
 72 | 
 73 | 	ans_re[0] = 0.0;
 74 | 	ans_re[1] = 0.0;
 75 | 	ans_im[0] = 0.0;
 76 | 	ans_im[1] = 0.0;
 77 | 
 78 | 	if(coef[2] == 0.0)
 79 | 	{
 80 | 		if(coef[1] == 0.0)
 81 | 		{
 82 | 			fprintf(stderr, "ERROR: No answers (dquadraric_eq)\n");
 83 | 			return -1;
 84 | 		}
 85 | 		ans_re[0] = -coef[0] / coef[1];
 86 | 		return 1; // number of answers
 87 | 	}
 88 | 
 89 | 	den = 2.0 * coef[2];
 90 | 	d = coef[1] * coef[1] - 4.0 * coef[2] * coef[0];
 91 | 	ans_re[0] = -coef[1] / den;
 92 | 
 93 | 	/* anss are real numbers */
 94 | 	if(d == 0.0)
 95 | 	{
 96 | 		ans_re[1] = ans_re[0];
 97 | 	}
 98 | 	else if(d > 0)
 99 | 	{
100 | 		tmp = sqrt(d);
101 | 		if(coef[1] > 0.0)
102 | 			tmp = -tmp / den;
103 | 		else
104 | 			tmp = tmp / den;
105 | 		ans_re[0] -= tmp;
106 | 		ans_re[1] = coef[0] / (coef[2] * ans_re[0]);
107 | 	}
108 | 	/* complex numbers */
109 | 	else
110 | 	{
111 | 		tmp = sqrt(-d) / den;
112 | 		ans_re[1] = ans_re[0];
113 | 		ans_im[0] = tmp;
114 | 		ans_im[1] = -tmp;
115 | 	}
116 | 	return 2;
117 | }
118 | 
119 | 
120 | /* Eigen polynomial of Unsymmetric Real Tridiagonal Matrix   */
121 | /*                                                           */
122 | /* trimat:= [vec[1][0] vec[0][0  ]             0 ... 0     ] */
123 | /*          [vec[2][0] vec[1][1  ] vec[0][1  ] 0 ... 0     ] */
124 | /*          [        ..............................        ] */
125 | /*          [0 ... 0   vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */
126 | /*          [0 ... 0   0           vec[2][n-2] vec[1][n-1] ] */
127 | /*                                                           */
128 | void get_dtrimat_legendre(double *trimat[3], double *diagmat, int dim)
129 | {
130 | 	long int i, j, k;
131 | 	double tmp, a1, a2, a3, a4, a, b, c;
132 | 	double di, dip1;
133 | 
134 | 	/* Frank Matrix */
135 | 	for(i = 0; i < dim; i++)
136 | 	{
137 | 		// Legendre
138 | 		a1 = (double)(i + 1);
139 | 		a2 = 0.0;
140 | 		a3 = (double)(2 * i + 1);
141 | 		a4 = (double)i;
142 | 
143 | 		a = a3 / a1;
144 | 		b = a2 / a1;
145 | 		c = a4 / a1;
146 | 		for(j = 0; j < dim; j++)
147 | 		{
148 | 			if(j == (i - 1))
149 | 			{
150 | 				tmp = c / a;
151 | 				trimat[2][j] = tmp;
152 | 			}
153 | 			else if(j == i)
154 | 			{
155 | 				tmp = -b / a;
156 | 				trimat[1][j] = tmp;
157 | 			}
158 | 			else if(j == (i + 1))
159 | 			{
160 | 				tmp = 1.0 / a;
161 | 				trimat[0][j - 1] = tmp;
162 | 			}
163 | 		}
164 | 	}
165 | 
166 | 	/* to be Symmetric */
167 | 	di = 1.0;
168 | 	diagmat[0] = di;
169 | 	for(i = 0; i < dim - 1; i++)
170 | 	{
171 | 		dip1 = sqrt(trimat[0][i] / trimat[2][i]) * di;
172 | 		trimat[0][i] = di / dip1 * trimat[0][i];
173 | 		trimat[2][i] = dip1 / di * trimat[2][i];
174 | 		di = dip1;
175 | 		diagmat[i + 1] = dip1;
176 | 	}
177 | }
178 | 
179 | 
180 | /* Eigen polynomial of Unsymmetric Real Tridiagonal Matrix   */
181 | /*                                                           */
182 | /* trimat:= [vec[1][0] vec[0][0  ]             0 ... 0     ] */
183 | /*          [vec[2][0] vec[1][1  ] vec[0][1  ] 0 ... 0     ] */
184 | /*          [        ..............................        ] */
185 | /*          [0 ... 0   vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */
186 | /*          [0 ... 0   0           vec[2][n-2] vec[1][n-1] ] */
187 | /*                                                           */
188 | void get_dtrimat_leguerre(double *trimat[3], double *diagmat, int dim)
189 | {
190 | 	long int i, j, k;
191 | 	double tmp, a1, a2, a3, a4, a, b, c;
192 | 	double di, dip1;
193 | 
194 | 	/* Frank Matrix */
195 | 	for(i = 0; i < dim; i++)
196 | 	{
197 | 		// Leguerre: alpha = 0.0
198 | 		a1 = (double)(i + 1);
199 | 		a2 = (double)(2 * i + 1);
200 | 		a3 = (double)(-1);
201 | 		a4 = (double)i;
202 | 
203 | 		a = a3 / a1;
204 | 		b = a2 / a1;
205 | 		c = a4 / a1;
206 | 		for(j = 0; j < dim; j++)
207 | 		{
208 | 			if(j == (i - 1))
209 | 			{
210 | 				trimat[2][j] = c / a;
211 | 			}
212 | 			else if(j == i)
213 | 			{
214 | 				trimat[1][j] = -b / a;
215 | 			}
216 | 			else if(j == (i + 1))
217 | 			{
218 | 				trimat[0][j - 1] = 1.0 / a;
219 | 			}
220 | 		}
221 | 	}
222 | 
223 | 	/* to be Symmetric */
224 | 	di = 1.0;
225 | 	diagmat[0] = di;
226 | 
227 | 	for(i = 0; i < dim - 1; i++)
228 | 	{
229 | 		dip1 = sqrt(trimat[0][i] / trimat[2][i]) * di;
230 | 		trimat[0][i] = di / dip1 * trimat[0][i];
231 | 
232 | 		trimat[2][i] = dip1 / di * trimat[2][i];
233 | 		di = dip1;
234 | 		diagmat[i + 1] = dip1;
235 | 	}
236 | 
237 | }
238 | 
239 | /* Eigen polynomial of Unsymmetric Real Tridiagonal Matrix   */
240 | /*                                                           */
241 | /* trimat:= [vec[1][0] vec[0][0  ]             0 ... 0     ] */
242 | /*          [vec[2][0] vec[1][1  ] vec[0][1  ] 0 ... 0     ] */
243 | /*          [        ..............................        ] */
244 | /*          [0 ... 0   vec[2][n-3] vec[1][n-2] vec[0][n-2] ] */
245 | /*          [0 ... 0   0           vec[2][n-2] vec[1][n-1] ] */
246 | /*                                                           */
247 | void get_dtrimat_hermite(double *trimat[3], double *diagmat, int dim)
248 | {
249 | 	long int i, j, k;
250 | 	double tmp, a1, a2, a3, a4, a, b, c, next_a, next_c;
251 | 	double di, dip1;
252 | 
253 | 	/* Frank Matrix */
254 | 	for(i = 0; i < dim; i++)
255 | 	{
256 | 		// Hermite
257 | 		a1 = (double)(1);
258 | 		a2 = (double)(0);
259 | 		a3 = (double)(2);
260 | 		a4 = (double)(2 * i);
261 | 
262 | 		a = a3 / a1;
263 | 		b = a2 / a1;
264 | 		c = a4 / a1;
265 | 		for(j = 0; j < dim; j++)
266 | 		{
267 | 			if(j == (i - 1))
268 | 			{
269 | 				trimat[2][j] = c / a;
270 | 			}
271 | 			else if(j == i)
272 | 			{
273 | 				trimat[1][j] = -b / a;
274 | 			}
275 | 			else if(j == (i + 1))
276 | 			{
277 | 				trimat[0][j - 1] = 1.0 / a;
278 | 			}
279 | 		}
280 | 	}
281 | 
282 | 	/* to be Symmetric */
283 | 	for(i = 0; i < dim; i++)
284 | 	{
285 | 		// Hermite
286 | 		a1 = (double)(1);
287 | 		a2 = (double)(0);
288 | 		a3 = (double)(2);
289 | 		a4 = (double)(2 * i);
290 | 
291 | 		a = a3 / a1;
292 | 		b = a2 / a1;
293 | 		c = a4 / a1;
294 | 		next_a = a;                          // ONLY for Hermite!!
295 | 		next_c = (double)(2 * (i + 1)) / a1; // ONLY for Hermite!!
296 | 		for(j = 0; j < dim; j++)
297 | 		{
298 | 			if(j == (i - 1))
299 | 			{
300 | 				trimat[2][j] = trimat[0][j];
301 | 			}
302 | 			else if(j == i)
303 | 			{
304 | 				trimat[1][j] = -b / a;
305 | 			}
306 | 			else if(j == (i + 1))
307 | 			{
308 | 				trimat[0][j - 1] = sqrt(next_c / (a * next_a));
309 | 			}
310 | 		}
311 | 		
312 | 	}
313 | 
314 | }
315 | 
316 | //
317 | void gauss_integral_eig_d(double *abscissa, double *weight, int deg, int gauss_int_coef)
318 | {
319 | 	double *dtrimat[3], *dtrimat_org[3];
320 | 	double *dinit_vec, *dweight_vecs;
321 | 	double dmu0, dweight, daeps, dreps;
322 | 	int i, j;
323 | 
324 | /* Double */
325 | 	/* initialize */
326 | 	init_dmatrix_tri(dtrimat, deg);
327 | 	init_dmatrix_tri(dtrimat_org, deg);
328 | 	dinit_vec = (double *)calloc(sizeof(double), deg);
329 | 	dweight_vecs = (double *)calloc(sizeof(double), deg * deg);
330 | 
331 | 	/* get problem */
332 | 	/* abscissas & weight */
333 | 	switch(gauss_int_coef)
334 | 	{
335 | 		case GAUSS_LEGUERRE:
336 | 			get_dtrimat_leguerre(dtrimat, dinit_vec, deg);
337 | 			get_dtrimat_leguerre(dtrimat_org, dinit_vec, deg);
338 | 			dmu0 = 1.0;
339 | 			break;
340 | 		case GAUSS_HERMITE:
341 | 			get_dtrimat_hermite(dtrimat, dinit_vec, deg);
342 | 			get_dtrimat_hermite(dtrimat_org, dinit_vec, deg);
343 | 			dmu0 = sqrt(M_PI);
344 | 			break;
345 | 		default: // gauss_legendre
346 | 		case GAUSS_LEGENDRE:
347 | 			get_dtrimat_legendre(dtrimat, dinit_vec, deg);
348 | 			get_dtrimat_legendre(dtrimat_org, dinit_vec, deg);
349 | 			dmu0 = 2.0;
350 | 			break;
351 | 	}
352 | 
353 | 	// dweight_vecs := I
354 | 	for(i = 0; i < deg; i++)
355 | 	{
356 | 		for(j = 0; j < deg; j++)
357 | 			dweight_vecs[i * deg + j] = 0.0;
358 | 		dweight_vecs[i * deg + i] = 1.0;
359 | 	}
360 | 
361 | 	// eigenvalues and eigenvectors of symmetric  tridiagonal matrix
362 | 	LAPACKE_dsteqr(LAPACK_ROW_MAJOR, 'V', deg, dtrimat[1], dtrimat[2], dweight_vecs, deg);
363 | 
364 | /*	printf("Eigenvalues: \n");
365 | 	for(i = 0; i < deg; i++)
366 | 	{
367 | 		printf("%3d: %10g\n", i, dtrimat[1][i]);
368 | 		for(j = 0; j < deg; j++)
369 | 			printf("\t %10g\n", dweight_vecs[i * deg]);
370 | 	}
371 | */
372 | //	print_dmatrix_tri(dtrimat);
373 | //	print_dvector(dinit_vec);
374 | 
375 | 	/* abscissa */
376 | 	cblas_dcopy(deg, dtrimat[1], 1, abscissa, 1);
377 | 
378 | 	for(i = 0; i < deg; i++)
379 | 	{
380 | 		dweight = dweight_vecs[i];
381 | 		dweight = dweight * dweight * dmu0;
382 | 		//printf("%5d, %25.17e, %25.17e\n", i, dtrimat[1][i], dweight);
383 | 
384 | 		/* weight */
385 | 		weight[i] = dweight;
386 | 	}
387 | 
388 | 	/* free */
389 | 	free_dmatrix_tri(dtrimat);
390 | 	free_dmatrix_tri(dtrimat_org);
391 | 	free(dinit_vec);
392 | 	free(dweight_vecs);
393 | }
394 | 
395 | //shifted_gauss_legendre(dvec_abscissa, dvec_weight, dim);
396 | void dshifted_gauss_legendre(double min_val, double max_val, double *abscissa, double *weight, int deg)
397 | {
398 | 	int i, div;
399 | 	double x, trans1, trans2;
400 | 
401 | 	div = deg;
402 | 
403 | 	/* transform of variables */
404 | 	trans1 = (max_val - min_val) / 2.0;
405 | 	trans2 = (max_val + min_val) / 2.0;
406 | 
407 | 	/* Main loop */
408 | 	/* sum^N w_i * func(t) */
409 | 	for(i = 0; i < div; i++)
410 | 	{
411 | 		/* x = (b - a) / 2 * t + (b + a) / 2 */
412 | 		abscissa[i] = trans1 * abscissa[i] + trans2;
413 | 
414 | 		//printf("%25.17e, %25.17e\n", abscissa[i], weight[i]);
415 | 	}
416 | }
417 | 
418 | /* int[max_val, min_val] func(x) dx */
419 | double dgauss_legendre_integral(double min_val, double max_val, double (* func)(double), double *abscissa, double *weight, int deg)
420 | {
421 | 	int i, div;
422 | 	double x, trans1, trans2, func_val, ret;
423 | 
424 | 	div = deg;
425 | 
426 | 	/* transform of variables */
427 | 	trans1 = (max_val - min_val) / 2.0;
428 | 	trans2 = (max_val + min_val) / 2.0;
429 | 
430 | 	ret = 0.0;
431 | 
432 | 	/* Main loop */
433 | 	/* sum^N w_i * func(t) */
434 | 	for(i = 0; i < div; i++)
435 | 	{
436 | 		/* x = (b - a) / 2 * t + (b + a) / 2 */
437 | 		x = trans1 * abscissa[i] + trans2;
438 | 		ret += func(x) * weight[i];
439 | 	}
440 | 	ret *= trans1;
441 | 
442 | 	return ret;
443 | }
444 | 
445 | #ifdef DEBUG
446 | 
447 | // usage
448 | void usage(const char *progname)
449 | {
450 | 	printf("$ %s [kind of Gaussian Integration Scheme] [deg]\n", progname);
451 | 	printf("     0: Gauss-Legendre, 1: Gauss-Leguerre, 2: Gauss-Hermite\n");
452 | }
453 | 
454 | // ex1: cos(x)
455 | double ex1_func(double x)
456 | {
457 | 	return cos(x);
458 | }
459 | 
460 | // ex2: x^2
461 | double ex2_func(double x)
462 | {
463 | 	return x * x;
464 | }
465 | 
466 | int main(int argc, char *argv[])
467 | {
468 | 	int deg = 3, kind_of_scheme, i;
469 | 	double *abscissa, *weight;
470 | 
471 | 	if(argc <= 1)
472 | 	{
473 | 		usage(argv[0]);
474 | 		return EXIT_SUCCESS;
475 | 	}
476 | 
477 | 	kind_of_scheme = atoi(argv[1]);
478 | 
479 | 	if(kind_of_scheme == 1)
480 | 	{
481 | 		kind_of_scheme = GAUSS_LEGUERRE;
482 | 		printf("Gauss-Leguerre ");
483 | 	}
484 | 	else if(kind_of_scheme == 2)
485 | 	{
486 | 		kind_of_scheme = GAUSS_HERMITE;
487 | 		printf("Gauss-Hermite ");
488 | 	}
489 | 	else
490 | 	{
491 | 		kind_of_scheme = GAUSS_LEGENDRE;
492 | 		printf("Gauss-Legendre ");
493 | 	}
494 | 
495 | 	if(argc >= 3)
496 | 	{
497 | 		deg = atoi(argv[2]);
498 | 		if(deg <= 1)
499 | 			deg = 2;
500 | 	}
501 | 
502 | 	printf(" %d points Formura:\n", deg);
503 | 
504 | 	abscissa = (double *)calloc(deg, sizeof(double));
505 | 	weight = (double *)calloc(deg, sizeof(double));
506 | 
507 | 	gauss_integral_eig_d(abscissa, weight, deg, kind_of_scheme);
508 | 
509 | 	for(i = 0; i < deg; i++)
510 | 		printf("%5d: %25.17e %25.17e\n", i, abscissa[i], weight[i]);
511 | 
512 | 	if(kind_of_scheme == GAUSS_LEGENDRE)
513 | 	{
514 | 		// Check values of constant integrals with the given Gauss-Legedre formula
515 | 		printf("ex1: int[0, PI/2] cos x dx = 1 approx %25.17e\n", dgauss_legendre_integral(0.0, M_PI / 2, ex1_func, abscissa, weight, deg));
516 | 		printf("ex2: int[0, 1   ] x^2 dx = 1/3 approx %25.17e\n", dgauss_legendre_integral(0.0, 1.0, ex2_func, abscissa, weight, deg));
517 | 	}
518 | 
519 | 	free(abscissa);
520 | 	free(weight);
521 | 
522 | 	return EXIT_SUCCESS;
523 | }
524 | #endif // DEBUG


--------------------------------------------------------------------------------
/integral_eq/gauss_integral.h:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Gaussian integration formulas                 */
 4 | /*                              with LAPACK/BLAS */
 5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
 6 | /*************************************************/
 7 | /* types of Gauss qudrature rules */
 8 | #define GAUSS_LEGENDRE	10
 9 | #define GAUSS_LEGUERRE	20
10 | #define GAUSS_HERMITE	30
11 | 
12 | /* M_PI: PI */
13 | #ifndef M_PI
14 | 	#define M_PI 3.1415926535897932384626433832795
15 | #endif // M_PI
16 | 


--------------------------------------------------------------------------------
/integral_eq/iteration.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Integral equation by using derivative free    */
  4 | /*                                        solver */
  5 | /* Last Update: 2016-12-02 (Fri) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | #ifdef USE_IMKL
 12 | 	#include "mkl.h" // for Intel Math Kernel Library
 13 | 	#include "mkl_cblas.h"
 14 | 	#include "mkl_lapacke.h"
 15 | #else // USE_IMKL
 16 | 	#include "cblas.h"
 17 | 	#include "lapacke.h"
 18 | #endif // USE_IMKL
 19 | 
 20 | #ifdef _OPENMP
 21 | 	#include "omp.h"
 22 | #endif // _OPENMP
 23 | 
 24 | #include "tkaux.h"
 25 | #include "get_secv.h"
 26 | #include "gauss_integral.h"
 27 | 
 28 | // Definition of the integral equation to be solved
 29 | #include "ex1.c" // Example 1
 30 | //#include "prob1.c" // Exersise 8.1 (1)
 31 | //#include "prob2.c" // Exersise 8.1 (2)
 32 | 
 33 | // difference: [u, v; F]
 34 | // vfunc(ret_vec, vec): ret_vec = vfunc(vec)
 35 | void difference_dmat_parallel(double *ret_mat, double *u, double *v, double (* vfunc_index)(int, double *, int), int dim)
 36 | {
 37 | 	int i, j, k, thread_index, num_threads;
 38 | 	double *arg_vec[256];
 39 | 	double den[128], num[128]; // Max. #thread = 128
 40 | 
 41 | #ifdef _OPENMP
 42 | 	num_threads = omp_get_max_threads();
 43 | #else // _OPENMP
 44 | 	num_threads = 1;
 45 | #endif // _OPENMP
 46 | 
 47 | 	// initialize
 48 | 	for(i = 0; i < num_threads * 2; i++)
 49 | 		arg_vec[i] = (double *)calloc(dim, sizeof(double));
 50 | 
 51 | 	// main loop
 52 | 	for(i = 0; i < dim; i++)
 53 | 	{
 54 | #ifdef _OPENMP
 55 | 		#pragma omp parallel for private(k, thread_index) 
 56 | #endif // _OPENMP
 57 | 		for(j = 0; j < dim; j++)
 58 | 		{
 59 | #ifdef _OPENMP
 60 | 			thread_index = omp_get_thread_num();
 61 | #else // _OPENMP
 62 | 			thread_index = 0;
 63 | #endif // _OPENMP
 64 | 
 65 | 			// arg_vec[0] := [.........., u_j, v_j+1, ..., v_dim-1]
 66 | 			for(k = 0; k <= j; k++)
 67 | 				arg_vec[thread_index * 2][k] = u[k];
 68 | 			for(k = j + 1; k < dim; k++)
 69 | 				arg_vec[thread_index * 2][k] = v[k];
 70 | 			
 71 | 			// arg_vec[1] := [..., u_j-1, v_j, .........., v_dim-1]
 72 | 			for(k = 0; k < j; k++)
 73 | 				arg_vec[thread_index * 2 + 1][k] = u[k];
 74 | 			for(k = j; k < dim; k++)
 75 | 				arg_vec[thread_index * 2 + 1][k] = v[k];
 76 | 
 77 | 			// (F_i(arg_vec[0]) - F_i(arg_vec[1])) / (u_j - v_j)
 78 | 			num[thread_index] = vfunc_index(i, arg_vec[thread_index * 2], dim) - vfunc_index(i, arg_vec[thread_index * 2 + 1], dim);
 79 | 			den[thread_index] = u[j] - v[j];
 80 | 			
 81 | 			if(den[thread_index] == 0.0)
 82 | 			{
 83 | 				fprintf(stderr, "WARNING: dmat_difference (%d, %d) is divided by zero!\n", i, j);
 84 | 			}
 85 | 			else
 86 | 				num[thread_index] /= den[thread_index];
 87 | 			
 88 | 			ret_mat[i * dim + j] = num[thread_index];
 89 | 		}
 90 | 	}
 91 | 
 92 | 	// free
 93 | 	for(i = 0; i < num_threads * 2; i++)
 94 | 		free(arg_vec[i]);
 95 | }
 96 | 
 97 | 
 98 | // difference: [u, v; F]
 99 | // vfunc(ret_vec, vec): ret_vec = vfunc(vec)
100 | void difference_dmat_serial(double *ret_mat, double *u, double *v, double (* vfunc_index)(int, double *, int), int dim)
101 | {
102 | 	int i, j, k;
103 | 	double *arg_vec[2];
104 | 	double den, num;
105 | 
106 | 	// initialize
107 | 	arg_vec[0] = (double *)calloc(dim, sizeof(double));
108 | 	arg_vec[1] = (double *)calloc(dim, sizeof(double));
109 | 
110 | 	// main loop
111 | 	for(i = 0; i < dim; i++)
112 | 	{
113 | 		for(j = 0; j < dim; j++)
114 | 		{
115 | 
116 | 			// arg_vec[0] := [.........., u_j, v_j+1, ..., v_dim-1]
117 | 			for(k = 0; k <= j; k++)
118 | 				arg_vec[0][k] = u[k];
119 | 			for(k = j + 1; k < dim; k++)
120 | 				arg_vec[0][k] = v[k];
121 | 			
122 | 			// arg_vec[1] := [..., u_j-1, v_j, .........., v_dim-1]
123 | 			for(k = 0; k < j; k++)
124 | 				arg_vec[1][k] = u[k];
125 | 			for(k = j; k < dim; k++)
126 | 				arg_vec[1][k] = v[k];
127 | 
128 | 			// (F_i(arg_vec[0]) - F_i(arg_vec[1])) / (u_j - v_j)
129 | 			num = vfunc_index(i, arg_vec[0], dim) - vfunc_index(i, arg_vec[1], dim);
130 | 			den = u[j] - v[j];
131 | 			
132 | 			if(den == 0.0)
133 | 			{
134 | 				fprintf(stderr, "WARNING: dmat_difference (%d, %d) is divided by zero!\n", i, j);
135 | 			}
136 | 			else
137 | 				num /= den;
138 | 			
139 | 			ret_mat[i * dim + j] = num;
140 | 		}
141 | 	}
142 | 
143 | 	// free
144 | 	free(arg_vec[0]);
145 | 	free(arg_vec[1]);
146 | }
147 | 
148 | // New Secant Method
149 | // work_vec[0] = delta + zeta
150 | void new_secant_1step_dvector(double *next_x, double *xm1, double *x0, double (* vfunc_index)(int, double *, int), void (* vfunc)(double *, double *, int), double *work_vec[4], double *diff_mat, int dim)
151 | {
152 | 	int i;
153 | 	int *pivot;
154 | 
155 | 	pivot = (int *)calloc(dim, sizeof(int));
156 | 
157 | 	// Solve [x[-1], x[0]; F] delta = -F(x[0])
158 | #ifdef USE_PARALLEL_DIFFMAT
159 | 	difference_dmat_parallel(diff_mat, xm1, x0, vfunc_index, dim);
160 | #else // USE_PARALLEL_DIFFMAT
161 | 	difference_dmat_serial(diff_mat, xm1, x0, vfunc_index, dim);
162 | #endif // USE_PARALLEL_DIFFMAT
163 | 
164 | 	//print_dmatrix(diff_mat);
165 | 
166 | 	vfunc(work_vec[0], x0, dim);
167 | 	cblas_dscal(dim, -1.0, work_vec[0], 1);
168 | 
169 | 	LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, diff_mat, dim, pivot);
170 | 
171 | 	LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, diff_mat, dim, pivot, work_vec[0], 1);
172 | 
173 | 	// y = x[0] + delta
174 | 	cblas_dcopy(dim, work_vec[0], 1, work_vec[2], 1);
175 | 	cblas_daxpy(dim, 1.0, x0, 1, work_vec[2], 1);
176 | 
177 | 	// Solve [x[-1], x[0]; F] zeta = -F(y)
178 | 	vfunc(work_vec[3], work_vec[2], dim);
179 | 	cblas_dscal(dim, -1.0, work_vec[3], 1);
180 | 	
181 | 	LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, diff_mat, dim, pivot, work_vec[3], 1);
182 | 	
183 | 	// x[1] := x[0] + delta + zeta
184 | 	cblas_daxpy(dim, 1.0, work_vec[3], 1, work_vec[0], 1);
185 | 	cblas_daxpy(dim, 1.0, work_vec[2], 1, work_vec[3], 1);
186 | 
187 | 	cblas_dcopy(dim, work_vec[3], 1, next_x, 1);
188 | 
189 | 	free(pivot);
190 | }
191 | 
192 | // Secant Method
193 | // work_vec[0] = delta
194 | void secant_1step_dvector(double *next_x, double *xm1, double *x0, double (* vfunc_index)(int, double *, int), void (* vfunc)(double *, double *, int), double *work_vec[4], double *diff_mat, int dim)
195 | {
196 | 	int i;
197 | 	int *pivot;
198 | 
199 | 	pivot = (int *)calloc(dim, sizeof(int));
200 | 
201 | 	// Solve [x[-1], x[0]; F] delta = -F(x[0])
202 | #ifdef USE_PARALLEL_DIFFMAT
203 | 	difference_dmat_parallel(diff_mat, xm1, x0, vfunc_index, dim);
204 | #else // USE_PARALLEL_DIFFMAT
205 | 	difference_dmat_serial(diff_mat, xm1, x0, vfunc_index, dim);
206 | #endif // USE_PARALLEL_DIFFMAT
207 | 
208 | 	//printf_dvector("%3d %25.17e\n", diff_mat, dim * dim, 1);
209 | 
210 | 	vfunc(work_vec[0], x0, dim);
211 | 	cblas_dscal(dim, -1.0, work_vec[0], 1);
212 | 
213 | 	LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, diff_mat, dim, pivot);
214 | 
215 | 	LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, diff_mat, dim, pivot, work_vec[0], 1);
216 | 
217 | 	// x[1] := x[0] + delta
218 | 	cblas_dcopy(dim, x0, 1, next_x, 1);
219 | 	cblas_daxpy(dim, 1.0, work_vec[0], 1, next_x, 1);
220 | 
221 | 	free(pivot);
222 | }
223 | 
224 | // New secant method
225 | int derivative_free_iteration_dvector(double *ret_vec, double *xm1, double *x0, double (* vfunc_index)(int, double *, int), void (* vfunc)(double *, double *, int), int dim, double rel_tol, double abs_tol, int maxtimes)
226 | {
227 | 	int iter_times, i;
228 | 	double *work_vec[4], *old_old_vec, *old_vec, *tmp_vec;
229 | 	double *work_mat;
230 | 	double start_time, end_time;
231 | 
232 | 	// initialize
233 | 	init_derivative_free_iteration_dvector(dim);
234 | 
235 | 	for(i = 0; i < 4; i++)
236 | 		work_vec[i] = (double *)calloc(dim, sizeof(double));
237 | 	work_mat = (double *)calloc(dim * dim, sizeof(double));
238 | 
239 | 	old_old_vec = (double *)calloc(dim, sizeof(double));
240 | 	cblas_dcopy(dim, xm1, 1, old_old_vec, 1);
241 | 
242 | 	old_vec = (double *)calloc(dim, sizeof(double));
243 | 	cblas_dcopy(dim, x0, 1, old_vec, 1);
244 | 
245 | 	//printf("initial value:\n"); printf_dvector("%5d, %25.17e\n", xm1, dim, 1); printf_dvector("%5d, %25.17e\n", x0, dim, 1);
246 | 
247 | 	// main loop
248 | 	start_time = get_real_secv();
249 | 	for(iter_times = 0; iter_times < maxtimes; iter_times++)
250 | 	{
251 | 
252 | 	#ifdef USE_NEW_SECANT
253 | 		// ret_vec := old_vec + delta + zeta
254 | 		new_secant_1step_dvector(ret_vec, old_old_vec, old_vec, vfunc_index, vfunc, work_vec, work_mat, dim);
255 | 	#else // USE_NEW_SECANT
256 | 		// ret_vec := old_vec + delta
257 | 		secant_1step_dvector(ret_vec, old_old_vec, old_vec, vfunc_index, vfunc, work_vec, work_mat, dim);
258 | 	#endif // USE_NEW_SECANT
259 | 
260 | 		//printf("%ld:\n", iter_times); printf_dvector("%5d, %25.17e\n", ret_vec, dim, 1);
261 | 		printf("%5d, %25.17e\n", iter_times, cblas_dnrm2(dim, work_vec[0], 1));
262 | 
263 | 		// ||tmp_vec|| < rel_tol * ||old_vec|| + abs_tol ?
264 | 		if(cblas_dnrm2(dim, work_vec[0], 1) < rel_tol * cblas_dnrm2(dim, old_vec, 1) + abs_tol)
265 | 			break;
266 | 
267 | 		// old_old_vec := old_vec
268 | 		// old_vec := ret_vec
269 | 		cblas_dcopy(dim, old_vec, 1, old_old_vec, 1);
270 | 		cblas_dcopy(dim, ret_vec, 1, old_vec, 1);
271 | 	}
272 | 	end_time = get_real_secv() - start_time;
273 | 
274 | 	printf("elapsed time(s) of derivative_free_iteration_dvector: %f\n", end_time);
275 | 
276 | 	if(iter_times >= maxtimes)
277 | 		fprintf(stderr, "Warning: derivative_free_iteration is not convergent!(%d iter_times)\n", iter_times);
278 | 
279 | 	// free
280 | 	for(i = 0; i < 4; i++)
281 | 		free(work_vec[i]);
282 | 	free(work_mat);
283 | 
284 | 	free(old_old_vec);
285 | 	free(old_vec);
286 | 
287 | 	free_derivative_free_iteration_dvector();
288 | 
289 | 	return iter_times;
290 | }
291 | 
292 | int main(int argc, char *argv[])
293 | {
294 | 	int i, dim, itimes;
295 | 	double *dvec_u, *dvec_v, *dvec_ans;
296 | 	double *dmat;
297 | 	double start_time, end_time;
298 | 
299 | //	dim = 4;
300 | //	dim = 8;
301 | //	dim = 16;
302 | //	dim = 32;
303 | //	dim = 64;
304 | //	dim = 512;
305 | //	dim = 1024;
306 | 
307 | 	if(argc <= 1)
308 | 	{
309 | 		printf("Usage: %s [dimension] \n", argv[0]);
310 | 		return EXIT_SUCCESS;
311 | 	}
312 | 
313 | 	dim = atoi(argv[1]);
314 | 
315 | 	if(dim <= 0)
316 | 	{
317 | 		fprintf(stderr, "ERROR: dimension( = %d) is illegal!\n", dim);
318 | 		return EXIT_FAILURE;
319 | 	}
320 | 
321 | 	dvec_u = (double *)calloc(dim, sizeof(double));
322 | 	dvec_v = (double *)calloc(dim, sizeof(double));
323 | 	dvec_ans = (double *)calloc(dim, sizeof(double));
324 | 	dmat = (double *)calloc(dim * dim, sizeof(double));
325 | 
326 | #ifdef USE_IMKL
327 | 	char str_mkl_version[1024];
328 | 	int max_num_threads;
329 | 
330 | 	MKL_Get_Version_String(str_mkl_version, 1024);
331 | 	printf("%s\n", str_mkl_version);
332 | 
333 | 	max_num_threads = mkl_get_max_threads();
334 | 	printf("Max Number of Threads: %d\n", max_num_threads);
335 | 	mkl_set_num_threads(max_num_threads);
336 | #endif
337 | 
338 | 	srand(10);
339 | 
340 | #ifdef _OPENMP
341 | 	#pragma omp parallel for
342 | #endif // _OPENMP
343 | 	for(i = 0; i < dim; i++)
344 | 	{
345 | 		dvec_u[i] = (double)rand() / (double)RAND_MAX;
346 | 		dvec_v[i] = (double)rand() / (double)RAND_MAX;
347 | 	}
348 | 
349 | //	difference_dmat(dmat, dvec_u, dvec_v, vf_index, dim);
350 | //	difference_dmat(dmat, dvec_u, dvec_v, vf1_index, dim);
351 | 
352 | //	printf_dvector("%3d %25.17e\n", dmat, dim * dim, 1);
353 | 
354 | 	// secant or new secant method
355 | 	start_time = get_real_secv();
356 | 	itimes = derivative_free_iteration_dvector(dvec_ans, dvec_u, dvec_v, vf_index, vf, dim, 1.0e-10, 1.0e-50, dim * 2);
357 | 	end_time = get_real_secv() - start_time;
358 | 
359 | 	printf("dvec_ans: \n");	printf_dvector2("%5d, %25.17e, %25.17e\n", dvec_abscissa, dvec_ans, dim, 1);
360 | 	printf("elapsed time(s): %f\n", end_time);
361 | 
362 | 	free(dvec_u);
363 | 	free(dvec_v);
364 | 	free(dvec_ans);
365 | 	free(dmat);
366 | 
367 | 	return EXIT_SUCCESS;
368 | }
369 | 


--------------------------------------------------------------------------------
/integral_eq/prob1.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Hammerstein Integral equation                 */
  4 | /* Last Update: 2016-12-02 (Fri) T.Kouya         */
  5 | /*************************************************/
  6 | //
  7 | // Problem 8.1: x(s) = exp(s) - s * sin(s) + int[0, 1] K(s, t) (x(t))^2 dt
  8 | //              where K(s, t) = exp(-2t) * sin(s)
  9 | //
 10 | // [caution!] original problem is defined in [0, 1], but cannot converge!
 11 | //            So smaller intervals such as [0, 0.2] are recommended.
 12 | //
 13 | double get_bij(int i, int j, double *abscissa, double *weight)
 14 | {
 15 | 	double ret;
 16 | 
 17 | 	// abscissa[i] = s_i
 18 | 	// abscissa[j] = t_j
 19 | 	ret = weight[j] * exp(-2.0 * abscissa[j]) * sin(abscissa[i]);
 20 | 
 21 | 	return ret;
 22 | }
 23 | 
 24 | void get_bmatrix(double *B, int row_dim, int col_dim, double *abscissa, double *weight)
 25 | {
 26 | 	int i, j;
 27 | 
 28 | #ifdef _OPENMP
 29 | 	#pragma omp parallel for private(j)
 30 | #endif // _OPENMP
 31 | 	for(i = 0; i < row_dim; i++)
 32 | 		for(j = 0; j < col_dim; j++)
 33 | 			B[i * col_dim + j] = get_bij(i, j, abscissa, weight);
 34 | 
 35 | }
 36 | 
 37 | double *dmat_b;
 38 | double *dvec_abscissa, *dvec_weight, *dvec_exp_s_sin, *dvec_xsqr;
 39 | 
 40 | void init_derivative_free_iteration_dvector(int dim)
 41 | {
 42 | 	dvec_abscissa = (double *)calloc(dim, sizeof(double));
 43 | 	dvec_weight = (double *)calloc(dim, sizeof(double));
 44 | 	dvec_xsqr = (double *)calloc(dim, sizeof(double));
 45 | 	dvec_exp_s_sin = (double *)calloc(dim, sizeof(double));
 46 | 
 47 | 	gauss_integral_eig_d(dvec_abscissa, dvec_weight, dim, GAUSS_LEGENDRE);
 48 | 	dshifted_gauss_legendre(0.0, 0.2, dvec_abscissa, dvec_weight, dim); // easy !
 49 | 	//dshifted_gauss_legendre(0.0, 1.0, dvec_abscissa, dvec_weight, dim); // impossible !
 50 | 
 51 | 	dmat_b = (double *)calloc(dim * dim, sizeof(double));
 52 | 
 53 | 	get_bmatrix(dmat_b, dim, dim, dvec_abscissa, dvec_weight);
 54 | 
 55 | }
 56 | 
 57 | void free_derivative_free_iteration_dvector(void)
 58 | {
 59 | 	free(dvec_abscissa);
 60 | 	free(dvec_weight);
 61 | 	free(dvec_xsqr);
 62 | 	free(dvec_exp_s_sin);
 63 | 
 64 | 	free(dmat_b);
 65 | }
 66 | 
 67 | // ret = x^2
 68 | void sqr_dvector(double *ret, double *x, int dim)
 69 | {
 70 | 	int i;
 71 | 
 72 | #ifdef _OPENMP
 73 | 	#pragma omp parallel for
 74 | #endif // _OPENMP
 75 | 	for(i = 0; i < dim; i++)
 76 | 		ret[i] = x[i] * x[i];
 77 | }
 78 | 
 79 | // ret = exp(s) - s * sin(s)
 80 | void exp_s_sin_dvector(double *ret, double *s, int dim)
 81 | {
 82 | 	int i;
 83 | 
 84 | #ifdef _OPENMP
 85 | 	#pragma omp parallel for
 86 | #endif // _OPENMP
 87 | 	for(i = 0; i < dim; i++)
 88 | 		ret[i] = exp(s[i]) - s[i] * sin(s[i]);
 89 | }
 90 | 
 91 | 
 92 | // double vfunc_index(int, DVector)
 93 | double vf_index(int index, double *x, int dim)
 94 | {
 95 | 	double ret, tmp;
 96 | 	int j;
 97 | 
 98 | 	// B * x_sqr
 99 | 	tmp = 0.0;
100 | #ifdef _OPENMP
101 | 	#pragma omp parallel for reduction(+:tmp)
102 | #endif // _OPENMP
103 | 	for(j = 0; j < dim; j++)
104 | 		tmp += dmat_b[index * dim + j] * (x[j] * x[j]);
105 | 
106 | 	ret = x[index] - exp(dvec_abscissa[index]) + dvec_abscissa[index] * sin(dvec_abscissa[index]) - tmp;
107 | 
108 | 	return ret;
109 | }
110 | 
111 | 
112 | // void vfunc(DVector, DVector)
113 | void vf(double *ret_vec, double *x, int dim)
114 | {
115 | 	double ret;
116 | 	int i;
117 | 
118 | 	// ret_vec := exp(s) - s * sin(s)
119 | 	exp_s_sin_dvector(ret_vec, dvec_abscissa, dim);
120 | 
121 | 	//  ret_vec := exp(s) - s * sin(s) + B * x_sqr
122 | 	sqr_dvector(dvec_xsqr, x, dim);
123 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, dmat_b, dim, dvec_xsqr, 1, 1.0, ret_vec, 1);
124 | 
125 | 	// ret_vec := -ret_vec
126 | 	cblas_dscal(dim, -1.0, ret_vec, 1);
127 | 
128 | 	// ret_vec := x + ret_vec
129 | 	cblas_daxpy(dim, 1.0, x, 1, ret_vec, 1);
130 | }
131 | 


--------------------------------------------------------------------------------
/integral_eq/prob2.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Hammerstein Integral equation                 */
  4 | /* Last Update: 2016-12-02 (Fri) T.Kouya         */
  5 | /*************************************************/
  6 | //
  7 | // Problem 8.2: x(s) = s^2+ sin(s) * int[-1, 1] K(s, t) (x(t))^2 dt
  8 | //              where K(s, t) = exp(-2t)
  9 | //
 10 | double get_bij(int i, int j, double *abscissa, double *weight)
 11 | {
 12 | 	double ret;
 13 | 
 14 | 	// abscissa[i] = s_i
 15 | 	// abscissa[j] = t_j
 16 | 	ret = weight[j] * exp(-2.0 * abscissa[j]) * sin(abscissa[i]);
 17 | 
 18 | 	return ret;
 19 | }
 20 | 
 21 | void get_bmatrix(double *B, int row_dim, int col_dim, double *abscissa, double *weight)
 22 | {
 23 | 	int i, j;
 24 | 
 25 | #ifdef _OPENMP
 26 | 	#pragma omp parallel for private(j)
 27 | #endif // _OPENMP
 28 | 	for(i = 0; i < row_dim; i++)
 29 | 		for(j = 0; j < col_dim; j++)
 30 | 			B[i * col_dim + j] = get_bij(i, j, abscissa, weight);
 31 | 
 32 | }
 33 | 
 34 | double *dmat_b;
 35 | double *dvec_abscissa, *dvec_weight, *dvec_ssqr, *dvec_xsqr;
 36 | 
 37 | void init_derivative_free_iteration_dvector(int dim)
 38 | {
 39 | 	dvec_abscissa = (double *)calloc(dim, sizeof(double));
 40 | 	dvec_weight = (double *)calloc(dim, sizeof(double));
 41 | 	dvec_xsqr = (double *)calloc(dim, sizeof(double));
 42 | 	dvec_ssqr = (double *)calloc(dim, sizeof(double));
 43 | 
 44 | 	gauss_integral_eig_d(dvec_abscissa, dvec_weight, dim, GAUSS_LEGENDRE);
 45 | 	dshifted_gauss_legendre(-1.0, 1.0, dvec_abscissa, dvec_weight, dim);
 46 | 
 47 | 	dmat_b = (double *)calloc(dim * dim, sizeof(double));
 48 | 
 49 | 	get_bmatrix(dmat_b, dim, dim, dvec_abscissa, dvec_weight);
 50 | 
 51 | }
 52 | 
 53 | void free_derivative_free_iteration_dvector(void)
 54 | {
 55 | 	free(dvec_abscissa);
 56 | 	free(dvec_weight);
 57 | 	free(dvec_xsqr);
 58 | 	free(dvec_ssqr);
 59 | 
 60 | 	free(dmat_b);
 61 | }
 62 | 
 63 | // ret = x^2
 64 | void sqr_dvector(double *ret, double *x, int dim)
 65 | {
 66 | 	int i;
 67 | 
 68 | #ifdef _OPENMP
 69 | 	#pragma omp parallel for
 70 | #endif // _OPENMP
 71 | 	for(i = 0; i < dim; i++)
 72 | 		ret[i] = x[i] * x[i];
 73 | }
 74 | 
 75 | // ret = exp(s) - s * sin(s)
 76 | void ssqr_dvector(double *ret, double *s, int dim)
 77 | {
 78 | 	int i;
 79 | 
 80 | #ifdef _OPENMP
 81 | 	#pragma omp parallel for
 82 | #endif // _OPENMP
 83 | 	for(i = 0; i < dim; i++)
 84 | 		ret[i] = s[i] * s[i];
 85 | }
 86 | 
 87 | 
 88 | // double vfunc_index(int, DVector)
 89 | double vf_index(int index, double *x, int dim)
 90 | {
 91 | 	double ret, tmp;
 92 | 	int j;
 93 | 
 94 | 	// B * x_sqr
 95 | 	tmp = 0.0;
 96 | #ifdef _OPENMP
 97 | 	#pragma omp parallel for reduction(+:tmp)
 98 | #endif // _OPENMP
 99 | 	for(j = 0; j < dim; j++)
100 | 		tmp += dmat_b[index * dim + j] * (x[j] * x[j]);
101 | 
102 | 	ret = x[index] - dvec_abscissa[index] * dvec_abscissa[index] - tmp;
103 | 
104 | 	return ret;
105 | }
106 | 
107 | 
108 | // void vfunc(DVector, DVector)
109 | void vf(double *ret_vec, double *x, int dim)
110 | {
111 | 	double ret;
112 | 	int i;
113 | 	double *tmp_vec;
114 | 
115 | 	tmp_vec = (double *)calloc(dim, sizeof(double));
116 | 
117 | 	//  ret_vec := B * x_sqr
118 | 	sqr_dvector(dvec_xsqr, x, dim);
119 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, dmat_b, dim, dvec_xsqr, 1, 0.0, ret_vec, 1);
120 | 
121 | 	// ret_vec := s * s + B * x_sqr
122 | 	ssqr_dvector(tmp_vec, dvec_abscissa, dim);
123 | 	cblas_daxpy(dim, 1.0, tmp_vec, 1, ret_vec, 1);
124 | 
125 | 	// ret_vec := -ret_vec
126 | 	cblas_dscal(dim, -1.0, ret_vec, 1);
127 | 
128 | 	// ret_vec := x + ret_vec
129 | 	cblas_daxpy(dim, 1.0, x, 1, ret_vec, 1);
130 | 
131 | 	free(tmp_vec);
132 | }
133 | 


--------------------------------------------------------------------------------
/invpower_eig.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/invpower_eig.c


--------------------------------------------------------------------------------
/jacobi_iteration.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Jacobi Iterative Refinement                   */
  4 | /* Last Update: 2015-02-20 (Fri) T.Kouya         */
  5 | /*************************************************/
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <math.h>
  9 | #include "cblas.h"
 10 | 
 11 | // relative difference
 12 | double reldiff_dvector(double *vec1, double *vec2, int dim)
 13 | {
 14 | 	double *tmp_vec;
 15 | 	double ret, norm;
 16 | 
 17 | 	tmp_vec = (double *)calloc(dim, sizeof(double));
 18 | 
 19 | 	cblas_dcopy(dim, vec1, 1, tmp_vec, 1);
 20 | 	cblas_daxpy(dim, -1.0, vec2, 1, tmp_vec, 1);
 21 | 	ret = cblas_dnrm2(dim, tmp_vec, 1);
 22 | 	norm = cblas_dnrm2(dim, vec1, 1);
 23 | 
 24 | 	if(norm != 0.0)
 25 | 		ret /= norm;
 26 | 
 27 | 	return ret;
 28 | }
 29 | 
 30 | int main()
 31 | {
 32 | 	int i, j, dim, itimes;
 33 | 	double *ma, *md, *vy, *vx, *vx_true, *vx_new, *vb;
 34 | 	double reps, aeps;
 35 | 
 36 | 	// input dimension
 37 | 	printf("Dim = "); scanf("%d", &dim);
 38 | 
 39 | 	if(dim <= 0)
 40 | 	{
 41 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 42 | 		return EXIT_FAILURE;
 43 | 	}
 44 | 
 45 | 	// Initialize
 46 | 	ma = (double *)calloc(dim * dim, sizeof(double));
 47 | 	md = (double *)calloc(dim, sizeof(double));
 48 | 	vy = (double *)calloc(dim, sizeof(double));
 49 | 	vx = (double *)calloc(dim, sizeof(double));
 50 | 	vx_new = (double *)calloc(dim, sizeof(double));
 51 | 	vx_true = (double *)calloc(dim, sizeof(double));
 52 | 	vb = (double *)calloc(dim, sizeof(double));
 53 | 
 54 | 	// input ma and vx_true
 55 | 	for(i = 0; i < dim; i++)
 56 | 	{
 57 | 		for(j = 0; j < dim; j++)
 58 | 			ma[i * dim + j] = sqrt(2.0) * (double)(i + j + 1);
 59 | 		ma[i * dim + i] = sqrt(2.0) * dim * dim;
 60 | 		vx_true[i] = sqrt(2.0) * (double)(i + 1);
 61 | 	}
 62 | 
 63 | 	// vb := 1.0 * ma * vx_true
 64 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, ma, dim, vx_true, 1, 0.0, vb, 1);
 65 | 
 66 | 	reps = 1.0e-10;
 67 | 	aeps = 0.0;
 68 | 
 69 | 	// md := D^{-1} = diag[1/a11, 1/a22, ..., 1/ann]
 70 | 	for(i = 0; i < dim; i++)
 71 | 		md[i] = 1.0 / ma[i * dim + i];
 72 | 
 73 | 	// vx := 0
 74 | 	for(i = 0; i < dim; i++)
 75 | 		vx[i] = 0.0;
 76 | 
 77 | 	// Jacobi Iteration
 78 | 	for(itimes = 0; itimes < dim * 10; itimes++)
 79 | 	{
 80 | 		// vy := vb
 81 | 		cblas_dcopy(dim, vb, 1, vy, 1);
 82 | 
 83 | 		// vx_new := vx
 84 | 		cblas_dcopy(dim, vx, 1, vx_new, 1);
 85 | 
 86 | 		// y := b - A * x
 87 | 		cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, -1.0, ma, dim, vx, 1, 1.0, vy, 1);
 88 | 		//for(i = 0; i < dim; i++)
 89 | 		//	printf("%3d %15.7e %15.7e\n", i, vx[i], vy[i]);
 90 | 		//printf("\n");
 91 | 
 92 | 		// x_new := x + D^{-1} * y
 93 | 		cblas_dsbmv(CblasRowMajor, CblasUpper, dim, 0, 1.0, md, 1, vy, 1, 1.0, vx_new, 1);
 94 | 		//for(i = 0; i < dim; i++)
 95 | 		//	vx_new[i] = vx[i] + md[i] * vy[i];
 96 | 
 97 | 		//for(i = 0; i < dim; i++)
 98 | 		//	printf("%3d %15.7e %15.7e: %15.7e\n", i, vy[i], vx_new[i], md[i]);
 99 | 		//printf("\n");
100 | 
101 | 		// || x_new - x || < reps || x_new || + aeps
102 | 		cblas_daxpy(dim, -1.0, vx_new, 1, vx, 1);
103 | 		if(cblas_dnrm2(dim, vx, 1) <= reps * cblas_dnrm2(dim, vx_new, 1) + aeps)
104 | 		{
105 | 			// vx := vx_new
106 | 			cblas_dcopy(dim, vx_new, 1, vx, 1);
107 | 			break;
108 | 		}
109 | 
110 | 		printf("%3d %10.3e\n", itimes, cblas_dnrm2(dim, vx, 1));
111 | 
112 | 		// vx := vx_new
113 | 		cblas_dcopy(dim, vx_new, 1, vx, 1);
114 | 	}
115 | 
116 | 	// print
117 | 	printf("Iterative Times = %d\n", itimes);
118 | 	printf("Rel.Diff = %10.3e\n", reldiff_dvector(vx, vx_true, dim));
119 | 
120 | 	for(i = 0; i < dim; i++)
121 | 	{
122 | 		printf("%3d %25.17e %25.17e\n", i, vx[i], vx_true[i]);
123 | 	}
124 | 
125 | 	// free
126 | 	free(ma);
127 | 	free(md);
128 | 	free(vy);
129 | 	free(vx);
130 | 	free(vx_true);
131 | 	free(vx_new);
132 | 	free(vb);
133 | 
134 | 
135 | 	return EXIT_SUCCESS;
136 | }
137 | 


--------------------------------------------------------------------------------
/jacobi_iteration_csr_mkl.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/jacobi_iteration_csr_mkl.c


--------------------------------------------------------------------------------
/jacobi_iteration_mkl.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/jacobi_iteration_mkl.c


--------------------------------------------------------------------------------
/jacobi_iteration_spblas.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/jacobi_iteration_spblas.c


--------------------------------------------------------------------------------
/lapack_complex_row_column_major.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample probram with lapack complex row-major  */
  4 | /*                     and column major matrices */
  5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | #include "lapacke.h"
 12 | #include "cblas.h"
 13 | 
 14 | int main()
 15 | {
 16 | 	int i, j, row_dim, col_dim;
 17 | 
 18 | 	lapack_complex_double *mat_a;
 19 | 
 20 | 	// input dimension
 21 | 	printf("Row    Dim = "); scanf("%d", &row_dim);
 22 | 	printf("Column Dim = "); scanf("%d", &col_dim);
 23 | 
 24 | 	if((row_dim <= 0) || (col_dim <= 0))
 25 | 	{
 26 | 		printf("Illegal dimension! (row_dim = %d, col_dim = %d)\n", row_dim, col_dim);
 27 | 		return EXIT_FAILURE;
 28 | 	}
 29 | 
 30 | 	// initialize matrix area
 31 | 	mat_a = (lapack_complex_double *)calloc(row_dim * col_dim, sizeof(lapack_complex_double));
 32 | 
 33 | 	printf("Row Major: %d x %d\n", row_dim, col_dim);
 34 | 
 35 | 	// Row Major
 36 | 	// mat_a = A
 37 | 	// A = [1+i 2+2i ....... n+ni]
 38 | 	//     [n+i n+2i ...... 2n+ni]
 39 | 	//     [.....................]
 40 | 	//     [(m-1)n+1+i ..... mn+ni]
 41 | 	for(i = 0; i < row_dim; i++)
 42 | 	{
 43 | 		for(j = 0; j < col_dim; j++)
 44 | 			mat_a[i * col_dim + j] = lapack_make_complex_double((double)(i * col_dim + j + 1), (double)(j + 1));
 45 | 	}
 46 | 
 47 | 	// print (1)
 48 | 	printf("1 dimension: \n");
 49 | 	printf("[");
 50 | 	for(i = 0; i < row_dim * col_dim; i++)
 51 | 		printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i]), lapack_complex_double_imag(mat_a[i]));
 52 | 	printf("]\n");
 53 | 
 54 | 	// print (2)
 55 | 	printf("2 dimension: \n");
 56 | 	for(i = 0; i < row_dim; i++)
 57 | 	{
 58 | 		printf("[");
 59 | 		for(j = 0; j < col_dim; j++)
 60 | 			printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i * col_dim + j]), lapack_complex_double_imag(mat_a[i * col_dim + j]));
 61 | 		printf("]\n");
 62 | 	}
 63 | 
 64 | 	printf("Column Major: %d x %d\n", row_dim, col_dim);
 65 | 
 66 | 	// Column Major
 67 | 	// mat_a = A
 68 | 	// A = [1   2   ....... n]
 69 | 	//     [n+1 n+2 ...... 2n]
 70 | 	//     [.................]
 71 | 	//     [(m-1)n+1 ..... mn]
 72 | 	for(j = 0; j < col_dim; j++)
 73 | 	{
 74 | 		for(i = 0; i < row_dim; i++)
 75 | 			mat_a[i + row_dim * j] = lapack_make_complex_double((double)(i * col_dim + j + 1), (double)(j + 1));
 76 | 	}
 77 | 
 78 | 	// print (1)
 79 | 	printf("1 dimension: \n");
 80 | 	printf("[");
 81 | 	for(i = 0; i < row_dim * col_dim; i++)
 82 | 		printf(" %6.3lf  %+-6.3lf * i ", lapack_complex_double_real(mat_a[i]), lapack_complex_double_imag(mat_a[i]));
 83 | 	printf("]\n");
 84 | 
 85 | 	// print (2)
 86 | 	printf("2 dimension: \n");
 87 | 	for(i = 0; i < row_dim; i++)
 88 | 	{
 89 | 		printf("[");
 90 | 		for(j = 0; j < col_dim; j++)
 91 | 			printf(" %6.3lf %+-6.3lf * i ", lapack_complex_double_real(mat_a[i + row_dim * j]), lapack_complex_double_real(mat_a[i + row_dim * j]));
 92 | 		printf("]\n");
 93 | 	}
 94 | 
 95 | 	// free
 96 | 	free(mat_a);
 97 | 
 98 | 	return EXIT_SUCCESS;
 99 | }
100 | 


--------------------------------------------------------------------------------
/lapack_complex_row_column_major_cpp.cc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/lapack_complex_row_column_major_cpp.cc


--------------------------------------------------------------------------------
/lapack_dgecon.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample Program of DEGCON                      */
  4 | /*                        with Intel Math Kernel */
  5 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #ifdef USE_IMKL
 10 | 	#include "mkl.h" // for Intel Math Kernel Library
 11 | 	#include "mkl_lapack.h" // for dlange
 12 | 	#include "mkl_lapacke.h" // for dgecon
 13 | 	#include "mkl_cblas.h" // for dcopy
 14 | #else // USE_IMKL
 15 | 	#include "cblas.h"
 16 | 	#include "lapacke.h"
 17 | #endif // USE_IMKL
 18 | 
 19 | int main()
 20 | {
 21 | 	int i, j, dim; // dimension of vectors
 22 | 	int info, *pivot;
 23 | 	double *ma, *ma_work;
 24 | 	double norm1, cond1; // ||A||_1 and cond_1(A)
 25 | 	double normi, condi; // ||A||_inf and cond_inf(A)
 26 | 	char str_mkl_version[1024];
 27 | 
 28 | #ifdef USE_IMKL
 29 | 	// print MKL version
 30 | 	MKL_Get_Version_String(str_mkl_version, 1024);
 31 | 	printf("%s\n", str_mkl_version);
 32 | #endif // USE_IMKL
 33 | 
 34 | 	// input dimension
 35 | 	printf("Dim = "); scanf("%d", &dim);
 36 | 
 37 | 	if(dim <= 0)
 38 | 	{
 39 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 40 | 		return EXIT_FAILURE;
 41 | 	}
 42 | 
 43 | 	// Initialize
 44 | 	ma = (double *)calloc(sizeof(double), dim * dim);
 45 | 	ma_work = (double *)calloc(sizeof(double), dim * dim);
 46 | 	pivot = (int *)calloc(dim, sizeof(int));
 47 | 
 48 | 	// input va and vb
 49 | 	for(i = 0; i < dim; i++)
 50 | 	{
 51 | 		for(j = 0; j < dim; j++)
 52 | 		{
 53 | 			// A = hirbert matrix
 54 | 			*(ma + i * dim + j) = 1.0 / (double)(i + j + 1);
 55 | 			/* ma[i * dim + j] = (double)rand() / (double)(RAND_MAX);
 56 | 			if(rand() % 2 != 0)
 57 | 				ma[i * dim + j] = -ma[i * dim + j];
 58 | 			*/
 59 | 		}
 60 | 	}
 61 | 
 62 | 	// A := A
 63 | 	cblas_dcopy(dim * dim, ma, 1, ma_work, 1);
 64 | 
 65 | 	// print
 66 | /*	printf("A = \n");
 67 | 	for(i = 0; i < dim; i++)
 68 | 	{
 69 | 		printf("%3d: ", i);
 70 | 		for(j = 0; j < dim; j++)
 71 | 			printf("%10f ", *(ma + i * dim + j));
 72 | 		printf("\n");
 73 | 
 74 | 	}
 75 | */
 76 | 	// Get ||A||_1
 77 | 	norm1 = LAPACKE_dlange(LAPACK_ROW_MAJOR, '1', dim, dim, ma_work, dim);
 78 | 
 79 | 	// LU decomposition
 80 | 	info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, ma_work, dim, pivot);
 81 | 	//printf("DGETRF info = %d\n", info);
 82 | 
 83 | 	// Compute condition number of A
 84 | 	info = LAPACKE_dgecon(LAPACK_ROW_MAJOR, '1', dim, ma_work, dim, norm1, &cond1);
 85 | 
 86 | 	// error occurs if info > 0
 87 | 	if(info < 0)
 88 | 	{
 89 | 		printf("The %d-th parameter is illegal!\n", -info);
 90 | 		return EXIT_FAILURE;
 91 | 	}
 92 | 
 93 | 	// A := A
 94 | 	cblas_dcopy(dim * dim, ma, 1, ma_work, 1);
 95 | 
 96 | 	// Get ||A||_inf
 97 | 	normi = LAPACKE_dlange(LAPACK_ROW_MAJOR, 'I', dim, dim, ma_work, dim);
 98 | 
 99 | 	// LU decomposition
100 | 	info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, ma_work, dim, pivot);
101 | 	//printf("DGETRF info = %d\n", info);
102 | 
103 | 	// Compute condition number of A
104 | 	info = LAPACKE_dgecon(LAPACK_ROW_MAJOR, 'I', dim, ma_work, dim, normi, &condi);
105 | 
106 | 	// error occurs if info > 0
107 | 	if(info < 0)
108 | 	{
109 | 		printf("The %d-th parameter is illegal!\n", -info);
110 | 		return EXIT_FAILURE;
111 | 	}
112 | 
113 | 	// print norm and condition number of A
114 | 	printf("||A||_1   = %25.17e, cond_1(A)   = %25.17e\n", norm1, 1.0 / cond1);
115 | 	printf("||A||_inf = %25.17e, cond_inf(A) = %25.17e\n", normi, 1.0 / condi);
116 | 
117 | 	// free
118 | 	free(ma);
119 | 	free(ma_work);
120 | 
121 | 	return EXIT_SUCCESS;
122 | }
123 | 


--------------------------------------------------------------------------------
/lapack_dgeev.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/lapack_dgeev.c


--------------------------------------------------------------------------------
/lapack_dgeev_magma.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/lapack_dgeev_magma.c


--------------------------------------------------------------------------------
/lapack_dsyev.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample Program of DSYEV                       */
  4 | /*                        with Intel Math Kernel */
  5 | /* Last Update: 2025-02-04 (Tue) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h> // 2025-02-04(Tue)
 10 | #include "cblas.h"
 11 | #include "lapacke.h"
 12 | 
 13 | #define MIN(i, j) ((i) < (j) ? (i) : (j))
 14 | #define MAX(i, j) ((i) > (j) ? (i) : (j))
 15 | 
 16 | int main()
 17 | {
 18 | 	int i, j, dim; // dimension of vectors
 19 | 	int info;
 20 | 	double *ma;
 21 | 	double *cmat, *ma_org, *diag, alpha, beta;
 22 | 	int absmax_eig_index, absmin_eig_index;
 23 | 	double *eig, absmax_eig, absmin_eig, abs_eig, ma_norm;
 24 | 
 25 | 	// input dimension
 26 | 	printf("Dim = "); scanf("%d", &dim);
 27 | 
 28 | 	if(dim <= 0)
 29 | 	{
 30 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 31 | 		return EXIT_FAILURE;
 32 | 	}
 33 | 
 34 | 	// Initialize
 35 | 	eig = (double *)calloc(sizeof(double), dim);
 36 | 	ma = (double *)calloc(sizeof(double), dim * dim);
 37 | 	ma_org = (double *)calloc(sizeof(double), dim * dim);
 38 | 	diag = (double *)calloc(sizeof(double), dim * dim);
 39 | 	cmat = (double *)calloc(sizeof(double), dim * dim);
 40 | 
 41 | 	// input ma, ma_org
 42 | 	for(i = 0; i < dim; i++)
 43 | 	{
 44 | 		for(j = 0; j < dim; j++)
 45 | 		{
 46 | 			// A = Hilbert matrix
 47 | 			//ma[i * dim + j] = 1.0 / (double)(i + j + 1);
 48 | 			// A = Frank matrix
 49 | 			ma[i * dim + j] = (double)(dim - MAX(i, j));
 50 | 			// A = Random matrix
 51 | 			//ma[i * dim + j] = (double)rand() / (double)RAND_MAX;
 52 | 		
 53 | 			// ma_org = ma
 54 | 			ma_org[i * dim + j] = ma[i * dim + j];
 55 | 		}
 56 | 	}
 57 | 
 58 | 	ma_norm = cblas_dnrm2(dim * dim, ma, 1);
 59 | 
 60 | 	// print
 61 | /*	printf("A = \n");
 62 | 	for(i = 0; i < dim; i++)
 63 | 	{
 64 | 		printf("%3d: ", i);
 65 | 		for(j = 0; j < dim; j++)
 66 | 			printf("%10g ", ma[i * dim + j]);
 67 | 		printf("\n");
 68 | 
 69 | 	}
 70 | */
 71 | 	// solve A * V = \lambda * V
 72 | 	// 'V' ... get eigenvectors
 73 | 	info = LAPACKE_dsyev(LAPACK_ROW_MAJOR, 'V', 'U', dim, ma, dim, eig);
 74 | 
 75 | 	// error occurs if info > 0
 76 | 	if(info > 0)
 77 | 	{
 78 | 		printf("QR decomposition failed! (%d) \n", info);
 79 | 		return EXIT_FAILURE;
 80 | 	}
 81 | 	else if(info < 0)
 82 | 	{
 83 | 		printf("%d-th argument of DSYEV is illegal!\n", info);
 84 | 		return EXIT_FAILURE;
 85 | 	}
 86 | 
 87 | 
 88 | 	printf("Envenvectors: \n");
 89 | 	for(i = 0; i < dim; i++)
 90 | 	{
 91 | 		printf("%3d: ", i);
 92 | 		for(j = 0; j < dim; j++)
 93 | 			printf("%5g ", ma[i * dim + j]);
 94 | 		printf("\n");
 95 | 
 96 | 	}
 97 | 
 98 | 	// diag := diag(eig)
 99 | 	absmax_eig_index = absmin_eig_index = 0;
100 | 	absmax_eig = absmin_eig = fabs(eig[0]);
101 | 	for(i = 0; i < dim; i++)
102 | 	{
103 | 		for(j = 0; j < dim; j++)
104 | 			diag[i * dim + j] = 0.0;
105 | 		diag[i * dim + i] = eig[i];
106 | 
107 | 		abs_eig = fabs(eig[i]);
108 | 		if(absmax_eig < abs_eig)
109 | 		{
110 | 			absmax_eig = abs_eig;
111 | 			absmax_eig_index = i;
112 | 		}
113 | 		if(absmin_eig > abs_eig)
114 | 		{
115 | 			absmin_eig = abs_eig;
116 | 			absmin_eig_index = i;
117 | 		}
118 | 	}
119 | 
120 | 	// print
121 | 	printf("Eigenvalues = \n");
122 | 	for(i = 0; i < dim; i++)
123 | 	{
124 | 		printf("%3d: ", i);
125 | 		printf("%10g\n", eig[i]);
126 | 	}
127 | 	printf("\n");
128 | 
129 | 	printf("absmax_eig = %25.17e\n", eig[absmax_eig_index]);
130 | 	printf("absmin_eig = %25.17e\n", eig[absmin_eig_index]);
131 | 	printf("cond2      = %25.17e\n", absmax_eig / absmin_eig);
132 | 
133 | 	alpha = 1.0;
134 | 	beta = 0.0;
135 | 
136 | 	// ev^T * A * ev - lambda * I ?
137 | 	cblas_dgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, ma_org, dim, beta, cmat, dim);
138 | 	cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, cmat, dim, ma, dim, beta, ma_org, dim);
139 | 
140 | 	alpha = -1.0;
141 | 	cblas_daxpy(dim * dim, alpha, diag, 1, ma_org, 1);
142 | 
143 | 	printf("||ev^T * A * ev - lambda||_F / ||A||_F  = %15.7e\n", cblas_dnrm2(dim * dim, ma_org, 1) / ma_norm);
144 | 
145 | 	/*
146 | 	for(i = 0; i < dim; i++)
147 | 	{
148 | 		printf("%3d: ", i);
149 | 		for(j = 0; j < dim; j++)
150 | 			printf("%5g ", ma_org[i * dim + j]);
151 | 		printf("\n");
152 | 	}
153 | */
154 | 	// free
155 | 	free(eig);
156 | 	free(cmat);
157 | 	free(ma);
158 | 	free(ma_org);
159 | 	free(diag);
160 | 
161 | 	return EXIT_SUCCESS;
162 | }
163 | 


--------------------------------------------------------------------------------
/lapack_gcc.inc:
--------------------------------------------------------------------------------
 1 | #*************************************************#
 2 | # LAPACK/BLAS Tutorial                            #
 3 | # Configuration file for GNU compiler collection  #
 4 | # Last Update: 2016-12-02 (Fri) T.Kouya           #
 5 | #*************************************************#
 6 | CC=gcc
 7 | FC=gfortran
 8 | CPP=g++
 9 | 
10 | INC = -I/usr/local/include
11 | LIB = -L/usr/local/lib/gcc  -lgfortran -lm
12 | 
13 | CBLAS_INC = $(INC)
14 | CBLAS_LIB = $(LIB) -lcblas -lrefblas -lgfortran
15 | 
16 | LAPACKE_INC = -I/usr/local/include/lapacke $(CBLAS_INC)
17 | LAPACKE_LIB = -L/usr/local/lib/gcc -llapacke -llapack $(CBLAS_LIB)
18 | 
19 | IMKL_INC=-I/opt/intel/mkl/include 
20 | #IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -L/opt/intel/lib/intel64 -lifcore
21 | #IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L/opt/intel/lib/intel64 -lifcore -liomp5
22 | 
23 | OPENMP = -fopenmp
24 | OPENMP_LIB = -lgomp
25 | 


--------------------------------------------------------------------------------
/lapack_icc.inc:
--------------------------------------------------------------------------------
 1 | #*************************************************#
 2 | # LAPACK/BLAS Tutorial                            #
 3 | # Configuration file for Intel C compiler         #
 4 | # Last Update: 2016-12-02 (Fri) T.Kouya           #
 5 | #*************************************************#
 6 | CC=icc
 7 | FC=ifort
 8 | CPP=icpc
 9 | 
10 | INC = -I/usr/local/include
11 | LIB = -L/usr/local/lib -L/usr/lib64 -L/opt/intel/lib/intel64 -lifcore
12 | 
13 | CBLAS_INC = $(INC)
14 | CBLAS_LIB = $(LIB) -lcblas -lrefblas -lm
15 | 
16 | LAPACKE_INC = -I/usr/local/include/lapacke $(CBLAS_INC)
17 | LAPACKE_LIB = -L/usr/local/lib -llapacke -llapack $(CBLAS_LIB) -L/opt/intel/lib/intel64 -lifcore
18 | 
19 | IMKL_INC=-I/opt/intel/include -I/opt/intel/mkl/include 
20 | #IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_sequential -lmkl_core -lpthread -L/opt/intel/lib/intel64 -lifcore
21 | IMKL_LIB=-L/opt/intel/mkl/lib/intel64 -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L/opt/intel/lib/intel64 -lifcore -liomp5
22 | 
23 | OPENMP = -openmp -openmp-report2
24 | OPENMP_LIB = 
25 | 


--------------------------------------------------------------------------------
/lapack_icx.inc:
--------------------------------------------------------------------------------
 1 | #*************************************************#
 2 | # LAPACK/BLAS Tutorial                            #
 3 | # Configuration file for Intel C compiler         #
 4 | # Last Update: 2026-02-04 (Tue) T.Kouya           #
 5 | #*************************************************#
 6 | CC=icx-cc
 7 | FC=ifx
 8 | CPP=icx
 9 | INTEL_ONEAPI=/opt/intel/oneapi
10 | 
11 | INC = -I/usr/local/include
12 | LIB = -L/usr/local/lib -L/usr/lib64 -L/opt/intel/lib/intel64 -lifcore
13 | 
14 | CBLAS_INC = $(INC)
15 | CBLAS_LIB = $(LIB) -lcblas -lrefblas -lm
16 | 
17 | LAPACKE_INC = -I/usr/local/include/lapacke $(CBLAS_INC)
18 | LAPACKE_LIB = -L/usr/local/lib -llapacke -llapack $(CBLAS_LIB) -L$(INTEL_ONEAPI)/compiler/latest/lib -lifcore
19 | 
20 | IMKL_INC=-I$(INTEL_ONEAPI)/mkl/latest/include
21 | IMKL_LIB=-L$(INTEL_ONEAPI)/mkl/latest/lib -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -L$(INTEL_ONEAPI)/compiler/latest/lib -lifcore -liomp5
22 | 
23 | OPENMP = -openmp -qopt-report=max
24 | OPENMP_LIB = 
25 | 


--------------------------------------------------------------------------------
/lapack_lamch.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Sample program to get properties              */
 4 | /*                        of floating-point type */
 5 | /*                        with original routines */
 6 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
 7 | /*************************************************/
 8 | #include <stdio.h>
 9 | #include <math.h>
10 | 
11 | #include "lapacke.h"
12 | 
13 | int main()
14 | {
15 | 	float seps, sufth, softh;
16 | 	double deps, dufth, dofth;
17 | 
18 | 	// float
19 | 	seps = LAPACK_slamch("E");
20 | 	sufth = LAPACK_slamch("U");
21 | 	softh = LAPACK_slamch("O");
22 | 	printf("eps : Machine Epsilon(float)     : %15.7e\n", seps);
23 | 	printf("ufth: Underflow Threshold(float) : %15.7e\n", sufth);
24 | 	printf("ofth: Overflow Threshold(float)  : %15.7e\n", softh);
25 | 
26 | 	// double
27 | 	deps = LAPACK_dlamch("E");
28 | 	dufth = LAPACK_dlamch("U");
29 | 	dofth = LAPACK_dlamch("O");
30 | 	printf("Machine Epsilon(double)     : %25.17e\n", deps);
31 | 	printf("Underflow Threshold(double) : %25.17e\n", dufth);
32 | 	printf("Overflow Threshold(double)  : %25.17e\n", dofth);
33 | 
34 | 	return EXIT_SUCCESS;
35 | }
36 | 


--------------------------------------------------------------------------------
/lapack_ssyev.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Sample Program of SSYEV                       */
  4 | /*                        with Intel Math Kernel */
  5 | /* Last Update: 2025-02-04 (Tue) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h> // 2025-02-04(Tue)
 10 | #include "cblas.h"
 11 | #include "lapacke.h"
 12 | 
 13 | #define MIN(i, j) ((i) < (j) ? (i) : (j))
 14 | #define MAX(i, j) ((i) > (j) ? (i) : (j))
 15 | 
 16 | int main()
 17 | {
 18 | 	int i, j, dim; // dimension of vectors
 19 | 	int info;
 20 | 	float *ma;
 21 | 	float *cmat, *ma_org, *diag, alpha, beta;
 22 | 	int absmax_eig_index, absmin_eig_index;
 23 | 	float *eig, absmax_eig, absmin_eig, abs_eig, ma_norm;
 24 | 
 25 | 	// input dimension
 26 | 	printf("Dim = "); scanf("%d", &dim);
 27 | 
 28 | 	if(dim <= 0)
 29 | 	{
 30 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 31 | 		return EXIT_FAILURE;
 32 | 	}
 33 | 
 34 | 	// Initialize
 35 | 	eig = (float *)calloc(sizeof(float), dim);
 36 | 	ma = (float *)calloc(sizeof(float), dim * dim);
 37 | 	ma_org = (float *)calloc(sizeof(float), dim * dim);
 38 | 	diag = (float *)calloc(sizeof(float), dim * dim);
 39 | 	cmat = (float *)calloc(sizeof(float), dim * dim);
 40 | 
 41 | 	// input ma, ma_org
 42 | 	for(i = 0; i < dim; i++)
 43 | 	{
 44 | 		for(j = 0; j < dim; j++)
 45 | 		{
 46 | 			// A = Hilbert matrix
 47 | 			//ma[i * dim + j] = 1.0 / (float)(i + j + 1);
 48 | 			// A = Frank matrix
 49 | 			ma[i * dim + j] = (float)(dim - MAX(i, j));
 50 | 			// A = Random matrix
 51 | 			//ma[i * dim + j] = (float)rand() / (float)RAND_MAX;
 52 | 		
 53 | 			// ma_org = ma
 54 | 			ma_org[i * dim + j] = ma[i * dim + j];
 55 | 		}
 56 | 	}
 57 | 
 58 | 	ma_norm = cblas_snrm2(dim * dim, ma, 1);
 59 | 
 60 | 	// print
 61 | /*	printf("A = \n");
 62 | 	for(i = 0; i < dim; i++)
 63 | 	{
 64 | 		printf("%3d: ", i);
 65 | 		for(j = 0; j < dim; j++)
 66 | 			printf("%10g ", ma[i * dim + j]);
 67 | 		printf("\n");
 68 | 
 69 | 	}
 70 | */
 71 | 	// solve A * V = \lambda * V
 72 | 	// 'V' ... get eigenvectors
 73 | 	info = LAPACKE_ssyev(LAPACK_ROW_MAJOR, 'V', 'U', dim, ma, dim, eig);
 74 | 
 75 | 	// error occurs if info > 0
 76 | 	if(info > 0)
 77 | 	{
 78 | 		printf("QR decomposition failed! (%d) \n", info);
 79 | 		return EXIT_FAILURE;
 80 | 	}
 81 | 	else if(info < 0)
 82 | 	{
 83 | 		printf("%d-th argument of SSYEV is illegal!\n", info);
 84 | 		return EXIT_FAILURE;
 85 | 	}
 86 | 
 87 | /*
 88 | 	for(i = 0; i < dim; i++)
 89 | 	{
 90 | 		printf("%3d: ", i);
 91 | 		for(j = 0; j < dim; j++)
 92 | 			printf("%5g ", ma[i * dim + j]);
 93 | 		printf("\n");
 94 | 
 95 | 	}
 96 | */
 97 | 	// diag := diag(eig)
 98 | 	absmax_eig_index = absmin_eig_index = 0;
 99 | 	absmax_eig = absmin_eig = fabs(eig[0]);
100 | 	for(i = 0; i < dim; i++)
101 | 	{
102 | 		for(j = 0; j < dim; j++)
103 | 			diag[i * dim + j] = 0.0;
104 | 		diag[i * dim + i] = eig[i];
105 | 
106 | 		abs_eig = fabs(eig[i]);
107 | 		if(absmax_eig < abs_eig)
108 | 		{
109 | 			absmax_eig = abs_eig;
110 | 			absmax_eig_index = i;
111 | 		}
112 | 		if(absmin_eig > abs_eig)
113 | 		{
114 | 			absmin_eig = abs_eig;
115 | 			absmin_eig_index = i;
116 | 		}
117 | 	}
118 | 
119 | 	// print
120 | /*	printf("Eigenvalues = \n");
121 | 	for(i = 0; i < dim; i++)
122 | 	{
123 | 		printf("%3d: ", i);
124 | 		printf("%10g\n", eig[i]);
125 | 	}
126 | 	printf("\n");
127 | */
128 | 	printf("absmax_eig = %15.7e\n", eig[absmax_eig_index]);
129 | 	printf("absmin_eig = %15.7e\n", eig[absmin_eig_index]);
130 | 	printf("cond2      = %15.7e\n", absmax_eig / absmin_eig);
131 | 
132 | 	alpha = 1.0;
133 | 	beta = 0.0;
134 | 
135 | 	// ev^T * A * ev - lambda * I ?
136 | 	cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, dim, dim, dim, alpha, ma, dim, ma_org, dim, beta, cmat, dim);
137 | 	cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, dim, dim, dim, alpha, cmat, dim, ma, dim, beta, ma_org, dim);
138 | 
139 | 	alpha = -1.0;
140 | 	cblas_saxpy(dim * dim, alpha, diag, 1, ma_org, 1);
141 | 
142 | 	printf("||ev^T * A * ev - lambda * I||_2 = %15.7e\n", cblas_snrm2(dim * dim, ma_org, 1) / ma_norm);
143 | 
144 | /*	for(i = 0; i < dim; i++)
145 | 	{
146 | 		printf("%3d: ", i);
147 | 		for(j = 0; j < dim; j++)
148 | 			printf("%5g ", ma_org[i * dim + j]);
149 | 		printf("\n");
150 | 	}
151 | */
152 | 	// free
153 | 	free(eig);
154 | 	free(cmat);
155 | 	free(ma);
156 | 	free(ma_org);
157 | 	free(diag);
158 | 
159 | 	return EXIT_SUCCESS;
160 | }
161 | 


--------------------------------------------------------------------------------
/lapack_win_intel.inc:
--------------------------------------------------------------------------------
 1 | #*************************************************#
 2 | # LAPACK/BLAS Tutorial                            #
 3 | # Configuration file for Intel C compiler         #
 4 | #                                      on Windows #
 5 | # Last Update: 2016-12-02 (Fri) T.Kouya           #
 6 | #*************************************************#
 7 | CC=icl /TC /Qstd=c99 /D:WIN32
 8 | FC=ifort
 9 | CPP=icl /TP
10 | 
11 | VCL=cl
12 | 
13 | INTEL_ROOT = "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries\windows"
14 | INTEL_LIB_ROOT = $(INTEL_ROOT)\compiler\lib\intel64
15 | INTEL_MKL_ROOT = $(INTEL_ROOT)\mkl\lib\intel64
16 | 
17 | CBLAS_INC = $(INC)
18 | CBLAS_LIB = /link $(INTEL_MKL_ROOT)\mkl_intel_lp64.lib /link $(INTEL_MKL_ROOT)\mkl_core.lib /link $(INTEL_MKL_ROOT)\mkl_sequential.lib
19 | 
20 | IMKL_INC = $(INC)
21 | IMKL_LIB = /link $(INTEL_MKL_ROOT)\mkl_intel_lp64.lib /link $(INTEL_MKL_ROOT)\mkl_core.lib /link $(INTEL_MKL_ROOT)\mkl_sequential.lib
22 | 
23 | LAPACKE_INC = $(INC)
24 | LAPACKE_LIB = /link $(INTEL_MKL_ROOT)\mkl_intel_lp64.lib /link $(INTEL_MKL_ROOT)\mkl_core.lib /link $(INTEL_MKL_ROOT)\mkl_sequential.lib
25 | 
26 | OPENMP = -openmp -openmp-report2
27 | OPENMP_LIB = 
28 | 


--------------------------------------------------------------------------------
/linear_eq.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/linear_eq.c


--------------------------------------------------------------------------------
/linear_eq_dgbsv.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Solver for Linear equation with DSGESV        */
  4 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
  5 | /*************************************************/
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <math.h>
  9 | 
 10 | #include "lapacke.h"
 11 | #include "cblas.h"
 12 | 
 13 | #define IJMAX(i, j) ( ((i) > (j)) ? (i) : (j) )
 14 | #define IJMIN(i, j) (((i) < (j)) ? (i) : (j))
 15 | 
 16 | int main()
 17 | {
 18 | 	lapack_int i, j, k, dim, shift, index, ku, kl;
 19 | 	lapack_int inc_vec_x, inc_vec_b;
 20 | 	lapack_int *pivot, info;
 21 | 
 22 | 	double *mat_a, *vec_b, *vec_x, *mat_a_full;
 23 | 	double alpha, beta;
 24 | 	double running_time;
 25 | 
 26 | 	// input dimension of a linear equation to be solved
 27 | 	printf("Dim = "); scanf("%d", &dim);
 28 | 
 29 | 	if(dim <= 0)
 30 | 	{
 31 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 32 | 		return EXIT_FAILURE;
 33 | 	}
 34 | 
 35 | 	// initialize a tridiagonal matrix(mat_a) and vectors
 36 | 	kl = 1;
 37 | 	ku = 1; // necessary for pivoting
 38 | 	mat_a = (double *)calloc((kl * 2 + ku + 1) * dim, sizeof(double));
 39 | 	vec_x = (double *)calloc(dim, sizeof(double));
 40 | 	vec_b = (double *)calloc(dim, sizeof(double));
 41 | 
 42 | 	// dim * dim
 43 | 	mat_a_full = (double *)calloc(dim * dim, sizeof(double));
 44 | 
 45 | 	for(i = 0; i < dim; i++)
 46 | 		vec_b[i] = 0.0;
 47 | 
 48 | 	// input mat_a and vec_x
 49 | 	for(i = 0; i < dim; i++)
 50 | 	{
 51 | 		// mat_a_full := 0
 52 | 		for(j = 0; j < dim; j++)
 53 | 			mat_a_full[i * dim + j] = 0.0;
 54 | 	}
 55 | 
 56 | 	for(i = 0; i < dim; i++)
 57 | 	{
 58 | 		// upper subdiagonal element
 59 | 		if((i + 1) < dim)
 60 | 		{
 61 | 			j = i + 1;
 62 | 			index = i * dim + j;
 63 | 			mat_a_full[index] = (double)(j + 1);
 64 | 
 65 | 			if((i + j + 1) % 2 != 0)
 66 | 				mat_a_full[index] = -mat_a_full[index];
 67 | 		}
 68 | 
 69 | 		// diagonal element
 70 | 		j = i;
 71 | 		index = i * dim + j;
 72 | 		mat_a_full[index] = 1.0 / (double)(i + j + 1);
 73 | 		if((i + j + 1) % 2 != 0)
 74 | 			mat_a_full[index] = -mat_a_full[index];
 75 | 
 76 | 		mat_a_full[index] += 2.0;
 77 | 
 78 | 		// lower subdiagonal element
 79 | 		if((i - 1) >= 0)
 80 | 		{
 81 | 			j = i - 1;
 82 | 			index = i * dim + j;
 83 | 			mat_a_full[index] = 1.0 / (double)(i + 1);
 84 | 
 85 | 			if((i + j + 1) % 2 != 0)
 86 | 				mat_a_full[index] = -mat_a_full[index];
 87 | 		}
 88 | 
 89 | 		//vec_x[i] = 1.0 / (double)(i + 1);
 90 | 		vec_x[i] = 1.0 ;
 91 | 	}
 92 | 
 93 | 	// print
 94 | 	for(i = 0; i < dim; i++)
 95 | 	{
 96 | 		for(j = 0; j < dim; j++)
 97 | 			printf("%10.3e ", mat_a_full[i * dim + j]);
 98 | 		printf("\n");
 99 | 	}
100 | 	printf("\n");
101 | 
102 | 	// convert
103 | 	for(j = 0; j < dim; j++)
104 | 	{
105 | 		k = ku - j;
106 | 		for(i = IJMAX(0, j - ku); i < IJMIN(dim, j + kl + 1); i++)
107 | 		{
108 | 			printf("(%d, %d) -> (%d, %d)\n", i, j, k + i, j);
109 | 			mat_a[(k + i) * dim + j] = mat_a_full[i * dim + j];
110 | 		}
111 | 	}
112 | 
113 | 	// print
114 | 	for(i = 0; i < (ku + kl + 1); i++)
115 | 	{
116 | 		for(j = 0; j < dim; j++)
117 | 			printf("%10.3e ", mat_a[i * dim + j]);
118 | 		printf("\n");
119 | 	}
120 | 	printf("\n");
121 | 
122 | 	// size(vec_x) == size(vec_b)
123 | 	inc_vec_x = inc_vec_b = 1;
124 | 
125 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
126 | 	alpha = 1.0;
127 | 	beta = 0.0;
128 | 	//cblas_dgbmv(CblasRowMajor, CblasTrans, dim, dim, kl, ku, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
129 | 	cblas_dgbmv(CblasColMajor, CblasNoTrans, dim, dim, kl, ku, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
130 | 
131 | 	// print
132 | 	for(i = 0; i < dim; i++)
133 | 	{
134 | 		printf("[");
135 | 		for(j = 0; j < dim; j++)
136 | 		{
137 | 			if(j == (i + 1))
138 | 				printf("%10.3e ", mat_a[shift + j]);
139 | 			else if(j == i)
140 | 				printf("%10.3e ", mat_a[shift + dim + j]);
141 | 			else if(j == (i - 1))
142 | 				printf("%10.3e ", mat_a[shift + 2 * dim + j]);
143 | 			else 
144 | 				printf("%10.3e ", 0.0);
145 | 		}
146 | 
147 | 		printf("]  %10.3f = %10.3f\n", vec_x[i], vec_b[i]);
148 | 	}
149 | 
150 | 	// initialize pivot
151 | 	pivot = (lapack_int *)calloc(dim, sizeof(lapack_int));
152 | 
153 | 	// solve A * X = C -> C := X
154 | 	info = LAPACKE_dgbsv(LAPACK_COL_MAJOR, dim, kl, ku, 1, mat_a, kl * 2 + ku + 1, pivot, vec_b, dim);
155 | 
156 | 	printf("info = %d\n", info);
157 | 
158 | 	// print
159 | 	printf("calculated x = \n");
160 | 	for(i = 0; i < dim; i++)
161 | 	{
162 | 		printf("%3d -> %3d: ", i, pivot[i]);
163 | 		printf("%25.17e ", vec_b[i]);
164 | 		printf("\n");
165 | 	}
166 | 
167 | 	// diff
168 | 	printf("x - calculated x = \n");
169 | 	for(i = 0; i < dim; i++)
170 | 	{
171 | 		printf("%3d: ", i);
172 | 		printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i]));
173 | 		printf("\n");
174 | 	}
175 | 	
176 | 	// free
177 | 	free(mat_a);
178 | 	free(vec_x);
179 | 	free(vec_b);
180 | 	free(pivot);
181 | 	free(mat_a_full);
182 | 
183 | 	return EXIT_SUCCESS;
184 | }
185 | 


--------------------------------------------------------------------------------
/linear_eq_dgetrf.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Solver for Linear equation                    */
  4 | /*                          with xGETRF & xGETRS */
  5 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | #include "lapacke.h"
 12 | #include "cblas.h"
 13 | 
 14 | int main()
 15 | {
 16 | 	lapack_int i, j, dim;
 17 | 	lapack_int inc_vec_x, inc_vec_b;
 18 | 	lapack_int *pivot, info;
 19 | 
 20 | 	double *mat_a, *vec_b, *vec_x;
 21 | 	double alpha, beta;
 22 | 	double running_time;
 23 | 
 24 | 	// input dimension of linear equation to be solved
 25 | 	printf("Dim = "); scanf("%d", &dim);
 26 | 
 27 | 	if(dim <= 0)
 28 | 	{
 29 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 30 | 		return EXIT_FAILURE;
 31 | 	}
 32 | 
 33 | 	// initialize a matrix and vectors
 34 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
 35 | 	vec_x = (double *)calloc(dim, sizeof(double));
 36 | 	vec_b = (double *)calloc(dim, sizeof(double));
 37 | 
 38 | 	// input mat_a and vec_x
 39 | 	for(i = 0; i < dim; i++)
 40 | 	{
 41 | 		for(j = 0; j < dim; j++)
 42 | 		{
 43 | 			//mat_a[i * dim + j] = (double)rand() / (double)RAND_MAX;
 44 | 			mat_a[i * dim + j] = 1.0 / (double)(i + j + 1);
 45 | 			if((i + j + 1) % 2 != 0)
 46 | 				mat_a[i * dim + j] = -mat_a[i * dim + j];
 47 | 		}
 48 | 		mat_a[i * dim + i] += 2.0;
 49 | 		vec_x[i] = 1.0 / (double)(i + 1);
 50 | 	}
 51 | 
 52 | 	// size(vec_x) == size(vec_b)
 53 | 	inc_vec_x = inc_vec_b = 1;
 54 | 
 55 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
 56 | 	alpha = 1.0;
 57 | 	beta = 0.0;
 58 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
 59 | 
 60 | 	// print
 61 | 	for(i = 0; i < dim; i++)
 62 | 	{
 63 | 		printf("[");
 64 | 		for(j = 0; j < dim; j++)
 65 | 			printf("%10.3f ", mat_a[i * dim + j]);
 66 | 		printf("]  %10.3f = %10.3f\n", vec_x[i], vec_b[i]);
 67 | 	}
 68 | 
 69 | 	// initialize pivot
 70 | 	pivot = (lapack_int *)calloc(dim, sizeof(lapack_int));
 71 | 
 72 | 	// LU decomposition
 73 | 	info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, dim, dim, mat_a, dim, pivot);
 74 | 	printf("DGETRF info = %d\n", info);
 75 | 
 76 | 	// forward & backward substitution
 77 | 	info = LAPACKE_dgetrs(LAPACK_ROW_MAJOR, 'N', dim, 1, mat_a, dim, pivot, vec_b, 1);
 78 | 	printf("DGETRS info = %d\n", info);
 79 | 
 80 | 	// print
 81 | 	printf("calculated x = \n");
 82 | 	for(i = 0; i < dim; i++)
 83 | 	{
 84 | 		printf("%3d -> %3d: ", i, pivot[i]);
 85 | 		printf("%25.17e ", vec_b[i]);
 86 | 		printf("\n");
 87 | 	}
 88 | 
 89 | 	// diff
 90 | 	printf("x - calculated x = \n");
 91 | 	for(i = 0; i < dim; i++)
 92 | 	{
 93 | 		printf("%3d: ", i);
 94 | 		printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i]));
 95 | 		printf("\n");
 96 | 	}
 97 | 	
 98 | 	// free
 99 | 	free(mat_a);
100 | 	free(vec_x);
101 | 	free(vec_b);
102 | 	free(pivot);
103 | 
104 | 	return EXIT_SUCCESS;
105 | }
106 | 


--------------------------------------------------------------------------------
/linear_eq_dsgesv.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Solver for Linear equation with DSGESV        */
  4 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
  5 | /*************************************************/
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <math.h>
  9 | 
 10 | #include "lapacke.h"
 11 | #include "cblas.h"
 12 | 
 13 | int main()
 14 | {
 15 | 	lapack_int i, j, dim, itimes;
 16 | 	lapack_int inc_vec_x, inc_vec_b;
 17 | 	lapack_int *pivot, info;
 18 | 
 19 | 	double *mat_a, *vec_b, *vec_x, *vec_x_approx;
 20 | 	double alpha, beta;
 21 | 	double running_time;
 22 | 
 23 | 	// input dimension of a linear equation to be solved
 24 | 	printf("Dim = "); scanf("%d", &dim);
 25 | 
 26 | 	if(dim <= 0)
 27 | 	{
 28 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 29 | 		return EXIT_FAILURE;
 30 | 	}
 31 | 
 32 | 	// inisialize a matrix and vectors
 33 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
 34 | 	vec_x = (double *)calloc(dim, sizeof(double));
 35 | 	vec_x_approx = (double *)calloc(dim, sizeof(double));
 36 | 	vec_b = (double *)calloc(dim, sizeof(double));
 37 | 
 38 | 	// input mat_a and vec_x
 39 | 	for(i = 0; i < dim; i++)
 40 | 	{
 41 | 		for(j = 0; j < dim; j++)
 42 | 		{
 43 | 			//mat_a[i * dim + j] = (double)rand() / (double)RAND_MAX;
 44 | 			mat_a[i * dim + j] = 1.0 / (double)(i + j + 1);
 45 | 			if((i + j + 1) % 2 != 0)
 46 | 				mat_a[i * dim + j] = -mat_a[i * dim + j];
 47 | 		}
 48 | 		mat_a[i * dim + i] += 2.0;
 49 | 		vec_x[i] = 1.0 / (double)(i + 1);
 50 | 	}
 51 | 
 52 | 	// size(vec_x) == size(vec_b)
 53 | 	inc_vec_x = inc_vec_b = 1;
 54 | 
 55 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
 56 | 	alpha = 1.0;
 57 | 	beta = 0.0;
 58 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
 59 | 
 60 | 	// print
 61 | 	for(i = 0; i < dim; i++)
 62 | 	{
 63 | 		printf("[");
 64 | 		for(j = 0; j < dim; j++)
 65 | 			printf("%10.3f ", mat_a[i * dim + j]);
 66 | 		printf("]  %10.3f = %10.3f\n", vec_x[i], vec_b[i]);
 67 | 	}
 68 | 
 69 | 	// initialize pivot
 70 | 	pivot = (lapack_int *)calloc(dim, sizeof(lapack_int));
 71 | 
 72 | 	// single-double mixed presicion iterative refinement method
 73 | 	info = LAPACKE_dsgesv(LAPACK_ROW_MAJOR, dim, 1, mat_a, dim, pivot, vec_b, 1, vec_x_approx, 1, &itimes);
 74 | 	printf("DSGESV info = %d, Iterative Times = %d\n", info, itimes);
 75 | 
 76 | 	// print
 77 | 	printf("calculated x = \n");
 78 | 	for(i = 0; i < dim; i++)
 79 | 	{
 80 | 		printf("%3d -> %3d: ", i, pivot[i]);
 81 | 		printf("%25.17e ", vec_x_approx[i]);
 82 | 		printf("\n");
 83 | 	}
 84 | 
 85 | 	// diff
 86 | 	printf("x - calculated x = \n");
 87 | 	for(i = 0; i < dim; i++)
 88 | 	{
 89 | 		printf("%3d: ", i);
 90 | 		printf("%10.2e ", fabs((vec_x[i] - vec_x_approx[i]) / vec_x[i]));
 91 | 		printf("\n");
 92 | 	}
 93 | 	
 94 | 	// free
 95 | 	free(mat_a);
 96 | 	free(vec_x);
 97 | 	free(vec_x_approx);
 98 | 	free(vec_b);
 99 | 	free(pivot);
100 | 
101 | 	return EXIT_SUCCESS;
102 | }
103 | 


--------------------------------------------------------------------------------
/linear_eq_dsposv.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Solver for Linear equation with DSPOSV        */
 4 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
 5 | /*************************************************/
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <math.h>
 9 | 
10 | #include "lapacke.h"
11 | #include "cblas.h"
12 | 
13 | int main()
14 | {
15 | 	lapack_int i, j, dim, itimes;
16 | 	lapack_int inc_vec_x, inc_vec_b;
17 | 	lapack_int info;
18 | 
19 | 	double *mat_a, *vec_b, *vec_x, *vec_x_iter;
20 | 	double alpha, beta;
21 | 	double running_time;
22 | 
23 | 	// input dimension of linear equation to be solved
24 | 	printf("Dim = "); scanf("%d", &dim);
25 | 
26 | 	if(dim <= 0)
27 | 	{
28 | 		printf("Illegal dimension! (dim = %d)\n", dim);
29 | 		return EXIT_FAILURE;
30 | 	}
31 | 
32 | 	// initialize a matrix and vectors
33 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
34 | 	vec_x = (double *)calloc(dim, sizeof(double));
35 | 	vec_x_iter = (double *)calloc(dim, sizeof(double));
36 | 	vec_b = (double *)calloc(dim, sizeof(double));
37 | 
38 | 	// input mat_a and vec_x
39 | 	for(i = 0; i < dim; i++)
40 | 	{
41 | 		for(j = 0; j < dim; j++)
42 | 		{
43 | 			//mat_a[i * dim + j] = (double)rand() / (double)RAND_MAX;
44 | 			mat_a[i * dim + j] = 1.0 / (double)(i + j + 1);
45 | 			if((i + j + 1) % 2 != 0)
46 | 				mat_a[i * dim + j] = -mat_a[i * dim + j];
47 | 		}
48 | 		mat_a[i * dim + i] += 2.0;
49 | 		vec_x[i] = 1.0 / (double)(i + 1);
50 | 	}
51 | 
52 | 	// size(vec_x) == size(vec_b)
53 | 	inc_vec_x = inc_vec_b = 1;
54 | 
55 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
56 | 	alpha = 1.0;
57 | 	beta = 0.0;
58 | 	cblas_dsymv(CblasRowMajor, CblasUpper, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
59 | 
60 | 	// print
61 | 	for(i = 0; i < dim; i++)
62 | 	{
63 | 		printf("[");
64 | 		for(j = 0; j < dim; j++)
65 | 			printf("%10.3f ", mat_a[i * dim + j]);
66 | 		printf("]  %10.3f = %10.3f\n", vec_x[i], vec_b[i]);
67 | 	}
68 | 
69 | 	// solve A * X = C -> C := X
70 | 	info = LAPACKE_dsposv(LAPACK_ROW_MAJOR, 'U', dim, 1, mat_a, dim, vec_b, 1, vec_x_iter, 1, &itimes);
71 | 	printf("DSPOSV info = %d, Iterative Times = %d\n", info, itimes);
72 | 
73 | 	// print
74 | 	printf("calculated x = \n");
75 | 	for(i = 0; i < dim; i++)
76 | 	{
77 | 		//printf("%3d -> %3d: ", i, pivot[i]);
78 | 		printf("%25.17e ", vec_x_iter[i]);
79 | 		printf("\n");
80 | 	}
81 | 
82 | 	// diff
83 | 	printf("x - calculated x = \n");
84 | 	for(i = 0; i < dim; i++)
85 | 	{
86 | 		printf("%3d: ", i);
87 | 		printf("%10.2e ", fabs((vec_x[i] - vec_x_iter[i]) / vec_x[i]));
88 | 		printf("\n");
89 | 	}
90 | 	
91 | 	// free
92 | 	free(mat_a);
93 | 	free(vec_x);
94 | 	free(vec_x_iter);
95 | 	free(vec_b);
96 | 
97 | 	return EXIT_SUCCESS;
98 | }
99 | 


--------------------------------------------------------------------------------
/linear_eq_dsysv.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/linear_eq_dsysv.c


--------------------------------------------------------------------------------
/linear_eq_magma.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Solver for Linear equation with MAGMA         */
  4 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
  5 | /*************************************************/
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <math.h>
  9 | 
 10 | // BLAS on CPU
 11 | #include "cblas.h"
 12 | 
 13 | // CUDA & MAGMA
 14 | #include "cuda.h"
 15 | #include "magma.h"
 16 | 
 17 | int main()
 18 | {
 19 | 	int i, j, dim;
 20 | 	int inc_vec_x, inc_vec_b;
 21 | 	int *pivot, info;
 22 | 
 23 | 	double *mat_a, *vec_b, *vec_x; // CPU
 24 | 	double alpha, beta;
 25 | 
 26 | 	// input dimension
 27 | 	printf("Dim = "); scanf("%d", &dim);
 28 | 
 29 | 	if(dim <= 0)
 30 | 	{
 31 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 32 | 		return EXIT_FAILURE;
 33 | 	}
 34 | 
 35 | 	// initialize a matrix and vectors on CPU
 36 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
 37 | 	vec_x = (double *)calloc(dim, sizeof(double));
 38 | 	vec_b = (double *)calloc(dim, sizeof(double));
 39 | 
 40 | 	// input mat_a and vec_x on CPU
 41 | 	for(j = 0; j < dim; j++)
 42 | 	{
 43 | 		// Column-major
 44 | 		for(i = 0; i < dim; i++)
 45 | 		{
 46 | 			mat_a[i + j * dim] = (double)rand() / (double)RAND_MAX;
 47 | 			if(rand() % 2 != 0)
 48 | 				mat_a[i + j * dim] = -mat_a[i + j * dim];
 49 | 		}
 50 | 		vec_x[j] = 1.0 / (double)(j + 1);
 51 | 	}
 52 | 
 53 | 	// size(vec_x) == size(vec_b)
 54 | 	inc_vec_x = inc_vec_b = 1;
 55 | 
 56 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
 57 | 	alpha = 1.0;
 58 | 	beta = 0.0;
 59 | 	cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
 60 | 
 61 | 	// print
 62 | /*	for(i = 0; i < dim; i++)
 63 | 	{
 64 | 		printf("[");
 65 | 		for(j = 0; j < dim; j++)
 66 | 			printf("%10.3f ", mat_a[i + j * dim]);
 67 | 		printf("]  %10.3f = %10.3f\n", vec_x[i], vec_b[i]);
 68 | 	}
 69 | */
 70 | 	// start MAGMA
 71 | 	magma_init();
 72 | 
 73 | 	// initialize pivot area
 74 | 	pivot = (int *)calloc(sizeof(int), dim);
 75 | 
 76 | 	// solve A * X = C -> C := X
 77 | 	magma_dgesv(dim, 1, mat_a, dim, pivot, vec_b, dim, &info);
 78 | 
 79 | 	printf("info = %d\n", info);
 80 | 
 81 | 	// print
 82 | /*	printf("calculated x = \n");
 83 | 	for(i = 0; i < dim; i++)
 84 | 	{
 85 | 		printf("%3d -> %3d: ", i, pivot[i]);
 86 | 		printf("%25.17e ", vec_b[i]);
 87 | 		printf("\n");
 88 | 	}
 89 | 
 90 | 	// diff
 91 | 	printf("x - calculated x = \n");
 92 | 	for(i = 0; i < dim; i++)
 93 | 	{
 94 | 		printf("%3d: ", i);
 95 | 		printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i]));
 96 | 		printf("\n");
 97 | 	}
 98 | */
 99 | 	// norm relative error: ||vec_b - vec_x||_2 / ||vec_x||_2
100 | 	cblas_daxpy(dim, -1.0, vec_x, 1, vec_b, 1);
101 | 	printf("||x - calculated x||_2 = %10.7e\n", cblas_dnrm2(dim, vec_b, 1) / cblas_dnrm2(dim, vec_x, 1));
102 | 
103 | 	// free
104 | 	free(mat_a);
105 | 	free(vec_x);
106 | 	free(vec_b);
107 | 	free(pivot);
108 | 
109 | 	// finalize MAGMA
110 | 	magma_finalize();
111 | 
112 | 	return EXIT_SUCCESS;
113 | }
114 | 


--------------------------------------------------------------------------------
/matvec_mul.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/matvec_mul.c


--------------------------------------------------------------------------------
/matvec_mul_cublas.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Mutiplication of matrix and vector            */
  4 | /*                                   with cuBLAS */
  5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | // BLAS on CPU
 12 | #include "cblas.h"
 13 | 
 14 | // define mycuda_calloc and mycuda_free functions
 15 | #include "mycuda.h" // CUDA
 16 | 
 17 | #include "cublas_v2.h" // cuBLAS
 18 | 
 19 | int main()
 20 | {
 21 | 	int i, j, dim;
 22 | 	int inc_vec_x, inc_vec_b;
 23 | 
 24 | 	double *mat_a, *vec_b, *vec_x, *vec_b_gpu; // CPU
 25 | 	double *dev_mat_a, *dev_vec_b, *dev_vec_x; // GPU
 26 | 	double alpha, beta;
 27 | 
 28 | 	// variables for cuBLAS
 29 | 	cublasStatus_t status;
 30 | 	cublasHandle_t handle;
 31 | 
 32 | 	// input dimension
 33 | 	printf("Dim = "); scanf("%d", &dim);
 34 | 
 35 | 	if(dim <= 0)
 36 | 	{
 37 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 38 | 		return EXIT_FAILURE;
 39 | 	}
 40 | 
 41 | 	// initialize a matrix and vectors on CPU
 42 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
 43 | 	vec_x = (double *)calloc(dim, sizeof(double));
 44 | 	vec_b = (double *)calloc(dim, sizeof(double));
 45 | 
 46 | 	// input mat_a and vec_x
 47 | 	for(j = 0; j < dim; j++)
 48 | 	{
 49 | 		for(i = 0; i < dim; i++)
 50 | 		{
 51 | 			// column-major
 52 | 			mat_a[i + j * dim] = (double)rand() / (double)RAND_MAX;
 53 | 			if(rand() % 2 != 0)
 54 | 				mat_a[i + j * dim] *= -1.0;
 55 | 		}
 56 | 		vec_x[j] = 1.0 / (double)(j + 1);
 57 | 	}
 58 | 
 59 | 	// size(vec_x) == size(vec_b)
 60 | 	inc_vec_x = inc_vec_b = 1;
 61 | 
 62 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
 63 | 	alpha = 1.0;
 64 | 	beta = 0.0;
 65 | 	cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
 66 | 
 67 | 	// print
 68 | /*	for(i = 0; i < dim; i++)
 69 | 	{
 70 | 		printf("[");
 71 | 		for(j = 0; j < dim; j++)
 72 | 			printf("%10.3lf ", mat_a[i + j * dim]);
 73 | 		printf("]  %10.3lf = %10.3lf\n", vec_x[i], vec_b[i]);
 74 | 	}
 75 | */
 76 | 	// GPU
 77 | 
 78 | 	// initialize a matrix and vectors on GPU
 79 | 	dev_mat_a = (double *)mycuda_calloc(dim * dim, sizeof(double));
 80 | 	dev_vec_x = (double *)mycuda_calloc(dim, sizeof(double));
 81 | 	dev_vec_b = (double *)mycuda_calloc(dim, sizeof(double));
 82 | 
 83 | 	// dev_vec_x on GPU -> vec_x_gpu on CPU
 84 | 	vec_b_gpu = (double *)calloc(dim, sizeof(double));
 85 | 
 86 | 	// CPU(Host) -> GPU(device)
 87 | 
 88 | 	// start cuBLAS
 89 | 	status = cublasCreate(&handle);
 90 | 
 91 | 	if(status != CUBLAS_STATUS_SUCCESS)
 92 | 	{
 93 | 		printf("Fail to initialize cuBLAS!\n");
 94 | 
 95 | 		mycuda_free(dev_mat_a);
 96 | 		mycuda_free(dev_vec_b);
 97 | 		mycuda_free(dev_vec_x);
 98 | 		cublasDestroy(handle);
 99 | 
100 | 		return 0;
101 | 	}
102 | 
103 | 	// mat_a -> dev_mat_a
104 | 	status = cublasSetMatrix(dim, dim, sizeof(double), mat_a, dim, dev_mat_a, dim);
105 | 	if(status != CUBLAS_STATUS_SUCCESS)
106 | 		printf("mat_a -> dev_mat_a: cublasSetMatrix failed.\n");
107 | 
108 | 	// size(vec_x) == size(vec_b)
109 | 	inc_vec_x = inc_vec_b = 1;
110 | 
111 | 	// vec_x -> dev_vec_x
112 | 	status = cublasSetVector(dim, sizeof(double), vec_x, inc_vec_x, dev_vec_x, inc_vec_x);
113 | 	if(status != CUBLAS_STATUS_SUCCESS)
114 | 		printf("vec_x -> dev_vec_x: cublasSetVector failed.\n");
115 | 
116 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
117 | 	alpha = 1.0;
118 | 	beta = 0.0;
119 | 	status = cublasDgemv(handle, CUBLAS_OP_N, dim, dim, &alpha, dev_mat_a, dim, dev_vec_x, inc_vec_x, &beta, dev_vec_b, inc_vec_b);
120 | 
121 | 	if(status != CUBLAS_STATUS_SUCCESS)
122 | 		printf("cublasDgemv failed.\n");
123 | 
124 | 	// synchronize
125 | 	cudaDeviceSynchronize(handle);
126 | 
127 | 	// dev_vec_b -> vec_b_gpu
128 | 	status = cublasGetVector(dim, sizeof(double), dev_vec_b, inc_vec_b, vec_b_gpu, inc_vec_b);
129 | 	if(status != CUBLAS_STATUS_SUCCESS)
130 | 		printf("dev_vec_b -> vec_b_gpu: cublasGetVector failed.\n");
131 | 
132 | 	// print
133 | /*	for(i = 0; i < dim; i++)
134 | 	{
135 | 		printf("[");
136 | 		for(j = 0; j < dim; j++)
137 | 			printf("%10.3lf ", mat_a[i + j * dim]);
138 | 		printf("]  %10.3lf = %10.3lf\n", vec_x[i], vec_b_gpu[i]);
139 | 	}
140 | */
141 | 	// relative difference: ||vec_b_gpu - vec_b||_2 / ||vec_b||_2
142 | 	cblas_daxpy(dim, -1.0, vec_b, inc_vec_b, vec_b_gpu, inc_vec_b);
143 | 	printf("||vec_b_gpu - vec_b||_2 / ||vec_b||_2 = %15.7e\n", cblas_dnrm2(dim, vec_b_gpu, 1) / cblas_dnrm2(dim, vec_b, 1));
144 | 
145 | 	// free
146 | 	free(mat_a);
147 | 	free(vec_x);
148 | 	free(vec_b);
149 | 	free(vec_b_gpu);
150 | 
151 | 	mycuda_free(dev_mat_a);
152 | 	mycuda_free(dev_vec_b);
153 | 	mycuda_free(dev_vec_x);
154 | 
155 | 	return EXIT_SUCCESS;
156 | }
157 | 


--------------------------------------------------------------------------------
/matvec_mul_magma.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/matvec_mul_magma.c


--------------------------------------------------------------------------------
/matvec_mul_magma_pure.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Mutiplication of matrix and vector            */
  4 | /*                                with magmablas */
  5 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | // BLAS on CPU
 12 | #include "cblas.h"
 13 | 
 14 | // MAGMA
 15 | #include "magma.h"
 16 | 
 17 | int main()
 18 | {
 19 | 	int i, j, dim;
 20 | 	int inc_vec_x, inc_vec_b;
 21 | 
 22 | 	double *mat_a, *vec_b, *vec_x, *vec_b_gpu; // CPU
 23 | 	double *dev_mat_a, *dev_vec_b, *dev_vec_x; // GPU
 24 | 	double alpha, beta;
 25 | 
 26 | 	// input dimension
 27 | 	printf("Dim = "); scanf("%d", &dim);
 28 | 
 29 | 	if(dim <= 0)
 30 | 	{
 31 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 32 | 		return EXIT_FAILURE;
 33 | 	}
 34 | 
 35 | 	// start MAGMA
 36 | 	magma_init();
 37 | 
 38 | 	// initialize a matrix and vectors on CPU
 39 | 	magma_dmalloc_cpu(&mat_a, dim * dim);
 40 | 	magma_dmalloc_cpu(&vec_b, dim);
 41 | 	magma_dmalloc_cpu(&vec_x, dim);
 42 | 
 43 | 	// input mat_a and vec_x
 44 | 	for(j = 0; j < dim; j++)
 45 | 	{
 46 | 		for(i = 0; i < dim; i++)
 47 | 		{
 48 | 			// column-major
 49 | 			mat_a[i + j * dim] = (double)rand() / (double)RAND_MAX;
 50 | 			if(rand() % 2 != 0)
 51 | 				mat_a[i + j * dim] *= -1.0;
 52 | 		}
 53 | 		vec_x[j] = 1.0 / (double)(j + 1);
 54 | 	}
 55 | 
 56 | 	// size(vec_x) == size(vec_b)
 57 | 	inc_vec_x = inc_vec_b = 1;
 58 | 
 59 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
 60 | 	alpha = 1.0;
 61 | 	beta = 0.0;
 62 | 	cblas_dgemv(CblasColMajor, CblasNoTrans, dim, dim, alpha, mat_a, dim, vec_x, inc_vec_x, beta, vec_b, inc_vec_b);
 63 | 
 64 | 	// print
 65 | 	for(i = 0; i < dim; i++)
 66 | 	{
 67 | 		printf("[");
 68 | 		for(j = 0; j < dim; j++)
 69 | 			printf("%10.3lf ", mat_a[i + j * dim]);
 70 | 		printf("]  %10.3lf = %10.3lf\n", vec_x[i], vec_b[i]);
 71 | 	}
 72 | 
 73 | 	// GPU
 74 | 
 75 | 	// initialize a matrix and vectors on GPU (MAGMA)
 76 | 	magma_dmalloc(&dev_mat_a, dim * dim);
 77 | 	magma_dmalloc(&dev_vec_x, dim);
 78 | 	magma_dmalloc(&dev_vec_b, dim);
 79 | 
 80 | 	// dev_vec_x on GPU -> vec_x_gpu on CPU
 81 | 	magma_dmalloc_cpu(&vec_b_gpu, dim);
 82 | 
 83 | 	// CPU(Host) -> GPU(device)
 84 | 
 85 | 	// mat_a -> dev_mat_a
 86 | 	magma_dsetmatrix(dim, dim, mat_a, dim, dev_mat_a, dim);
 87 | 
 88 | 	// size(vec_x) == size(vec_b)
 89 | 	inc_vec_x = inc_vec_b = 1;
 90 | 
 91 | 	// vec_x -> dev_vec_x
 92 | 	magma_dsetvector(dim, vec_x, inc_vec_x, dev_vec_x, inc_vec_x);
 93 | 
 94 | 	// vec_b := 1.0 * mat_a * vec_x + 0.0 * vec_b
 95 | 	alpha = 1.0;
 96 | 	beta = 0.0;
 97 | 	magmablas_dgemv(MagmaNoTrans, dim, dim, alpha, dev_mat_a, dim, dev_vec_x, inc_vec_x, beta, dev_vec_b, inc_vec_b);
 98 | 
 99 | 	// dev_vec_b -> vec_b_gpu
100 | 	magma_dgetvector(dim, dev_vec_b, inc_vec_b, vec_b_gpu, inc_vec_b);
101 | 
102 | 	// print
103 | 	for(i = 0; i < dim; i++)
104 | 	{
105 | 		printf("[");
106 | 		for(j = 0; j < dim; j++)
107 | 			printf("%10.3lf ", mat_a[i + j * dim]);
108 | 		printf("]  %10.3lf = %10.3lf\n", vec_x[i], vec_b_gpu[i]);
109 | 	}
110 | 
111 | 	// relative difference: ||vec_b_gpu - vec_b||_2 / ||vec_b||_2
112 | 	cblas_daxpy(dim, -1.0, vec_b, inc_vec_b, vec_b_gpu, inc_vec_b);
113 | 	printf("||vec_b_gpu - vec_b||_2 / ||vec_b||_2 = %15.7e\n", cblas_dnrm2(dim, vec_b_gpu, 1) / cblas_dnrm2(dim, vec_b, 1));
114 | 
115 | 	// free
116 | 	magma_free_cpu(mat_a);
117 | 	magma_free_cpu(vec_x);
118 | 	magma_free_cpu(vec_b);
119 | 	magma_free_cpu(vec_b_gpu);
120 | 
121 | 	magma_free(dev_mat_a);
122 | 	magma_free(dev_vec_b);
123 | 	magma_free(dev_vec_x);
124 | 
125 | 	// finalize MAGMA
126 | 	magma_finalize();
127 | 
128 | 	return EXIT_SUCCESS;
129 | }
130 | 


--------------------------------------------------------------------------------
/mm/diagtest.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix coordinate real general
 2 | %-------------------------------------------------------------------------------
 3 | % Test Sparse Matrix
 4 | % [ 5 0 0 0 0 ]
 5 | % [ 0 6 0 0 0 ]
 6 | % [ 0 0 7 0 0 ]
 7 | % [ 0 0 0 8 0 ]
 8 | % [ 0 0 0 0 9 ]
 9 | %-------------------------------------------------------------------------------
10 | 5 5 5
11 | 1 1 5
12 | 2 2 6
13 | 3 3 7
14 | 4 4 8
15 | 5 5 9
16 | 


--------------------------------------------------------------------------------
/mm/matrix_market_io.h:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* I/O functions for MatrixMarcket format        */
  3 | /*                                               */
  4 | /* Version 0.1: 2015-04-15(Wed)                  */
  5 | /*                                               */
  6 | /* Original code:                                */
  7 | /* http://math.nist.gov/MatrixMarket/mmio-c.html */
  8 | /* *************** Public Domain *************** */
  9 | /*************************************************/
 10 | 
 11 | #ifndef __MATRIX_MARKET_IO_H__
 12 | #define __MATRIX_MARKET_IO_H__
 13 | 
 14 | #include <stdio.h>
 15 | #include <stdlib.h>
 16 | #include <string.h>
 17 | 
 18 | // Max. length of string per one line
 19 | #define MM_MAX_LINE_LEN 4096
 20 | 
 21 | // Max. length of token
 22 | #define MM_MAX_TOKEN_LEN 128
 23 | 
 24 | // Bunner of Matrix Market format
 25 | #define MM_BANNER "%%MatrixMarket"
 26 | 
 27 | // return codes
 28 | #define MM_ERROR  (-1)
 29 | #define MM_SUCCESS (0)
 30 | 
 31 | // boolean values
 32 | #define MM_TRUE  (1)
 33 | #define MM_FALSE (0)
 34 | 
 35 | // The number of kinds of Matrix Market typecode
 36 | #define MM_NUM_TYPECODE 4
 37 | 
 38 | // Matrix Type code
 39 | typedef unsigned char MM_typecode[MM_NUM_TYPECODE];
 40 | 
 41 | /******************************************************/
 42 | /* Matrix Market Internal definitions of Matrix types */
 43 | /* type[4] ... 4-character sequence                   */
 44 | /*                                                    */
 45 | /* Object    Sparse                  Storage          */
 46 | /*           or Dense  Data Type     Scheme           */
 47 | /*                                                    */
 48 | /* type[0]   type[1]    type[2]      type[3]          */
 49 | /*                                                    */
 50 | /* M(atrix)  C(oord)    R(real)      G(eneral)        */
 51 | /*           A(rray)    C(omplex)    H(ermitian)      */
 52 | /*                      P(attern)    S(ymmetric)      */
 53 | /*                      I(nteger) (s)K(ew-symmetric)  */
 54 | /******************************************************/
 55 | 
 56 | #define MM_MATRIX_STR		"matrix"
 57 | #define mm_is_matrix(typecode)		((typecode)[0] == 'M')
 58 | #define mm_set_matrix(typecode)		((*typecode)[0] = 'M')
 59 | 
 60 | #define MM_SPARSE_STR		"coordinate"
 61 | #define mm_is_sparse(typecode)		((typecode)[1] == 'C')
 62 | #define mm_set_sparse(typecode)		((*typecode)[1] = 'C')
 63 | 
 64 | #define MM_COODINATE_STR	"coordinate"
 65 | #define mm_is_coodinate(typecode)	((typecode)[1] == 'C')
 66 | #define mm_set_coodinate(typecode)	((*typecode)[1] = 'C')
 67 | 
 68 | #define MM_DENSE_STR		"array"
 69 | #define mm_is_dense(typecode)		((typecode)[1] == 'A')
 70 | #define mm_set_dense(typecode)		((*typecode)[1] = 'A')
 71 | 
 72 | #define MM_ARRAY_STR		"array"
 73 | #define mm_is_array(typecode)		((typecode)[1] == 'A')
 74 | #define mm_set_array(typecode)		((*typecode)[1] = 'A')
 75 | 
 76 | #define MM_REAL_STR			"real"
 77 | #define mm_is_real(typecode)		((typecode)[2] == 'R')
 78 | #define mm_set_real(typecode)		((*typecode)[2] = 'R')
 79 | 
 80 | #define MM_COMPLEX_STR		"complex"
 81 | #define mm_is_complex(typecode)		((typecode)[2] == 'C')
 82 | #define mm_set_complex(typecode)	((*typecode)[2] = 'C')
 83 | 
 84 | #define MM_PATTERN_STR		"pattern"
 85 | #define mm_is_pattern(typecode)		((typecode)[2] == 'P')
 86 | #define mm_set_pattern(typecode)	((*typecode)[2] = 'P')
 87 | 
 88 | #define MM_INTEGER_STR		"integer"
 89 | #define mm_is_integer(typecode)		((typecode)[2] == 'I')
 90 | #define mm_set_integer(typecode)	((*typecode)[2] = 'I')
 91 | 
 92 | #define MM_GENERAL_STR		"general"
 93 | #define mm_is_general(typecode)		((typecode)[3] == 'G')
 94 | #define mm_set_general(typecode)	((*typecode)[3] = 'G')
 95 | 
 96 | #define MM_HERMITIAN_STR	"hermitian"
 97 | #define mm_is_hermitian(typecode)	((typecode)[3] == 'H')
 98 | #define mm_set_hermitian(typecode)	((*typecode)[3] = 'H')
 99 | 
100 | #define MM_SYMMETRIC_STR	"symmetric"
101 | #define mm_is_symmetric(typecode)	((typecode)[3] == 'S')
102 | #define mm_set_symmetric(typecode)	((*typecode)[3] = 'S')
103 | 
104 | #define MM_SKEW_STR			"skew-symmetric"
105 | #define mm_is_skew(typecode)		((typecode)[3] == 'K')
106 | #define mm_set_skew(typecode)		((*typecode)[3] = 'K')
107 | 
108 | /* clear type codes */
109 | #define mm_clear_typecode(typecode)	((*typecode)[0] = (*typecode)[1] = (*typecode)[2] = ' ', (*typecode)[3] = 'G')
110 | #define mm_initialize_typecode(typecode) mm_clear_typecode(typecode)
111 | 
112 | /* error codes */
113 | #define MM_COULD_NOT_READ_FILE	11
114 | #define MM_COULD_NOT_READ_FILE_STR	"MM_ERROR: cannot read the file!"
115 | 
116 | #define MM_COULD_NOT_WRITE_FILE	12
117 | #define MM_COULD_NOT_WRITE_FILE_STR	"MM_ERROR: cannot write the file!"
118 | 
119 | #define MM_NOT_MATRIX			13
120 | #define MM_NOT_MATRIX_STR			"MM_ERROR: not matrix!"
121 | 
122 | #define MM_NO_HEADER			14
123 | #define MM_NO_HEADER_STR			"MM_ERROR: cannot find MM header!"
124 | 
125 | #define MM_UNSUPPORTED_TYPE		15
126 | #define MM_UNSUPPORTED_TYPE_STR		"MM_ERROR: is unsupported type!"
127 | 
128 | #define MM_LINE_TOO_LONG		16
129 | #define MM_LINE_TOO_LONG_STR		"MM_ERROR: length of line exceeded!"
130 | 
131 | #define MM_PREMATURE_EOF		17
132 | #define MM_PREMATURE_EOF_STR		"MM_ERROR: premature EOF!"
133 | 
134 | #define MM_UNDEFINED_ERROR		18
135 | #define MM_UNDEFINED_ERROR_STR		"MM_ERROR: cannot specify the king of errors!"
136 | 
137 | /******************************************/
138 | /* Functions defined in mmio              */
139 | /*                                        */
140 | /* mm_is_valid                            */
141 | /* mm_typecode_to_str                     */
142 | /* mm_read_banner                         */
143 | /* mm_read_mtx_crd_size                   */
144 | /* mm_write_crd_size                      */
145 | /* mm_read_unsymmetric_sparse (obsolete?) */
146 | /* mm_read_mtx_array_size                 */
147 | /* mm_write_mtx_array_size                */
148 | /* mm_read_mtx_crd_data                   */
149 | /* mm_read_mtx_crd_entry                  */
150 | /* mm_read_mtx_crd                        */
151 | /* mm_write_mtx_crd                       */
152 | /******************************************/
153 | #ifdef __cplusplus
154 | extern "C" {
155 | #endif // __cplusplus
156 | 
157 | // NULL or print str
158 | unsigned char *mm_put_str(unsigned char *str);
159 | 
160 | /* print error messages */
161 | int mm_print_error(unsigned char *str, int error_code);
162 | 
163 | // convert type codes to strings
164 | unsigned char *mm_typecode_to_str(MM_typecode matcode);
165 | 
166 | // check matcode
167 | int mm_is_valid(MM_typecode matcode);
168 | 
169 | // read banner
170 | int mm_read_banner(FILE *fp, MM_typecode *matcode);
171 | 
172 | // write banner
173 | int mm_write_banner(FILE *fp, MM_typecode matcode);
174 | 
175 | // read coodinate matrix size
176 | int mm_read_mtx_crd_size(FILE *fp, int *row_dim, int *col_dim, int *num_nonzeros);
177 | 
178 | // write coodinate matrix size
179 | int mm_write_crd_size(FILE *fp, int row_dim, int col_dim, int num_nonzeros);
180 | 
181 | // read unsymmetric double precision sparse matrix
182 | // I cannot understand thre reason why this function is nessesary. For practical training ?
183 | int mm_read_unsymmetric_sparse(const char *fname, int *row_dim, int *col_dim, int *num_nonzeros, double **val, int **row_index, int **col_index);
184 | 
185 | // read mtx array size
186 | int mm_read_mtx_array_size(FILE *fp, int *row_dim, int *col_dim);
187 | 
188 | // write mtx array size
189 | int mm_write_mtx_array_size(FILE *fp, int row_dim, int col_dim);
190 | 
191 | // read coodinate matrix when row_index, col_index, val are already allocated
192 | int mm_read_mtx_crd_data(FILE *fp, int row_dim, int col_dim, int num_nonzeros, int row_index[], int col_index[], double val[], MM_typecode matcode);
193 | 
194 | // read one line(entry) of coodinate matrix 
195 | int mm_read_mtx_crd_entry(FILE *fp, int *row_index, int *col_index, double *real, double *imag, MM_typecode matcode);
196 | 
197 | // read any kinds of coodinate matrix
198 | // After finishing to read the whole data, mm_read_mtx_crd fills
199 | // row_dim, col_dim, num_nonzeros, row_index, col_index and array of values,
200 | // and return typecode such as "MCRS" which means "Matrix, Coordinate, Real, Symmetric".
201 | int mm_read_mtx_crd(const char *fname, int *row_dim, int *col_dim, int *num_nonzeros, int **row_index, int **col_index, double **val, MM_typecode *matcode);
202 | 
203 | // write mtx format file
204 | int mm_write_mtx_crd(unsigned char *fname, int row_dim, int col_dim, int num_nonzeros, int row_index[], int col_index[], double val[], MM_typecode matcode);
205 | 
206 | /******************************************/
207 | /* Functions newly defined                */
208 | /*                                        */
209 | /* mm_read_mtx_array_data                 */
210 | /* mm_read_mtx_array_entry                */
211 | /* mm_read_mtx_array                      */
212 | /* mm_write_mtx_array                     */
213 | /* mm_print_header_mtx_crd                */
214 | /* mm_print_header_mtx_array              */
215 | /******************************************/
216 | 
217 | // read array type matrix when val is already allocated
218 | int mm_read_mtx_array_data(FILE *fp, int row_dim, int col_dim, double val[], MM_typecode matcode);
219 | 
220 | // read one line(entry) of coodinate matrix 
221 | int mm_read_mtx_array_entry(FILE *fp, double *real, double *imag, MM_typecode matcode);
222 | 
223 | // read general real or complex array matrix
224 | // After finishing to read the whole data, mm_read_mtx_crd fills
225 | // row_dim, col_dim and array of values,
226 | // and return typecode such as "MARG" which means "Matrix, Array, Real, General".
227 | int mm_read_mtx_array(const char *fname, int *row_dim, int *col_dim, double **val, MM_typecode *matcode);
228 | 
229 | // write mtx format file
230 | int mm_write_mtx_array(unsigned char *fname, int row_dim, int col_dim, double val[], MM_typecode matcode);
231 | 
232 | // read any kinds of coodinate matrix within specified number of lines
233 | int mm_print_header_mtx_crd(const char *fname, int num_lines_print);
234 | 
235 | // read any kinds of coodinate matrix within specified number of lines
236 | int mm_print_header_mtx_array(const char *fname, int num_lines_print);
237 | 
238 | #ifdef __cplusplus
239 | } // extern "C"
240 | #endif // __cplusplus
241 | 
242 | #endif // __MATRIX_MARKET_IO_H__
243 | 


--------------------------------------------------------------------------------
/mm/sptest.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix coordinate real general
 2 | %-------------------------------------------------------------------------------
 3 | % Test Sparse Matrix
 4 | % [ 5 0 1 ]
 5 | % [ 0 6 0 ]
 6 | % [ 0 1 7 ]
 7 | %-------------------------------------------------------------------------------
 8 | 3 3 5
 9 | 1 1 5.0
10 | 2 2 6.0
11 | 3 2 1.0
12 | 1 3 1.0
13 | 3 3 7.0
14 | 


--------------------------------------------------------------------------------
/mm/sptest_array.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix array real general
 2 | %-------------------------------------------------------------------------------
 3 | % Test Sparse Matrix
 4 | % [ 5 0 1 ]
 5 | % [ 0 6 0 ]
 6 | % [ 0 1 7 ]
 7 | %-------------------------------------------------------------------------------
 8 | 3 3
 9 | 5.0
10 | 0
11 | 0
12 | 0.0
13 | 6.0
14 | 1.0
15 | 1.0
16 | 0.0
17 | 7.0
18 | 


--------------------------------------------------------------------------------
/mm/sptest_big.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix coordinate real general
 2 | %-------------------------------------------------------------------------------
 3 | % Test Sparse Matrix
 4 | % [ 5 0 1 0  0 ]
 5 | % [ 0 6 0 0 -1 ]
 6 | % [ 0 1 7 0  0 ]
 7 | % [ 0 1 0 8  0 ]
 8 | % [ 0 0 1 0  9 ]
 9 | %-------------------------------------------------------------------------------
10 | 5 5 10
11 | 1 1 5
12 | 1 3 1
13 | 2 2 6
14 | 2 5 -1
15 | 3 2 1
16 | 3 3 7
17 | 4 2 1
18 | 4 4 8
19 | 5 3 1
20 | 5 5 9
21 | 


--------------------------------------------------------------------------------
/mm/sptest_pattern.mtx:
--------------------------------------------------------------------------------
 1 | %%MatrixMarket matrix coordinate pattern general
 2 | %-------------------------------------------------------------------------------
 3 | % Test Sparse Matrix
 4 | % [ 1 0 1 ]
 5 | % [ 0 1 0 ]
 6 | % [ 0 1 1 ]
 7 | %-------------------------------------------------------------------------------
 8 | 3 3 5
 9 | 1 1
10 | 2 2
11 | 3 2
12 | 1 3
13 | 3 3
14 | 


--------------------------------------------------------------------------------
/my_linear_eq.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Solver for Linear equation                    */
  4 | /*                        with original routines */
  5 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | // rowwise only
 12 | // my_matvec_mul: vec_b := mat_a * vec_x
 13 | void my_matvec_mul(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x)
 14 | {
 15 | 	int i, j, row_index;
 16 | 
 17 | 	// main loop
 18 | 	for(i = 0; i < row_dim; i++)
 19 | 	{
 20 | 		vec_b[i] = 0.0;
 21 | 		row_index = row_dim * i;
 22 | 
 23 | 		for(j = 0; j < col_dim; j++)
 24 | 			vec_b[i] += mat_a[row_index + j] * vec_x[j];
 25 | 	}
 26 | }
 27 | 
 28 | // rowwise only
 29 | // my_linear_eq_solve: solve mat_a * x = vec_b in x -> vec_b := x
 30 | int my_linear_eq_solve(double *mat_a, int dim, int *pivot, double *vec_b)
 31 | {
 32 | 	int i, j, k, row_index_j, row_index_i, max_j, tmp_index;
 33 | 	double absmax_aji, abs_aji, pivot_aii, vec_x;
 34 | 
 35 | 	// initialize pivot vector
 36 | 	for(i = 0; i < dim; i++)
 37 | 		pivot[i] = i;
 38 | 
 39 | 	// forward 
 40 | 	for(i = 0; i < dim; i++)
 41 | 	{
 42 | 		// partial pivoting
 43 | 		absmax_aji = fabs(mat_a[pivot[i] * dim + i]);
 44 | 		max_j = i;
 45 | 		for(j = i + 1; j < dim; j++)
 46 | 		{
 47 | 			abs_aji = mat_a[pivot[j] * dim + i];
 48 | 			if(absmax_aji < abs_aji)
 49 | 			{
 50 | 				max_j = j;
 51 | 				absmax_aji = abs_aji;
 52 | 			}
 53 | 		}
 54 | 		if(max_j != i)
 55 | 		{
 56 | 			tmp_index = pivot[max_j];
 57 | 			pivot[max_j] = pivot[i];
 58 | 			pivot[i] = tmp_index;
 59 | 		}
 60 | 
 61 | 		// select pivoted column
 62 | 		row_index_i = pivot[i] * dim;
 63 | 		pivot_aii = mat_a[row_index_i + i];
 64 | 
 65 | 		// error
 66 | 		if(fabs(pivot_aii) <= 0.0)
 67 | 			return -1;
 68 | 
 69 | 		for(j = i + 1; j < dim; j++)
 70 | 		{
 71 | 			row_index_j = pivot[j] * dim;
 72 | 			mat_a[row_index_j + i] /= pivot_aii;
 73 | 
 74 | 			for(k = i + 1; k < dim; k++)
 75 | 				mat_a[row_index_j + k] -= mat_a[row_index_j + i] * mat_a[row_index_i + k];
 76 | 		}
 77 | 	}
 78 | 	
 79 | 	// forward substitution
 80 | 	for(j = 0; j < dim; j++)
 81 | 	{
 82 | 		vec_x = vec_b[pivot[j]];
 83 | 		for(i = j + 1; i < dim; i++)
 84 | 			vec_b[pivot[i]] -= mat_a[pivot[i] * dim + j] * vec_x;
 85 | 	}
 86 | 
 87 | 	// backward substitution
 88 | 	for(i = dim - 1; i >= 0; i--)
 89 | 	{
 90 | 		vec_x = vec_b[pivot[i]];
 91 | 		row_index_i = pivot[i] * dim;
 92 | 		for(j = i + 1; j < dim; j++)
 93 | 			vec_x -= mat_a[row_index_i + j] * vec_b[pivot[j]];
 94 | 
 95 | 		vec_b[pivot[i]] = vec_x / mat_a[row_index_i + i];
 96 | 	}
 97 | 
 98 | 	// reordering
 99 | 	for(i = 0; i < dim; i++)
100 | 	{
101 | 		if(pivot[i] != i)
102 | 		{
103 | 			for(j = i + 1; j < dim; j++)
104 | 			{
105 | 				if(pivot[j] == i)
106 | 				{
107 | 					vec_x = vec_b[pivot[i]];
108 | 					vec_b[pivot[i]] = vec_b[i];
109 | 					vec_b[i] = vec_x;
110 | 					pivot[j] = pivot[i];
111 | 					pivot[i] = i;
112 | 				}
113 | 			}
114 | 		}
115 | 	}
116 | 
117 | 	return 0;
118 | }
119 | 
120 | int main()
121 | {
122 | 	int i, j, dim;
123 | 	int *pivot, info;
124 | 
125 | 	double *mat_a, *vec_b, *vec_x;
126 | 	double alpha, beta;
127 | 	double running_time;
128 | 
129 | 	// input dimension of linear equation to be solved
130 | 	printf("Dim = "); scanf("%d", &dim);
131 | 
132 | 	if(dim <= 0)
133 | 	{
134 | 		printf("Illegal dimension! (dim = %d)\n", dim);
135 | 		return EXIT_FAILURE;
136 | 	}
137 | 
138 | 	// initialize a matrix and vectors
139 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
140 | 	vec_x = (double *)calloc(dim, sizeof(double));
141 | 	vec_b = (double *)calloc(dim, sizeof(double));
142 | 
143 | 	// input mat_a and vec_x
144 | 	for(i = 0; i < dim; i++)
145 | 	{
146 | 		for(j = 0; j < dim; j++)
147 | 		{
148 | 			mat_a[i * dim + j] = 1.0 / (double)(i + j + 1);
149 | 			if((i + j + 1) % 2 != 0)
150 | 				mat_a[i * dim + j] = -mat_a[i * dim + j];
151 | 		}
152 | 		mat_a[i * dim + i] += 2.0;
153 | 		vec_x[i] = 1.0 / (double)(i + 1);
154 | 	}
155 | 
156 | 	// vec_b := mat_a * vec_x
157 | 	my_matvec_mul(vec_b, mat_a, dim, dim, vec_x);
158 | 
159 | 	// print
160 | 	for(i = 0; i < dim; i++)
161 | 	{
162 | 		printf("[");
163 | 		for(j = 0; j < dim; j++)
164 | 			printf("%10.3f ", mat_a[i * dim + j]);
165 | 		printf("]  %10.3f = %10.3f\n", vec_x[i], vec_b[i]);
166 | 	}
167 | 
168 | 	// initialize pivot vector
169 | 	pivot = (int *)calloc(sizeof(int), dim);
170 | 
171 | 	// solve A * x = b -> b := x
172 | 	info = my_linear_eq_solve(mat_a, dim, pivot, vec_b);
173 | 
174 | 	printf("info = %d\n", info);
175 | 
176 | 	// print
177 | 	printf("calculated x = \n");
178 | 	for(i = 0; i < dim; i++)
179 | 	{
180 | 		printf("%3d -> %3d: ", i, pivot[i]);
181 | 		printf("%25.17e ", vec_b[pivot[i]]);
182 | 		printf("\n");
183 | 	}
184 | 
185 | 	// diff
186 | 	printf("x - calculated x = \n");
187 | 	for(i = 0; i < dim; i++)
188 | 	{
189 | 		printf("%3d: ", i);
190 | 		printf("%10.2e ", fabs((vec_x[i] - vec_b[i]) / vec_x[i]));
191 | 		printf("\n");
192 | 	}
193 | 	
194 | 	// free
195 | 	free(mat_a);
196 | 	free(vec_x);
197 | 	free(vec_b);
198 | 	free(pivot);
199 | 
200 | 	return EXIT_SUCCESS;
201 | }
202 | 


--------------------------------------------------------------------------------
/my_linear_eq_omp.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/my_linear_eq_omp.c


--------------------------------------------------------------------------------
/my_matvec_mul.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Multiplication of matrix and vector           */
  4 | /*                                   with OpenMP */
  5 | /* Last Update: 2016-11-30 (Wed) T.Kouya         */
  6 | /*************************************************/
  7 | #include <stdio.h>
  8 | #include <stdlib.h>
  9 | #include <math.h>
 10 | 
 11 | // rowwise only
 12 | // my_matvec_mul: vec_b := mat_a * vec_x
 13 | #ifdef _OPENMP
 14 | void my_matvec_mul_omp(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x)
 15 | {
 16 | 	int i, j, row_index;
 17 | 
 18 | 	// parallelized main loop
 19 | 	#pragma omp parallel for private(j)
 20 | 	for(i = 0; i < row_dim; i++)
 21 | 	{
 22 | 		//printf("Thread No. %d:\n", omp_get_thread_num());
 23 | 		vec_b[i] = 0.0;
 24 | 		row_index = row_dim * i;
 25 | 
 26 | 		for(j = 0; j < col_dim; j++)
 27 | 			vec_b[i] += mat_a[row_index + j] * vec_x[j];
 28 | 	}
 29 | }
 30 | #endif // _OPENMP
 31 | 
 32 | // rowwise only
 33 | // my_matvec_mul: vec_b := mat_a * vec_x
 34 | void my_matvec_mul(double *vec_b, double *mat_a, int row_dim, int col_dim, double *vec_x)
 35 | {
 36 | 	int i, j, row_index;
 37 | 
 38 | 	// serial main loop
 39 | 	for(i = 0; i < row_dim; i++)
 40 | 	{
 41 | 		vec_b[i] = 0.0;
 42 | 		row_index = row_dim * i;
 43 | 
 44 | 		for(j = 0; j < col_dim; j++)
 45 | 			vec_b[i] += mat_a[row_index + j] * vec_x[j];
 46 | 	}
 47 | }
 48 | 
 49 | int main()
 50 | {
 51 | 	int i, j, dim;
 52 | 
 53 | 	double *mat_a, *vec_b, *vec_x;
 54 | 
 55 | 	// input dimension of square matrix and vector
 56 | 	printf("Dim = "); scanf("%d", &dim);
 57 | 
 58 | 	// input maximum number of threads
 59 | #ifdef _OPENMP
 60 | 	int num_threads;
 61 | 
 62 | 	printf("Max.Num.threads = "); scanf("%d", &num_threads);
 63 | 	omp_set_num_threads(num_threads);
 64 | 	printf("#threads = %d\n", omp_get_max_threads());
 65 | #endif // _OPENMP
 66 | 
 67 | 	if(dim <= 0)
 68 | 	{
 69 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 70 | 		return EXIT_FAILURE;
 71 | 	}
 72 | 
 73 | 	// initialize a matrix and vectors
 74 | 	mat_a = (double *)calloc(dim * dim, sizeof(double));
 75 | 	vec_x = (double *)calloc(dim, sizeof(double));
 76 | 	vec_b = (double *)calloc(dim, sizeof(double));
 77 | 
 78 | 	// input mat_a and vec_x
 79 | 	for(i = 0; i < dim; i++)
 80 | 	{
 81 | 		for(j = 0; j < dim; j++)
 82 | 		{
 83 | 			mat_a[i * dim + j] = (double)(i + j + 1);
 84 | 			if((i + j + 1) % 2 != 0)
 85 | 				mat_a[i * dim + j] *= -1.0;
 86 | 		}
 87 | 		vec_x[i] = 1.0 / (double)(i + 1);
 88 | 	}
 89 | 
 90 | 	// vec_b := mat_a * vec_x
 91 | #ifdef _OPENMP
 92 | 	my_matvec_mul_omp(vec_b, mat_a, dim, dim, vec_x);
 93 | #else
 94 | 	my_matvec_mul(vec_b, mat_a, dim, dim, vec_x);
 95 | #endif // _OPENMP
 96 | 
 97 | 	// print
 98 | 	for(i = 0; i < dim; i++)
 99 | 	{
100 | 		printf("[");
101 | 		for(j = 0; j < dim; j++)
102 | 			printf("%10.3lf ", mat_a[i * dim + j]);
103 | 		printf("]  %10.3lf = %10.3lf\n", vec_x[i], vec_b[i]);
104 | 	}
105 | 
106 | 	// free
107 | 	free(mat_a);
108 | 	free(vec_x);
109 | 	free(vec_b);
110 | 
111 | 	return EXIT_SUCCESS;
112 | }
113 | 


--------------------------------------------------------------------------------
/my_matvec_mul_pt.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/my_matvec_mul_pt.c


--------------------------------------------------------------------------------
/mycuda.c:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/mycuda.c


--------------------------------------------------------------------------------
/mycuda.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/mycuda.h


--------------------------------------------------------------------------------
/mycuda_daxpy.cu:
--------------------------------------------------------------------------------
  1 | /********************************************************************************/
  2 | /* mycuda_daxpy : Original DAXPY based on CUDA                                  */
  3 | /* Copyright (C) 2015 Tomonori Kouya                                            */
  4 | /*                                                                              */
  5 | /* This program is free software: you can redistribute it and/or modify it      */
  6 | /* under the terms of the GNU Lesser General Public License as published by the */
  7 | /* Free Software Foundation, either version 3 of the License or any later       */
  8 | /* version.                                                                     */
  9 | /*                                                                              */
 10 | /* This program is distributed in the hope that it will be useful, but WITHOUT  */
 11 | /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or        */
 12 | /* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License */
 13 | /* for more details.                                                            */
 14 | /*                                                                              */
 15 | /* You should have received a copy of the GNU Lesser General Public License     */
 16 | /* along with this program.  If not, see <http://www.gnu.org/licenses/>.        */
 17 | /*                                                                              */
 18 | /********************************************************************************/
 19 | #include <stdio.h>
 20 | 
 21 | #include "cblas.h"
 22 | #include "mycuda.h"
 23 | #include "tkaux.h"
 24 | 
 25 | // y := alpha * x + y
 26 | __global__ void  mycuda_daxpy_kernel (int dim, double *ptr_alpha, double x[], int x_step_dim, double y[], int y_step_dim)
 27 | {
 28 | 	int k = blockIdx.x * blockDim.x + threadIdx.x;
 29 | 
 30 | 	int x_index = k * x_step_dim;
 31 | 	int y_index = k * y_step_dim;
 32 | 
 33 | 	if ((x_index < dim) && (y_index < dim))
 34 | 	{
 35 | 		y[y_index] = *ptr_alpha * x[x_index] + y[y_index];
 36 | 	}
 37 | }
 38 | 
 39 | // Maximum threads per a block
 40 | #define MAX_NUM_THREADS_CUDA 8
 41 | 
 42 | // c := a * b
 43 | void  mycuda_daxpy(int dim, double *dev_alpha, double dev_x[], int x_step_dim, double dev_y[], int y_step_dim)
 44 | {
 45 | 	dim3 threads(MAX_NUM_THREADS_CUDA);
 46 | 	dim3 blocks(1);
 47 | 
 48 | 	threads.x = (dim > MAX_NUM_THREADS_CUDA) ? MAX_NUM_THREADS_CUDA : dim;
 49 | 
 50 | 	blocks.x = (dim > MAX_NUM_THREADS_CUDA) ? (dim / MAX_NUM_THREADS_CUDA) + 1 : 1;
 51 | 
 52 | 	printf("Threads (x): %d\n", threads.x);
 53 | 	printf("Blocks  (x): %d\n", blocks.x);
 54 | 
 55 | 	mycuda_daxpy_kernel<<<blocks, threads>>>(dim, dev_alpha, dev_x, x_step_dim, dev_y, y_step_dim);
 56 | }
 57 | 
 58 | int main()
 59 | {
 60 | 	int i, dim;
 61 | 	double host_alpha, *host_x, *host_y; // on CPU
 62 | 	double *dev_alpha, *dev_x, *dev_y;   // on GPU
 63 | 
 64 | 	printf("dim = "); scanf("%d", &dim);
 65 | 
 66 | 	host_x = (double *)calloc(dim, sizeof(double));
 67 | 	host_y = (double *)calloc(dim, sizeof(double));
 68 | 
 69 | 	// alpha = sqrt(2)
 70 | 	// x[i] = sqrt(2) * i
 71 | 	// y[i] = sqrt(3) * (dim - i)
 72 | 
 73 | 	host_alpha = sqrt(2.0);
 74 | 	for(i = 0; i < dim; i++)
 75 | 	{
 76 | 		host_x[i] = sqrt(2.0) * (double)(i + 1);
 77 | 		host_y[i] = sqrt(3.0) * (double)(dim - i);
 78 | 	}
 79 | 
 80 | 	// host to device
 81 | 	dev_alpha = (double *)mycuda_calloc(1, sizeof(double));
 82 | 	dev_x = (double *)mycuda_calloc(dim, sizeof(double));
 83 | 	dev_y = (double *)mycuda_calloc(dim, sizeof(double));
 84 | 
 85 | 	// host_x -> dev_x
 86 | 	// host_y -> dev_y
 87 | 	cudaMemcpy((void *)dev_alpha, (void *)&host_alpha, sizeof(double), cudaMemcpyHostToDevice);
 88 | 	cudaMemcpy((void *)dev_x, (void *)host_x, dim * sizeof(double), cudaMemcpyHostToDevice);
 89 | 	cudaMemcpy((void *)dev_y, (void *)host_y, dim * sizeof(double), cudaMemcpyHostToDevice);
 90 | 
 91 | 	// y := alpha * x + y on CPU
 92 | 	cblas_daxpy(dim, host_alpha, host_x, 1, host_y, 1);
 93 | 
 94 | 	printf_dvector("%d %25.17e\n", host_y, dim, 1);
 95 | 
 96 | 	// y := alpha * x + y on GPU
 97 | 	mycuda_daxpy(dim, dev_alpha, dev_x, 1, dev_y, 1);
 98 | 
 99 | 	// dev_y -> host_x
100 | 	cudaMemcpy((void *)host_x, (void *)dev_y, dim * sizeof(double), cudaMemcpyDeviceToHost);
101 | 
102 | 	printf_dvector("%d %25.17e\n", host_x, dim, 1);
103 | 
104 | 	mycuda_free(dev_x);
105 | 	mycuda_free(dev_y);
106 | 
107 | 	free(host_x);
108 | 	free(host_y);
109 | 
110 | 	return 0;
111 | }
112 | 


--------------------------------------------------------------------------------
/power_eig.c:
--------------------------------------------------------------------------------
  1 | /*************************************************/
  2 | /* LAPACK/BLAS Tutorial                          */
  3 | /* Power Method for max eigenvalue & eigenvetor  */
  4 | /* Last Update: 2015-02-20 (Fri) T.Kouya         */
  5 | /*************************************************/
  6 | #include <stdio.h>
  7 | #include <stdlib.h>
  8 | #include <math.h>
  9 | #include "cblas.h"
 10 | 
 11 | // relative difference
 12 | double reldiff_dvector(double *vec1, double *vec2, int dim)
 13 | {
 14 | 	double *tmp_vec;
 15 | 	double ret, norm;
 16 | 
 17 | 	tmp_vec = (double *)calloc(dim, sizeof(double));
 18 | 
 19 | 	cblas_dcopy(dim, vec1, 1, tmp_vec, 1);
 20 | 	cblas_daxpy(dim, -1.0, vec2, 1, tmp_vec, 1);
 21 | 	ret = cblas_dnrm2(dim, tmp_vec, 1);
 22 | 	norm = cblas_dnrm2(dim, vec1, 1);
 23 | 
 24 | 	if(norm != 0.0)
 25 | 		ret /= norm;
 26 | 
 27 | 	return ret;
 28 | }
 29 | 
 30 | int main()
 31 | {
 32 | 	int i, j, dim, itimes;
 33 | 	double *ma, *md, *vy, *vx, *vx_true, *vx_new, *vb, eig, eig_old, y_norm;
 34 | 	double reps, aeps;
 35 | 
 36 | 	// input dimension
 37 | 	printf("Dim = "); scanf("%d", &dim);
 38 | 
 39 | 	if(dim <= 0)
 40 | 	{
 41 | 		printf("Illegal dimension! (dim = %d)\n", dim);
 42 | 		return EXIT_FAILURE;
 43 | 	}
 44 | 
 45 | 	// Initialize
 46 | 	ma = (double *)calloc(dim * dim, sizeof(double));
 47 | 	vy = (double *)calloc(dim, sizeof(double));
 48 | 	vx = (double *)calloc(dim, sizeof(double));
 49 | 
 50 | 	// input ma
 51 | 	for(i = 0; i < dim; i++)
 52 | 	{
 53 | 		for(j = 0; j < dim; j++)
 54 | 			ma[i * dim + j] = sqrt(2.0) * (double)(i + j + 1);
 55 | 		ma[i * dim + i] = sqrt(2.0) * dim * dim;
 56 | 	}
 57 | 
 58 | 	reps = 1.0e-10;
 59 | 	aeps = 0.0;
 60 | 
 61 | 
 62 | 	// vx := [1 1 ... 1] / sqrt(dim)
 63 | 	for(i = 0; i < dim; i++)
 64 | 		vx[i] = 1.0 / sqrt((double)dim);
 65 | 
 66 | 	eig_old = eig = 0.0;
 67 | 
 68 | 	// Power Method
 69 | 	for(itimes = 0; itimes < dim * 10; itimes++)
 70 | 	{
 71 | 		// y := A * x
 72 | 		cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, ma, dim, vx, 1, 0.0, vy, 1);
 73 | 		//for(i = 0; i < dim; i++)
 74 | 		//	printf("%3d %15.7e %15.7e\n", i, vx[i], vy[i]);
 75 | 		//printf("\n");
 76 | 
 77 | 		// eig = (A * x, x) / (x, x) = (A * x, x)
 78 | 		eig = cblas_ddot(dim, vy, 1, vx, 1);
 79 | 
 80 | 		// |eig_old - eig | <= reps * |eig| + aeps
 81 | 		if(fabs(eig_old - eig) <= reps * fabs(eig) + aeps)
 82 | 			break;
 83 | 
 84 | 		printf("%3d %25.17e\n", itimes, eig);
 85 | 
 86 | 		eig_old = eig;
 87 | 
 88 | 		// x := y / ||y||
 89 | 		y_norm = 1.0 / cblas_dnrm2(dim, vy, 1);
 90 | 		cblas_dscal(dim, y_norm, vy, 1);
 91 | 		cblas_dcopy(dim, vy, 1, vx, 1);
 92 | 	}
 93 | 
 94 | 	// eig * x == A * x?
 95 | 	cblas_dgemv(CblasRowMajor, CblasNoTrans, dim, dim, 1.0, ma, dim, vx, 1, 0.0, vy, 1);
 96 | 	cblas_dscal(dim, eig, vx, 1);
 97 | 
 98 | 	// print
 99 | 	printf("Iterative Times = %d\n", itimes);
100 | 	printf("Max. Eigenvalue = %25.17e\n", eig);
101 | 	printf("Rel.Diff = %10.3e\n", reldiff_dvector(vx, vy, dim));
102 | 
103 | 	for(i = 0; i < dim; i++)
104 | 	{
105 | 		printf("%3d %25.17e %25.17e\n", i, eig * vx[i], vy[i]);
106 | 	}
107 | 
108 | 	// free
109 | 	free(ma);
110 | 	free(vy);
111 | 	free(vx);
112 | 
113 | 
114 | 	return EXIT_SUCCESS;
115 | }
116 | 


--------------------------------------------------------------------------------
/readme.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tkouya/lapack_blas_tutorial/a7d62bb465c5458ee9299fa84d628f462c0d65c6/readme.txt


--------------------------------------------------------------------------------
/row_column_major.c:
--------------------------------------------------------------------------------
 1 | /*************************************************/
 2 | /* LAPACK/BLAS Tutorial                          */
 3 | /* Row-major and column-major matrices           */
 4 | /* Last Update: 2016-12-01 (Thu) T.Kouya         */
 5 | /*************************************************/
 6 | #include <stdio.h>
 7 | #include <stdlib.h>
 8 | #include <math.h>
 9 | 
10 | #include "cblas.h"
11 | 
12 | int main()
13 | {
14 | 	int i, j, row_dim, col_dim;
15 | 
16 | 	double *mat_a;
17 | 
18 | 	// input dimension
19 | 	printf("Row    Dim = "); scanf("%d", &row_dim);
20 | 	printf("Column Dim = "); scanf("%d", &col_dim);
21 | 
22 | 	if((row_dim <= 0) || (col_dim <= 0))
23 | 	{
24 | 		printf("Illegal dimension! (row_dim = %d, col_dim = %d)\n", row_dim, col_dim);
25 | 		return EXIT_FAILURE;
26 | 	}
27 | 
28 | 	// initialize matrix area
29 | 	mat_a = (double *)calloc(row_dim * col_dim, sizeof(double));
30 | 
31 | 	printf("Row Major: %d x %d\n", row_dim, col_dim);
32 | 
33 | 	// Row Major
34 | 	// mat_a = A
35 | 	// A = [1   2   ....... n]
36 | 	//     [n+1 n+2 ...... 2n]
37 | 	//     [.................]
38 | 	//     [(m-1)n+1 ..... mn]
39 | 	for(i = 0; i < row_dim; i++)
40 | 	{
41 | 		for(j = 0; j < col_dim; j++)
42 | 			mat_a[i * col_dim + j] = (double)(i * col_dim + j + 1);
43 | 	}
44 | 
45 | 	// print (1)
46 | 	printf("1 dimensional: \n");
47 | 	printf("[");
48 | 	for(i = 0; i < row_dim * col_dim; i++)
49 | 		printf(" %6.3lf ", mat_a[i]);
50 | 	printf("]\n");
51 | 
52 | 	// print (2)
53 | 	printf("2 dimensional: \n");
54 | 	for(i = 0; i < row_dim; i++)
55 | 	{
56 | 		printf("[");
57 | 		for(j = 0; j < col_dim; j++)
58 | 			printf(" %6.3lf ", mat_a[i * col_dim + j]);
59 | 		printf("]\n");
60 | 	}
61 | 
62 | 	printf("Column Major: %d x %d\n", row_dim, col_dim);
63 | 
64 | 	// Column Major
65 | 	// mat_a = A
66 | 	// A = [1   2   ....... n]
67 | 	//     [n+1 n+2 ...... 2n]
68 | 	//     [.................]
69 | 	//     [(m-1)n+1 ..... mn]
70 | 	for(j = 0; j < col_dim; j++)
71 | 	{
72 | 		for(i = 0; i < row_dim; i++)
73 | 			mat_a[i + row_dim * j] = (double)(i * col_dim + j + 1);
74 | 	}
75 | 
76 | 	// print (1)
77 | 	printf("1 dimension: \n");
78 | 	printf("[");
79 | 	for(i = 0; i < row_dim * col_dim; i++)
80 | 		printf(" %6.3lf ", mat_a[i]);
81 | 	printf("]\n");
82 | 
83 | 	// print (2)
84 | 	printf("2 dimension: \n");
85 | 	for(i = 0; i < row_dim; i++)
86 | 	{
87 | 		printf("[");
88 | 		for(j = 0; j < col_dim; j++)
89 | 			printf(" %6.3lf ", mat_a[i + row_dim * j]);
90 | 		printf("]\n");
91 | 	}
92 | 
93 | 	// free
94 | 	free(mat_a);
95 | 
96 | 	return EXIT_SUCCESS;
97 | }
98 | 


--------------------------------------------------------------------------------
/tkaux.c:
--------------------------------------------------------------------------------
  1 | /********************************************************************************/
  2 | /* tkaux.c : Auxiliary Routines for textbook "LAPACK/BLAS Tutorial"             */
  3 | /* Copyright (C) 2015 Tomonori Kouya                                            */
  4 | /*                                                                              */
  5 | /* This program is free software: you can redistribute it and/or modify it      */
  6 | /* under the terms of the GNU Lesser General Public License as published by the */
  7 | /* Free Software Foundation, either version 3 of the License or any later       */
  8 | /* version.                                                                     */
  9 | /*                                                                              */
 10 | /* This program is distributed in the hope that it will be useful, but WITHOUT  */
 11 | /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or        */
 12 | /* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License */
 13 | /* for more details.                                                            */
 14 | /*                                                                              */
 15 | /* You should have received a copy of the GNU Lesser General Public License     */
 16 | /* along with this program.  If not, see <http://www.gnu.org/licenses/>.        */
 17 | /*                                                                              */
 18 | /********************************************************************************/
 19 | #include <stdio.h>
 20 | #include <stdlib.h>
 21 | #include <math.h>
 22 | 
 23 | #include "tkaux.h"
 24 | 
 25 | // printf_dvector -> printf(format, index, dvec[index])
 26 | void printf_dvector(const char *format, double *dvec, int dim_dvec, int inc_dvec)
 27 | {
 28 | 	int index;
 29 | 
 30 | 	for(index = 0; index < dim_dvec; index += inc_dvec)
 31 | 		printf(format, index, dvec[index]);
 32 | }
 33 | 
 34 | // printf_dvecto2r -> printf(format, index, dvec1[index], dvec2[index])
 35 | void printf_dvector2(const char *format, double *dvec1, double *dvec2, int dim_dvec, int inc_dvec)
 36 | {
 37 | 	int index;
 38 | 
 39 | 	for(index = 0; index < dim_dvec; index += inc_dvec)
 40 | 		printf(format, index, dvec1[index], dvec2[index]);
 41 | }
 42 | 
 43 | // zero clear
 44 | void set0_dvector(double *dvec, int dim_dvec, int inc_dvec)
 45 | {
 46 | 	int i;
 47 | 
 48 | 	for(i = 0; i < dim_dvec; i += inc_dvec)
 49 | 		dvec[i] = 0.0;
 50 | }
 51 | 
 52 | // zero clear
 53 | void set0_dmatrix(double *dmat, int row_dim, int col_dim)
 54 | {
 55 | 	int index;
 56 | 
 57 | 	for(index = 0; index < row_dim * col_dim; index++)
 58 | 		dmat[index] = 0.0;
 59 | }
 60 | 
 61 | // zero clear
 62 | void set0_zmatrix(double complex *zmat, int row_dim, int col_dim)
 63 | {
 64 | 	int index;
 65 | 
 66 | 	for(index = 0; index < row_dim * col_dim; index++)
 67 | 		zmat[index] = 0.0 + 0.0 * I;
 68 | }
 69 | 
 70 | // Column Major <- Row major
 71 | // colm_mat <- rowm_mat
 72 | void row2col_dmatrix(double *colm_mat, int colm_row_dim, int colm_col_dim, double *rowm_mat)
 73 | {
 74 | 	int i, j;
 75 | 
 76 | 	for(j = 0; j < colm_col_dim; j++)
 77 | 	{
 78 | 		for(i = 0; i < colm_row_dim; i++)
 79 | 			colm_mat[i + j * colm_row_dim] = rowm_mat[i * colm_col_dim + j];
 80 | 			//printf("%d <= %d\n", i + j * colm_row_dim, i * colm_col_dim + j);
 81 | 	}
 82 | }
 83 | 
 84 | // Row Major <- Column major
 85 | // colm_mat <- rowm_mat
 86 | void col2row_dmatrix(double *rowm_mat, int rowm_row_dim, int rowm_col_dim, double *colm_mat)
 87 | {
 88 | 	int i, j;
 89 | 
 90 | 	for(i = 0; i < rowm_row_dim; i++)
 91 | 	{
 92 | 		for(j = 0; j < rowm_col_dim; j++)
 93 | 			rowm_mat[i * rowm_col_dim + j] = colm_mat[i + j * rowm_row_dim];
 94 | 		//	printf("%d <= %d\n", i * rowm_col_dim + j, i + rowm_row_dim * j);
 95 | 	}
 96 | }
 97 | 
 98 | // norm elative error 
 99 | // || dvec_err - dvec_true ||_2 / || dvec_true ||_2
100 | double dreldiff_dvector(double *dvec_err, int dim_dvec, int inc_dvec_err, double *dvec_true, int inc_dvec_true)
101 | {
102 | 	double reldiff, norm_dvec_true, *diff_vec;
103 | 
104 | 	diff_vec = (double *)calloc(dim_dvec, sizeof(double));
105 | 
106 | 	// diff_vec := -dvec_err + dvec_true
107 | 	cblas_dcopy(dim_dvec, dvec_err, 1, diff_vec, 1);
108 | 	cblas_daxpy(dim_dvec, -1.0, dvec_true,  inc_dvec_true, diff_vec, 1);
109 | 	reldiff = cblas_dnrm2(dim_dvec, diff_vec, 1);
110 | 	norm_dvec_true = cblas_dnrm2(dim_dvec, dvec_true, 1);
111 | 	if(norm_dvec_true > 0.0)
112 | 		reldiff /= norm_dvec_true;
113 | 
114 | 	free(diff_vec);
115 | 
116 | 	return reldiff;
117 | }
118 | 


--------------------------------------------------------------------------------
/tkaux.h:
--------------------------------------------------------------------------------
 1 | /********************************************************************************/
 2 | /* tkaux.h : Auxiliary Routines for textbook "LAPACK/BLAS Tutorial"             */
 3 | /* Copyright (C) 2015 Tomonori Kouya                                            */
 4 | /*                                                                              */
 5 | /* This program is free software: you can redistribute it and/or modify it      */
 6 | /* under the terms of the GNU Lesser General Public License as published by the */
 7 | /* Free Software Foundation, either version 3 of the License or any later       */
 8 | /* version.                                                                     */
 9 | /*                                                                              */
10 | /* This program is distributed in the hope that it will be useful, but WITHOUT  */
11 | /* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or        */
12 | /* FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License */
13 | /* for more details.                                                            */
14 | /*                                                                              */
15 | /* You should have received a copy of the GNU Lesser General Public License     */
16 | /* along with this program.  If not, see <http://www.gnu.org/licenses/>.        */
17 | /*                                                                              */
18 | /********************************************************************************/
19 | #include <stdio.h>
20 | #include <math.h>
21 | #include <complex.h>
22 | 
23 | #ifndef USE_IMKL
24 | 	#include "cblas.h"
25 | #endif // USE_IMKL
26 | 
27 | #ifndef __TKAUX_H__
28 | 
29 | #define __TKAUX_H__
30 | 
31 | // macros
32 | #define DMAX(a, b) (((a) > (b)) ? (a) : (b))
33 | #define DMIN(a, b) (((a) < (b)) ? (a) : (b))
34 | 
35 | #ifdef __cplusplus
36 | extern "C" {
37 | #endif
38 | 
39 | // printf_dvector(format, dvec, dim, interval -> printf(format, index, dvec[index + interval])
40 | void printf_dvector(const char *, double *, int, int);
41 | 
42 | // printf_dvecto2r -> printf(format, index, dvec1[index], dvec2[index])
43 | void printf_dvector2(const char *, double *, double *, int, int);
44 | 
45 | // zero clear
46 | void set0_dvector(double *, int, int);
47 | 
48 | // zero clear
49 | void set0_dmatrix(double *, int, int);
50 | 
51 | // zero clear
52 | void set0_zmatrix(double complex *, int, int);
53 | 
54 | // Column Major <- Row major
55 | // colm_mat <- rowm_mat
56 | void row2col_dmatrix(double *, int, int, double *);
57 | 
58 | // Row Major <- Column major
59 | // colm_mat <- rowm_mat
60 | void col2row_dmatrix(double *, int, int, double *);
61 | 
62 | // norm relative error (2-norm)
63 | // || dvec_err - dvec_true ||_2 / || dvec_true ||_2
64 | double dreldiff_dvector(double *, int, int, double *, int);
65 | 
66 | #ifdef __cplusplus
67 | }
68 | #endif
69 | 
70 | #endif // __TKAUX_H__
71 | 


--------------------------------------------------------------------------------