├── .gitignore
├── CMakeLists.txt
├── README.md
├── Test
│   ├── CMakeLists.txt
│   ├── cholesky.cpp
│   ├── conv2d.cpp
│   ├── dense.cpp
│   ├── eigensymm.cpp
│   ├── expression_optimizer.cpp
│   ├── general_solve.cpp
│   ├── getrf.cpp
│   ├── hip_cholesky.cpp
│   ├── hip_copy.cpp
│   ├── hip_dense.cpp
│   ├── hip_matrix_assign.cpp
│   ├── hip_matrix_expression.cpp
│   ├── hip_prod.cpp
│   ├── hip_random.cpp
│   ├── hip_syrk.cpp
│   ├── hip_triangular_prod.cpp
│   ├── hip_triangular_solve.cpp
│   ├── hip_vector_assign.cpp
│   ├── hip_vector_expression.cpp
│   ├── hip_vector_set_expression.cpp
│   ├── iterators.cpp
│   ├── matrix_assign.cpp
│   ├── matrix_expression.cpp
│   ├── matrix_proxy.cpp
│   ├── opencl_cholesky.cpp
│   ├── opencl_conv2d.cpp
│   ├── opencl_copy.cpp
│   ├── opencl_dense.cpp
│   ├── opencl_matrix_assign.cpp
│   ├── opencl_matrix_expression.cpp
│   ├── opencl_prod.cpp
│   ├── opencl_random.cpp
│   ├── opencl_syrk.cpp
│   ├── opencl_triangular_prod.cpp
│   ├── opencl_triangular_solve.cpp
│   ├── opencl_vector_assign.cpp
│   ├── opencl_vector_expression.cpp
│   ├── opencl_vector_set_expression.cpp
│   ├── prod.cpp
│   ├── random.cpp
│   ├── sparse.cpp
│   ├── symm_solve.cpp
│   ├── syrk.cpp
│   ├── triangular_matrix.cpp
│   ├── triangular_prod.cpp
│   ├── triangular_solve.cpp
│   ├── vector_assign.cpp
│   ├── vector_expression.cpp
│   └── vector_set_expression.cpp
├── doc
│   ├── CMakeLists.txt
│   ├── sphinx_pages
│   │   ├── conf.py.in
│   │   ├── index.tut
│   │   └── quick_ref.rst
│   └── tutToRst
│       ├── CMakeLists.txt
│       └── tut2rst.cpp
├── examples
│   ├── Benchmarks
│   │   ├── Timer.hpp
│   │   ├── conv2d.cpp
│   │   ├── gemm.cpp
│   │   ├── opencl_conv2d.cpp
│   │   ├── potrf.cpp
│   │   ├── syrk.cpp
│   │   └── trmm.cpp
│   ├── CMakeLists.txt
│   ├── createExampleSource.cmake
│   └── linear_regression.tpp
└── include
    ├── CMakeLists.txt
    └── remora
        ├── assignment.hpp
        ├── cpu
        │   ├── dense.hpp
        │   ├── iterator.hpp
        │   ├── sparse.hpp
        │   ├── sparse_matrix.hpp
        │   └── traits.hpp
        ├── decompositions.hpp
        ├── dense.hpp
        ├── detail
        │   ├── check.hpp
        │   ├── evaluation_tags.hpp
        │   ├── expression_optimizers.hpp
        │   ├── matrix_expression_classes.hpp
        │   ├── proxy_optimizers_fwd.hpp
        │   ├── storage.hpp
        │   ├── structure.hpp
        │   ├── traits.hpp
        │   ├── vector_expression_classes.hpp
        │   └── vector_set.hpp
        ├── device_copy.hpp
        ├── expression_types.hpp
        ├── hip
        │   ├── buffer.hpp
        │   ├── copy.hpp
        │   ├── cublas_backend.hpp
        │   ├── dense.hpp
        │   ├── device.hpp
        │   ├── exception.hpp
        │   └── traits.hpp
        ├── io.hpp
        ├── kernels
        │   ├── atlas
        │   │   └── potrf.hpp
        │   ├── cblas
        │   │   ├── cblas_inc.hpp
        │   │   ├── dense_gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── syrk.hpp
        │   │   ├── tpmv.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   └── trsv.hpp
        │   ├── clBlast
        │   │   ├── conv2d.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── syrk.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   └── trsv.hpp
        │   ├── conv2d.hpp
        │   ├── default
        │   │   ├── boost_align
        │   │   │   ├── aligned_alloc.hpp
        │   │   │   ├── aligned_allocator.hpp
        │   │   │   ├── assume_aligned.hpp
        │   │   │   └── detail
        │   │   │       ├── aligned_alloc.hpp
        │   │   │       ├── aligned_alloc_android.hpp
        │   │   │       ├── aligned_alloc_macos.hpp
        │   │   │       ├── aligned_alloc_msvc.hpp
        │   │   │       ├── aligned_alloc_posix.hpp
        │   │   │       ├── aligned_alloc_sunos.hpp
        │   │   │       ├── assume_aligned.hpp
        │   │   │       ├── assume_aligned_clang.hpp
        │   │   │       ├── assume_aligned_gcc.hpp
        │   │   │       ├── assume_aligned_intel.hpp
        │   │   │       ├── assume_aligned_msvc.hpp
        │   │   │       ├── is_alignment.hpp
        │   │   │       ├── is_alignment_constant.hpp
        │   │   │       ├── max_objects.hpp
        │   │   │       └── max_size.hpp
        │   │   ├── conv2d.hpp
        │   │   ├── dense_gemm.hpp
        │   │   ├── dot.hpp
        │   │   ├── fold_rows.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── getrf.hpp
        │   │   ├── matrix_assign.hpp
        │   │   ├── mgemm.hpp
        │   │   ├── potrf.hpp
        │   │   ├── pstrf.hpp
        │   │   ├── random.hpp
        │   │   ├── simd.hpp
        │   │   ├── syev.hpp
        │   │   ├── syrk.hpp
        │   │   ├── tpmv.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   ├── trsv.hpp
        │   │   ├── vector_assign.hpp
        │   │   ├── vector_fold.hpp
        │   │   └── vector_max.hpp
        │   ├── fold_rows.hpp
        │   ├── gemm.hpp
        │   ├── gemv.hpp
        │   ├── getrf.hpp
        │   ├── hip
        │   │   ├── fold_rows.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── matrix_assign.hpp
        │   │   ├── potrf.hpp
        │   │   ├── random.hpp
        │   │   ├── syrk.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   ├── trsv.hpp
        │   │   ├── vector_assign.hpp
        │   │   ├── vector_fold.hpp
        │   │   └── vector_max.hpp
        │   ├── lapack
        │   │   ├── fortran.hpp
        │   │   └── syev.hpp
        │   ├── matrix_assign.hpp
        │   ├── opencl
        │   │   ├── fold_rows.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── matrix_assign.hpp
        │   │   ├── potrf.hpp
        │   │   ├── random.hpp
        │   │   ├── syrk.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   ├── trsv.hpp
        │   │   ├── vector_assign.hpp
        │   │   ├── vector_fold.hpp
        │   │   └── vector_max.hpp
        │   ├── potrf.hpp
        │   ├── pstrf.hpp
        │   ├── random.hpp
        │   ├── syev.hpp
        │   ├── syrk.hpp
        │   ├── tpmv.hpp
        │   ├── trmm.hpp
        │   ├── trmv.hpp
        │   ├── trsm.hpp
        │   ├── trsv.hpp
        │   ├── vector_assign.hpp
        │   ├── vector_fold.hpp
        │   └── vector_max.hpp
        ├── matrix_expression.hpp
        ├── opencl
        │   ├── copy.hpp
        │   ├── dense.hpp
        │   └── traits.hpp
        ├── permutation.hpp
        ├── proxy_expressions.hpp
        ├── random.hpp
        ├── remora.hpp
        ├── solve.hpp
        ├── sparse.hpp
        ├── triangular_matrix.hpp
        └── vector_expression.hpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Remora BLAS Library
2 | ================================
3 | 
4 | Remora is a general-purpose linear algebra library written in C++.
5 | It features:
6 | 
7 | * Dense and sparse matrix and vector operations
8 | * A basic set of optimized routines for matrix products, solving linear systems of equations, etc.
9 | * Bindings to highly optimized routines of BLAS packages
10 | * A powerful expression template syntax which features algebraic optimizations of operations
11 | * (experimental and very early) GPU support via OpenCL
12 | 
13 | Remora is used by the Shark machine learning library.
14 | 
15 | Installation
16 | ---------------------------------------
17 | 
18 | Remora is header-only: just download and copy the contents of the include/ folder to its
19 | target location. Remora depends on the Boost C++ libraries.
20 | When using Remora, the following defines can be supplied at compile time:
21 | 
22 | * REMORA_USE_SIMD: if defined, Remora uses the compiler's auto-vectorization capabilities
23 | to speed up its computational routines. Requires g++ or clang.
24 | * REMORA_USE_CBLAS: if defined, Remora binds to a CBLAS library.
25 | On macOS, this flag is interpreted as using the Accelerate framework.
26 | Make sure to add the appropriate compile and linker flags for the library.
27 | * REMORA_USE_GPU: if defined, Remora enables GPU support via the Boost.Compute
28 | library. Highly experimental.
29 | * REMORA_USE_CLBLAST: if defined, Remora uses CLBlast as the GPU/OpenCL backend.
30 | This should be more stable and give better performance on most devices.
31 | 
32 | Contributing
33 | ----------------------------------------------------------
34 | Contributing is easy via [Pull Requests][1]. We are open
35 | to all types of contribution, but we favour them in the following order:
36 | 
37 | 1. Bug fixes, test cases, documentation, benchmarks, examples...
38 | 2. Optimizing existing computational routines found in include/Remora/kernels/
39 | 3. Adding new computational routines in include/Remora/kernels/
40 | 4. 
extending the expression template system 41 | 42 | The reason for this order is that extending the expression template system often leads 43 | to a large increase of possible operation combinations to cover, some of which might not 44 | be implemented in the kernel routines and thus leading to compile errors or inefficient code. 45 | We would like to prevent this by favouring the quality and number of underlying computational 46 | routines, which are very often easier to implement (and it is okay if those routines are very 47 | specialized). 48 | 49 | 50 | [1]: https://github.com/Shark-ML/Remora/pulls 51 | -------------------------------------------------------------------------------- /Test/eigensymm.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_eigensymm 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | using namespace remora; 9 | 10 | BOOST_AUTO_TEST_SUITE (Remora_eigensymm) 11 | 12 | matrix createSymm(std::size_t dimensions, std::size_t rank = 0){ 13 | if(rank == 0) rank = dimensions; 14 | matrix R(dimensions,dimensions,0.0); 15 | 16 | for(std::size_t i = 0; i != dimensions; ++i){ 17 | for(std::size_t j = 0; j A = prod(R,trans(R)); 24 | if(rank != dimensions){ 25 | for(std::size_t i = 0; i != rank/2; ++i){ 26 | A.swap_rows(2*i,dimensions-i-1); 27 | A.swap_columns(2*i,dimensions-i-1); 28 | } 29 | } 30 | return A; 31 | } 32 | 33 | BOOST_AUTO_TEST_CASE( Remora_eigensymm_decomposition) 34 | { 35 | std::size_t Dimensions = 123; 36 | matrix A = createSymm(Dimensions); 37 | 38 | symm_eigenvalue_decomposition > solver(A); 39 | 40 | matrix Atest = solver.Q() % to_diagonal(solver.D()) % trans(solver.Q()); 41 | BOOST_CHECK_SMALL(norm_inf(Atest-A),norm_inf(A) * 1.e-12); 42 | 43 | } 44 | 45 | BOOST_AUTO_TEST_CASE( Remora_eigensymm_solve ) 46 | { 47 | std::size_t Dimensions = 153; 48 | std::size_t K = 35; 49 | //first generate a suitable eigenvalue problem matrix A 50 | matrix A = createSymm(Dimensions); 51 | 52 | symm_eigenvalue_decomposition > solver(A); 53 | cholesky_decomposition > solver_cholesky(A); 54 | 55 | matrix B(Dimensions,K); 56 | for(std::size_t i = 0; i != K; ++i){ 57 | for(std::size_t j = 0; j != Dimensions; ++j){ 58 | B(j,i) = (1.0 + j+K)/Dimensions; 59 | } 60 | } 61 | 62 | vector b(Dimensions); 63 | for(std::size_t j = 0; j != Dimensions; ++j){ 64 | b(j) = (1.0 + j)/Dimensions; 65 | } 66 | 67 | { 68 | vector sol=b; 69 | vector sol2=b; 70 | solver.solve(sol,left()); 71 | solver_cholesky.solve(sol2,left()); 72 | BOOST_CHECK_SMALL(norm_2(sol - sol2), norm_2(sol2)*1.e-8); 73 | } 74 | { 75 | vector sol=b; 76 | vector sol2=b; 77 | solver.solve(sol,right()); 78 | solver_cholesky.solve(sol2,right()); 79 | BOOST_CHECK_SMALL(norm_2(sol - sol2), norm_2(sol2)*1.e-8); 80 | } 81 | 82 | { 83 | matrix sol=B; 84 | matrix sol2=B; 85 | solver.solve(sol,left()); 86 | solver_cholesky.solve(sol2,left()); 87 | BOOST_CHECK_SMALL(norm_inf(sol - sol2), norm_inf(sol2)*1.e-8); 88 | } 89 | { 90 | matrix sol=trans(B); 91 | matrix sol2=trans(B); 92 | solver.solve(sol,right()); 93 | solver_cholesky.solve(sol2,right()); 94 | BOOST_CHECK_SMALL(norm_frobenius(sol - sol2), norm_frobenius(sol2)*1.e-8); 95 | } 96 | } 97 | BOOST_AUTO_TEST_SUITE_END() 98 | -------------------------------------------------------------------------------- /Test/getrf.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Getrf 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 
| #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace remora; 14 | 15 | //the matrix is designed such that permutation will always give the next row 16 | matrix createMatrix(std::size_t dimensions){ 17 | matrix L(dimensions,dimensions,0.0); 18 | matrix U(dimensions,dimensions,0.0); 19 | 20 | for(std::size_t i = 0; i != dimensions; ++i){ 21 | for(std::size_t j = 0; j A = prod(L,U); 29 | return A; 30 | } 31 | typedef boost::mpl::list result_orientations; 32 | 33 | 34 | BOOST_AUTO_TEST_SUITE (Remora_Cholesky) 35 | 36 | BOOST_AUTO_TEST_CASE_TEMPLATE(Remora_Potrf, Orientation,result_orientations) { 37 | std::size_t Dimensions = 123; 38 | //first generate a suitable eigenvalue problem matrix A 39 | matrix A = createMatrix(Dimensions); 40 | //calculate lu decomposition 41 | permutation_matrix P(Dimensions); 42 | matrix dec = A; 43 | kernels::getrf(dec,P); 44 | 45 | //copy upper matrix to temporary 46 | matrix upper(Dimensions,Dimensions,0.0); 47 | for (size_t row = 0; row < Dimensions; row++){ 48 | for (size_t col = row; col < Dimensions ; col++){ 49 | upper(row, col) = dec(row, col); 50 | } 51 | } 52 | 53 | //create reconstruction of A 54 | matrix testA = triangular_prod(dec,upper); 55 | swap_rows_inverted(P,testA); 56 | 57 | //test reconstruction error 58 | double error = max(abs(A - testA)); 59 | BOOST_CHECK_SMALL(error,1.e-12); 60 | BOOST_CHECK(!(boost::math::isnan)(norm_frobenius(testA)));//test for nans 61 | } 62 | 63 | BOOST_AUTO_TEST_SUITE_END() 64 | -------------------------------------------------------------------------------- /Test/hip_cholesky.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Cholesky 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | using namespace remora; 12 | 13 | //the matrix is designed such that a lot of permutations will be performed 14 | matrix createSymm(std::size_t dimensions, std::size_t rank = 0){ 15 | if(rank == 0) rank = dimensions; 16 | matrix R(dimensions,dimensions,0.0); 17 | 18 | for(std::size_t i = 0; i != dimensions; ++i){ 19 | for(std::size_t j = 0; j A = prod(R,trans(R)); 26 | if(rank != dimensions){ 27 | for(std::size_t i = 0; i != rank/2; ++i){ 28 | A.swap_rows(2*i,dimensions-i-1); 29 | A.swap_columns(2*i,dimensions-i-1); 30 | } 31 | } 32 | return A; 33 | } 34 | BOOST_AUTO_TEST_SUITE (Remora_Cholesky) 35 | 36 | template 37 | void potrf_test(Orientation) { 38 | std::size_t Dimensions = 123; 39 | //first generate a suitable eigenvalue problem matrix A 40 | matrix A = createSymm(Dimensions); 41 | //calculate Cholesky 42 | matrix lowDec_opencl = copy_to_device(A, hip_tag()); 43 | matrix upDec_opencl = copy_to_device(A, hip_tag()); 44 | kernels::potrf(lowDec_opencl); 45 | kernels::potrf(upDec_opencl); 46 | matrix lowDec = copy_to_cpu(lowDec_opencl); 47 | matrix upDec = copy_to_cpu(upDec_opencl); 48 | matrix lowDec_test = A; 49 | matrix upDec_test = A; 50 | kernels::potrf(lowDec_test); 51 | kernels::potrf(upDec_test); 52 | //check that upper diagonal elements are correct and set them to zero 53 | for (size_t row = 0; row < Dimensions; row++){ 54 | for (size_t col =0; col < Dimensions ; col++){ 55 | BOOST_CHECK_CLOSE(lowDec(row, col), lowDec_test(row,col),1.e-12); 56 | BOOST_CHECK_CLOSE(upDec(row, col), upDec_test(row,col),1.e-12); 57 | } 58 | } 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_CASE(Remora_Potrf) { 63 | potrf_test(row_major()); 64 | potrf_test(column_major()); 65 | } 66 | 
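// Note: kernels::potrf factorizes in place; the loops above compare every
// entry of the device factorization against the CPU reference, so deviations
// in either triangle of the output are caught.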
BOOST_AUTO_TEST_SUITE_END() 67 | -------------------------------------------------------------------------------- /Test/hip_matrix_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_HIP_MatrixAssign 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | using namespace remora; 11 | 12 | template 13 | void checkMatrixEqual(M1 const& m1_opencl, M2 const& m2_opencl){ 14 | BOOST_REQUIRE_EQUAL(m1_opencl.size1(),m2_opencl.size1()); 15 | BOOST_REQUIRE_EQUAL(m1_opencl.size2(),m2_opencl.size2()); 16 | 17 | matrix m1 = copy_to_cpu(m1_opencl); 18 | matrix m2 = copy_to_cpu(m2_opencl); 19 | for(std::size_t i = 0; i != m2.size1(); ++i){ 20 | for(std::size_t j = 0; j != m2.size2(); ++j){ 21 | BOOST_CHECK_EQUAL(m1(i,j),m2(i,j)); 22 | } 23 | } 24 | } 25 | 26 | BOOST_AUTO_TEST_SUITE (Remora_opencl_matrix_assign) 27 | 28 | BOOST_AUTO_TEST_CASE( Remora_Matrix_Assign_Dense ){ 29 | std::cout<<"testing dense-dense assignment"< source_cpu(100,237); 31 | matrix target_cpu(100,237); 32 | matrix result_add_cpu(100,237); 33 | matrix result_add_scalar_cpu(100,237); 34 | float scalar = 10; 35 | for(std::size_t i = 0; i != 100; ++i){ 36 | for(std::size_t j = 0; j != 237; ++j){ 37 | source_cpu(i,j) = 2*i+1+0.3*j; 38 | target_cpu(i,j) = 3*i+2+0.3*j; 39 | result_add_cpu(i,j) = source_cpu(i,j) + target_cpu(i,j); 40 | result_add_scalar_cpu(i,j) = target_cpu(i,j) + scalar; 41 | } 42 | } 43 | matrix source = copy_to_device(source_cpu, hip_tag()); 44 | matrix source_cm = copy_to_device(source_cpu, hip_tag()); 45 | matrix result_add = copy_to_device(result_add_cpu, hip_tag()); 46 | matrix result_add_scalar = copy_to_device(result_add_scalar_cpu, hip_tag()); 47 | { 48 | std::cout<<"testing direct assignment row-row"< target = copy_to_device(target_cpu, hip_tag()); 50 | kernels::assign(target,source); 51 | checkMatrixEqual(target,source); 52 | } 53 | { 54 | std::cout<<"testing functor assignment row-row"< target = copy_to_device(target_cpu, hip_tag()); 56 | kernels::assign(target,source, device_traits::add()); 57 | checkMatrixEqual(target,result_add); 58 | } 59 | { 60 | std::cout<<"testing direct assignment row-column"< target = copy_to_device(target_cpu, hip_tag()); 62 | kernels::assign(target,source_cm); 63 | checkMatrixEqual(target,source_cm); 64 | } 65 | { 66 | std::cout<<"testing functor assignment row-column"< target = copy_to_device(target_cpu, hip_tag()); 68 | kernels::assign(target,source_cm, device_traits::add()); 69 | checkMatrixEqual(target,result_add); 70 | } 71 | { 72 | std::cout<<"testing functor scalar assignment"< target = copy_to_device(target_cpu, hip_tag()); 74 | kernels::assign::add >(target,scalar); 75 | checkMatrixEqual(target,result_add_scalar); 76 | } 77 | 78 | } 79 | 80 | BOOST_AUTO_TEST_SUITE_END() 81 | -------------------------------------------------------------------------------- /Test/hip_syrk.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_HIP_Syrk 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | #include 15 | using namespace remora; 16 | 17 | template 18 | void checkSyrk(M const& arg_opencl, Result const& result_opencl,double init, double alpha, bool upper){ 19 | BOOST_REQUIRE_EQUAL(arg_opencl.size1(), result_opencl.size1()); 20 | BOOST_REQUIRE_EQUAL(result_opencl.size1(), result_opencl.size2()); 
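// Note: syrk computes C = alpha * A * A^T + C, writing only the requested
// triangle of C. The branches below therefore expect every entry of the
// untouched triangle to still equal the initial value `init`, and every entry
// of the written triangle to equal alpha * inner_prod(row(arg,i), row(arg,j)) + init.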
21 | 22 | matrix arg = copy_to_cpu(arg_opencl); 23 | matrix result = copy_to_cpu(result_opencl); 24 | 25 | if(upper){ 26 | for(std::size_t i = 0; i != result.size1(); ++i) { 27 | for(std::size_t j = 0; j != result.size2(); ++j) { 28 | if(j < i){ 29 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 30 | }else{ 31 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 32 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 33 | } 34 | } 35 | } 36 | }else{ 37 | for(std::size_t i = 0; i != result.size1(); ++i) { 38 | for(std::size_t j = 0; j != result.size2(); ++j) { 39 | if(j > i){ 40 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 41 | }else{ 42 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 43 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 44 | } 45 | } 46 | } 47 | } 48 | } 49 | 50 | BOOST_AUTO_TEST_SUITE (Remora_HIP_Syrk) 51 | 52 | template 53 | void syrk_test(Orientation) { 54 | std::size_t dims = 936;//chosen as not to be a multiple of the block size 55 | std::size_t K = 1039; 56 | 57 | //rhs 58 | matrix arg_cpu(dims, K, 1.0); 59 | for(std::size_t i = 0; i != dims; ++i) { 60 | for(std::size_t j = 0; j != K; ++j) { 61 | arg_cpu(i, j) = (1.0/ dims) * i + 0.2/K * j + 1; 62 | } 63 | } 64 | 65 | matrix argrm = copy_to_device(arg_cpu, hip_tag()); 66 | matrix argcm = copy_to_device(arg_cpu, hip_tag()); 67 | 68 | std::cout << "\nchecking syrk V+=AA^T" << std::endl; 69 | { 70 | std::cout<<"row major A, lower V"< result(dims,dims,3.0); 72 | kernels::syrk(argrm,result, 2.0); 73 | checkSyrk(argrm,result, 3.0, 2.0,false); 74 | } 75 | { 76 | std::cout<<"row major A, upper V"< result(dims,dims,3.0); 78 | kernels::syrk(argrm,result, 2.0); 79 | checkSyrk(argrm,result, 3.0, 2.0,true); 80 | } 81 | { 82 | std::cout<<"column major A, lower V"< result(dims,dims,3.0); 84 | kernels::syrk(argcm,result, 2.0); 85 | checkSyrk(argrm,result, 3.0, 2.0,false); 86 | } 87 | { 88 | std::cout<<"column major A, upper V"< result(dims,dims,3.0); 90 | kernels::syrk(argcm,result, 2.0); 91 | checkSyrk(argrm,result, 3.0, 2.0,true); 92 | } 93 | } 94 | 95 | BOOST_AUTO_TEST_CASE(HIP_syrk){ 96 | syrk_test(row_major()); 97 | syrk_test(column_major()); 98 | } 99 | 100 | BOOST_AUTO_TEST_SUITE_END() 101 | -------------------------------------------------------------------------------- /Test/hip_vector_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_HIP_VectorAssign 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | using namespace remora; 11 | 12 | template 13 | void checkVectorEqual(V1 const& v1_opencl, V2 const& v2_opencl){ 14 | BOOST_REQUIRE_EQUAL(v1_opencl.size(),v2_opencl.size()); 15 | 16 | vector v1 = copy_to_cpu(v1_opencl); 17 | vector v2 = copy_to_cpu(v2_opencl); 18 | for(std::size_t i = 0; i != v2.size(); ++i){ 19 | BOOST_CHECK_EQUAL(v1(i),v2(i)); 20 | } 21 | } 22 | 23 | BOOST_AUTO_TEST_SUITE (Remora_HIP_vector_assign) 24 | 25 | BOOST_AUTO_TEST_CASE( Remora_Vector_Assign_Dense ){ 26 | std::cout<<"testing dense-dense assignment"< source_cpu(1000); 28 | vector target_cpu(1000); 29 | vector result_add_cpu(1000); 30 | vector result_add_scalar_cpu(1000); 31 | unsigned int scalar = 10; 32 | for(std::size_t i = 0; i != 1000; ++i){ 33 | source_cpu(i) = 2*i+1; 34 | target_cpu(i) = 3*i+2; 35 | result_add_cpu(i) = source_cpu(i) + target_cpu(i); 36 | result_add_scalar_cpu(i) = target_cpu(i) + scalar; 37 | } 38 | vector source = copy_to_device(source_cpu, hip_tag()); 39 | vector 
result_add = copy_to_device(result_add_cpu, hip_tag()); 40 | vector result_add_scalar = copy_to_device(result_add_scalar_cpu, hip_tag()); 41 | { 42 | std::cout<<"testing direct assignment"< target = copy_to_device(target_cpu, hip_tag()); 44 | kernels::assign(target,source); 45 | checkVectorEqual(target,source); 46 | } 47 | { 48 | std::cout<<"testing functor assignment"< target = copy_to_device(target_cpu, hip_tag()); 50 | kernels::assign(target,source, device_traits::add()); 51 | checkVectorEqual(target,result_add); 52 | } 53 | { 54 | std::cout<<"testing functor scalar assignment"< target = copy_to_device(target_cpu, hip_tag()); 56 | kernels::assign::add >(target,scalar); 57 | checkVectorEqual(target,result_add_scalar); 58 | } 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_SUITE_END() 63 | -------------------------------------------------------------------------------- /Test/opencl_cholesky.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Cholesky 2 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | using namespace remora; 15 | 16 | //the matrix is designed such that a lot of permutations will be performed 17 | matrix createSymm(std::size_t dimensions, std::size_t rank = 0){ 18 | if(rank == 0) rank = dimensions; 19 | matrix R(dimensions,dimensions,0.0); 20 | 21 | for(std::size_t i = 0; i != dimensions; ++i){ 22 | for(std::size_t j = 0; j A = prod(R,trans(R)); 29 | if(rank != dimensions){ 30 | for(std::size_t i = 0; i != rank/2; ++i){ 31 | A.swap_rows(2*i,dimensions-i-1); 32 | A.swap_columns(2*i,dimensions-i-1); 33 | } 34 | } 35 | return A; 36 | } 37 | typedef boost::mpl::list result_orientations; 38 | 39 | 40 | BOOST_AUTO_TEST_SUITE (Remora_Cholesky) 41 | 42 | BOOST_AUTO_TEST_CASE_TEMPLATE(Remora_Potrf, Orientation,result_orientations) { 43 | std::size_t Dimensions = 123; 44 | //first generate a suitable eigenvalue problem matrix A 45 | matrix A = createSymm(Dimensions); 46 | //calculate Cholesky 47 | matrix lowDec_opencl = copy_to_opencl(A); 48 | matrix upDec_opencl = copy_to_opencl(A); 49 | kernels::potrf(lowDec_opencl); 50 | kernels::potrf(upDec_opencl); 51 | matrix lowDec = copy_to_cpu(lowDec_opencl); 52 | matrix upDec = copy_to_cpu(upDec_opencl); 53 | matrix lowDec_test = A; 54 | matrix upDec_test = A; 55 | kernels::potrf(lowDec_test); 56 | kernels::potrf(upDec_test); 57 | //check that upper diagonal elements are correct and set them to zero 58 | for (size_t row = 0; row < Dimensions; row++){ 59 | for (size_t col =0; col < Dimensions ; col++){ 60 | BOOST_CHECK_CLOSE(lowDec(row, col), lowDec_test(row,col),1.e-12); 61 | BOOST_CHECK_CLOSE(upDec(row, col), upDec_test(row,col),1.e-12); 62 | } 63 | } 64 | 65 | } 66 | 67 | BOOST_AUTO_TEST_SUITE_END() 68 | -------------------------------------------------------------------------------- /Test/opencl_conv2d.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_Conv2d 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace remora; 12 | 13 | void test( 14 | std::size_t image_size1, std::size_t image_size2, 15 | std::size_t filter_size1, std::size_t filter_size2, 16 | std::size_t num_channels, 17 | std::size_t num_filters, 18 | std::size_t num_images, 19 | std::size_t padding_height = 0, 20 | std::size_t 
padding_width = 0 21 | ){ 22 | //create filter on CPU 23 | vector filter(num_channels * num_filters * filter_size1 * filter_size2); 24 | { 25 | std::size_t lin_elem = 0; 26 | for(std::size_t f = 0; f != num_filters; ++f){ 27 | for(std::size_t i = 0; i != filter_size1; ++i){ 28 | for(std::size_t j = 0; j != filter_size2; ++j){ 29 | for(std::size_t c = 0; c != num_channels; ++c, ++lin_elem){ 30 | double val = 1.0/(num_channels * filter_size1)*i + 0.1 - (0.1/filter_size2)*j+0.01*f-0.01*c; 31 | filter(lin_elem) = val; 32 | } 33 | } 34 | } 35 | } 36 | } 37 | 38 | //Create images on CPU 39 | matrix image(num_images, num_channels * image_size1 * image_size2); 40 | //create images and ground truth 41 | for(std::size_t im = 0; im != num_images; ++im){ 42 | std::size_t lin_elem = 0; 43 | for(std::size_t i = 0; i != image_size1; ++i){ 44 | for(std::size_t j = 0; j != image_size2; ++j){ 45 | for(std::size_t c = 0; c != num_channels; ++c, ++lin_elem){ 46 | image(im,lin_elem) = 1.0/(num_channels * image_size1)*i + 0.1 - (0.1/image_size2)*j; 47 | } 48 | } 49 | } 50 | } 51 | 52 | //copy to Device 53 | vector filter_opencl = copy_to_opencl(filter); 54 | matrix image_opencl = copy_to_opencl(image); 55 | 56 | //Reserve enough space for output 57 | 58 | std::size_t output_size1 = image_size1 - filter_size1 + 1 + padding_height; 59 | std::size_t output_size2 = image_size2 - filter_size2 + 1 + padding_width; 60 | matrix out(num_images, output_size1 * output_size2 * num_filters, 0.0); 61 | matrix out_opencl(num_images, output_size1 * output_size2 * num_filters, 0.0); 62 | 63 | 64 | //compute baseline and opencl result 65 | kernels::conv2d( 66 | image,filter,out,num_channels, num_filters, 67 | image_size1, image_size2, filter_size1, filter_size2, 68 | padding_height, padding_width 69 | ); 70 | 71 | kernels::conv2d( 72 | image_opencl,filter_opencl,out_opencl,num_channels, num_filters, 73 | image_size1, image_size2, filter_size1, filter_size2, 74 | padding_height, padding_width 75 | ); 76 | 77 | //copy result back and test 78 | matrix out_cpu = copy_to_cpu(out_opencl); 79 | 80 | for(std::size_t im = 0; im != num_images; ++im){ 81 | for(std::size_t k = 0; k != out.size2(); ++k){ 82 | BOOST_CHECK_CLOSE(out(im,k),out_cpu(im,k),1.e-2); 83 | } 84 | } 85 | } 86 | 87 | 88 | 89 | BOOST_AUTO_TEST_SUITE(Remora_Conv2d) 90 | 91 | BOOST_AUTO_TEST_CASE(conv2d_test) { 92 | test(32,16,4,8,5,1,1); 93 | test(16,12,4,8,4,4,3); 94 | test(57,33,7,3,22,15,3); 95 | } 96 | 97 | BOOST_AUTO_TEST_SUITE_END() 98 | -------------------------------------------------------------------------------- /Test/opencl_matrix_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_MatrixAssign 2 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace remora; 12 | 13 | template 14 | void checkMatrixEqual(M1 const& m1_opencl, M2 const& m2_opencl){ 15 | BOOST_REQUIRE_EQUAL(m1_opencl.size1(),m2_opencl.size1()); 16 | BOOST_REQUIRE_EQUAL(m1_opencl.size2(),m2_opencl.size2()); 17 | 18 | matrix m1 = copy_to_cpu(m1_opencl); 19 | matrix m2 = copy_to_cpu(m2_opencl); 20 | for(std::size_t i = 0; i != m2.size1(); ++i){ 21 | for(std::size_t j = 0; j != m2.size2(); ++j){ 22 | BOOST_CHECK_EQUAL(m1(i,j),m2(i,j)); 23 | } 24 | } 25 | } 26 | 27 | BOOST_AUTO_TEST_SUITE (Remora_opencl_matrix_assign) 28 | 29 | BOOST_AUTO_TEST_CASE( Remora_Matrix_Assign_Dense ){ 30 | std::cout<<"testing dense-dense 
assignment"< source_cpu(100,237); 32 | matrix target_cpu(100,237); 33 | matrix result_add_cpu(100,237); 34 | matrix result_add_scalar_cpu(100,237); 35 | float scalar = 10; 36 | for(std::size_t i = 0; i != 100; ++i){ 37 | for(std::size_t j = 0; j != 237; ++j){ 38 | source_cpu(i,j) = 2*i+1+0.3*j; 39 | target_cpu(i,j) = 3*i+2+0.3*j; 40 | result_add_cpu(i,j) = source_cpu(i,j) + target_cpu(i,j); 41 | result_add_scalar_cpu(i,j) = target_cpu(i,j) + scalar; 42 | } 43 | } 44 | matrix source = copy_to_opencl(source_cpu); 45 | matrix source_cm = copy_to_opencl(source_cpu); 46 | matrix result_add = copy_to_opencl(result_add_cpu); 47 | matrix result_add_scalar = copy_to_opencl(result_add_scalar_cpu); 48 | { 49 | std::cout<<"testing direct assignment row-row"< target = copy_to_opencl(target_cpu); 51 | kernels::assign(target,source); 52 | checkMatrixEqual(target,source); 53 | } 54 | { 55 | std::cout<<"testing functor assignment row-row"< target = copy_to_opencl(target_cpu); 57 | kernels::assign(target,source, device_traits::add()); 58 | checkMatrixEqual(target,result_add); 59 | } 60 | { 61 | std::cout<<"testing direct assignment row-column"< target = copy_to_opencl(target_cpu); 63 | kernels::assign(target,source_cm); 64 | checkMatrixEqual(target,source_cm); 65 | } 66 | { 67 | std::cout<<"testing functor assignment row-column"< target = copy_to_opencl(target_cpu); 69 | kernels::assign(target,source_cm, device_traits::add()); 70 | checkMatrixEqual(target,result_add); 71 | } 72 | { 73 | std::cout<<"testing functor scalar assignment"< target = copy_to_opencl(target_cpu); 75 | kernels::assign::add >(target,scalar); 76 | target.queue().finish(); 77 | checkMatrixEqual(target,result_add_scalar); 78 | } 79 | 80 | } 81 | 82 | BOOST_AUTO_TEST_SUITE_END() 83 | -------------------------------------------------------------------------------- /Test/opencl_syrk.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_Syrk 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | #include 15 | using namespace remora; 16 | 17 | template 18 | void checkSyrk(M const& arg_opencl, Result const& result_opencl,double init, double alpha, bool upper){ 19 | BOOST_REQUIRE_EQUAL(arg_opencl.size1(), result_opencl.size1()); 20 | BOOST_REQUIRE_EQUAL(result_opencl.size1(), result_opencl.size2()); 21 | 22 | matrix arg = copy_to_cpu(arg_opencl); 23 | matrix result = copy_to_cpu(result_opencl); 24 | 25 | if(upper){ 26 | for(std::size_t i = 0; i != result.size1(); ++i) { 27 | for(std::size_t j = 0; j != result.size2(); ++j) { 28 | if(j < i){ 29 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 30 | }else{ 31 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 32 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 33 | } 34 | } 35 | } 36 | }else{ 37 | for(std::size_t i = 0; i != result.size1(); ++i) { 38 | for(std::size_t j = 0; j != result.size2(); ++j) { 39 | if(j > i){ 40 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 41 | }else{ 42 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 43 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 44 | } 45 | } 46 | } 47 | } 48 | } 49 | 50 | BOOST_AUTO_TEST_SUITE (Remora_Gpu_Syrk) 51 | 52 | typedef boost::mpl::list result_orientations; 53 | BOOST_AUTO_TEST_CASE_TEMPLATE(syrk_test, Orientation,result_orientations) { 54 | std::size_t dims = 936;//chosen as not to be a multiple of the block 
size 55 | std::size_t K = 1039; 56 | 57 | //rhs 58 | matrix arg_cpu(dims, K, 1.0); 59 | for(std::size_t i = 0; i != dims; ++i) { 60 | for(std::size_t j = 0; j != K; ++j) { 61 | arg_cpu(i, j) = (1.0/ dims) * i + 0.2/K * j + 1; 62 | } 63 | } 64 | 65 | matrix argrm = copy_to_opencl(arg_cpu); 66 | matrix argcm = copy_to_opencl(arg_cpu); 67 | 68 | std::cout << "\nchecking syrk V+=AA^T" << std::endl; 69 | { 70 | std::cout<<"row major A, lower V"< result(dims,dims,3.0); 72 | kernels::syrk(argrm,result, 2.0); 73 | checkSyrk(argrm,result, 3.0, 2.0,false); 74 | } 75 | { 76 | std::cout<<"row major A, upper V"< result(dims,dims,3.0); 78 | kernels::syrk(argrm,result, 2.0); 79 | checkSyrk(argrm,result, 3.0, 2.0,true); 80 | } 81 | { 82 | std::cout<<"column major A, lower V"< result(dims,dims,3.0); 84 | kernels::syrk(argcm,result, 2.0); 85 | checkSyrk(argrm,result, 3.0, 2.0,false); 86 | } 87 | { 88 | std::cout<<"column major A, upper V"< result(dims,dims,3.0); 90 | kernels::syrk(argcm,result, 2.0); 91 | checkSyrk(argrm,result, 3.0, 2.0,true); 92 | } 93 | } 94 | 95 | BOOST_AUTO_TEST_SUITE_END() 96 | -------------------------------------------------------------------------------- /Test/opencl_vector_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_VectorAssign 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | using namespace remora; 11 | 12 | template 13 | void checkVectorEqual(V1 const& v1_opencl, V2 const& v2_opencl){ 14 | BOOST_REQUIRE_EQUAL(v1_opencl.size(),v2_opencl.size()); 15 | 16 | vector v1 = copy_to_cpu(v1_opencl); 17 | vector v2 = copy_to_cpu(v2_opencl); 18 | for(std::size_t i = 0; i != v2.size(); ++i){ 19 | BOOST_CHECK_EQUAL(v1(i),v2(i)); 20 | } 21 | } 22 | 23 | BOOST_AUTO_TEST_SUITE (Remora_opencl_vector_assign) 24 | 25 | BOOST_AUTO_TEST_CASE( Remora_Vector_Assign_Dense ){ 26 | std::cout<<"testing dense-dense assignment"< source_cpu(1000); 28 | vector target_cpu(1000); 29 | vector result_add_cpu(1000); 30 | vector result_add_scalar_cpu(1000); 31 | unsigned int scalar = 10; 32 | for(std::size_t i = 0; i != 1000; ++i){ 33 | source_cpu(i) = 2*i+1; 34 | target_cpu(i) = 3*i+2; 35 | result_add_cpu(i) = source_cpu(i) + target_cpu(i); 36 | result_add_scalar_cpu(i) = target_cpu(i) + scalar; 37 | } 38 | vector source = copy_to_opencl(source_cpu); 39 | vector result_add = copy_to_opencl(result_add_cpu); 40 | vector result_add_scalar = copy_to_opencl(result_add_scalar_cpu); 41 | { 42 | std::cout<<"testing direct assignment"< target = copy_to_opencl(target_cpu); 44 | kernels::assign(target,source); 45 | checkVectorEqual(target,source); 46 | } 47 | { 48 | std::cout<<"testing functor assignment"< target = copy_to_opencl(target_cpu); 50 | kernels::assign(target,source, device_traits::add()); 51 | checkVectorEqual(target,result_add); 52 | } 53 | { 54 | std::cout<<"testing functor scalar assignment"< target = copy_to_opencl(target_cpu); 56 | kernels::assign::add >(target,scalar); 57 | checkVectorEqual(target,result_add_scalar); 58 | } 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_SUITE_END() 63 | -------------------------------------------------------------------------------- /Test/syrk.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Syrk 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | using namespace remora; 12 | 13 | template 14 | void checkSyrk(M1 
const& arg, Result const& result,double init, double alpha, bool upper){ 15 | BOOST_REQUIRE_EQUAL(arg.size1(), result.size1()); 16 | BOOST_REQUIRE_EQUAL(result.size1(), result.size2()); 17 | 18 | if(upper){ 19 | for(std::size_t i = 0; i != result.size1(); ++i) { 20 | for(std::size_t j = 0; j != result.size2(); ++j) { 21 | if(j < i){ 22 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-10); 23 | }else{ 24 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 25 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-10); 26 | } 27 | } 28 | } 29 | }else{ 30 | for(std::size_t i = 0; i != result.size1(); ++i) { 31 | for(std::size_t j = 0; j != result.size2(); ++j) { 32 | if(j > i){ 33 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-10); 34 | }else{ 35 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 36 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-10); 37 | } 38 | } 39 | } 40 | } 41 | } 42 | 43 | BOOST_AUTO_TEST_SUITE(Remora_SYRK) 44 | 45 | 46 | 47 | typedef boost::mpl::list result_orientations; 48 | BOOST_AUTO_TEST_CASE_TEMPLATE(syrk_test, Orientation,result_orientations) { 49 | std::size_t dims = 384;//chosen as not to be a multiple of the block size 50 | std::size_t K = 244; 51 | 52 | //rhs 53 | matrix argrm(dims, K, 1.0); 54 | matrix argcm(dims, K, 1.0); 55 | for(std::size_t i = 0; i != dims; ++i) { 56 | for(std::size_t j = 0; j != K; ++j) { 57 | argrm(i, j) = argcm(i, j) = (1.0/ dims) * i + 0.2/K * j + 1; 58 | } 59 | } 60 | 61 | std::cout << "\nchecking syrk V+=AA^T" << std::endl; 62 | { 63 | std::cout<<"row major A, lower V"< result(dims, dims, 3.0); 65 | kernels::syrk(argrm,result, 2.0); 66 | checkSyrk(argrm,result, 3.0, 2.0,false); 67 | } 68 | { 69 | std::cout<<"row major A, upper V"< result(dims, dims, 3.0); 71 | kernels::syrk(argrm,result, 2.0); 72 | checkSyrk(argrm,result, 3.0, 2.0,true); 73 | } 74 | { 75 | std::cout<<"column major A, lower V"< result(dims, dims, 3.0); 77 | kernels::syrk(argcm,result, 2.0); 78 | checkSyrk(argrm,result, 3.0, 2.0,false); 79 | } 80 | { 81 | std::cout<<"column major A, upper V"< result(dims, dims, 3.0); 83 | kernels::syrk(argcm,result, 2.0); 84 | checkSyrk(argrm,result, 3.0, 2.0,true); 85 | } 86 | 87 | } 88 | 89 | BOOST_AUTO_TEST_SUITE_END() 90 | -------------------------------------------------------------------------------- /doc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) 2 | 3 | find_package( Doxygen REQUIRED ) 4 | find_package( PythonInterp REQUIRED ) 5 | 6 | ADD_SUBDIRECTORY(tutToRst) 7 | 8 | CONFIGURE_FILE ( 9 | "${CMAKE_CURRENT_SOURCE_DIR}/sphinx_pages/conf.py.in" 10 | "${CMAKE_CURRENT_SOURCE_DIR}/sphinx_pages/conf.py" 11 | ) 12 | 13 | set( SPHINX_EXECUTABLE sphinx-build ) 14 | set( SPHINX_PARAMETERS -b html ) 15 | 16 | add_custom_target(doc_creation) 17 | 18 | #find all .tut files 19 | file(GLOB_RECURSE TutFiles sphinx_pages *.tut) 20 | message(STATUS ${TutFiles}) 21 | foreach(tut ${TutFiles}) 22 | GET_FILENAME_COMPONENT(tutPath ${tut} PATH) 23 | GET_FILENAME_COMPONENT(tutName ${tut} NAME_WE) 24 | add_custom_command(TARGET doc_creation POST_BUILD COMMAND 25 | tut2rst ${tutPath}/${tutName} ${PROJECT_SOURCE_DIR}/examples 26 | ) 27 | endforeach() 28 | add_dependencies(doc_creation tut2rst) 29 | 30 | add_custom_target( doc ALL 31 | COMMAND ${SPHINX_EXECUTABLE} ${SPHINX_PARAMETERS} ${CMAKE_CURRENT_SOURCE_DIR}/sphinx_pages ${CMAKE_CURRENT_BINARY_DIR}/sphinx_pages/build/html 32 | ) 33 | 34 | 35 | add_dependencies(doc doc_creation) 
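# Note on the pipeline above: every .tut file found below sphinx_pages is first
# converted to .rst by the tut2rst tool (built in doc/tutToRst), which expands
# the code snippets referenced from the examples/ tree; the `doc` target then
# runs `sphinx-build -b html` on sphinx_pages and places the generated HTML in
# ${CMAKE_CURRENT_BINARY_DIR}/sphinx_pages/build/html.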
-------------------------------------------------------------------------------- /doc/tutToRst/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) 2 | 3 | ADD_EXECUTABLE( tut2rst 4 | tut2rst.cpp 5 | ) -------------------------------------------------------------------------------- /examples/Benchmarks/Timer.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Timer abstraction with microsecond resolution 5 | * 6 | * 7 | * 8 | * \author T. Voss, M. Tuma 9 | * \date 2010 10 | * 11 | * 12 | * \par Copyright 1995-2015 Shark Development Team 13 | * 14 | *

15 | * This file is part of Shark. 16 | * 17 | * 18 | * Shark is free software: you can redistribute it and/or modify 19 | * it under the terms of the GNU Lesser General Public License as published 20 | * by the Free Software Foundation, either version 3 of the License, or 21 | * (at your option) any later version. 22 | * 23 | * Shark is distributed in the hope that it will be useful, 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 26 | * GNU Lesser General Public License for more details. 27 | * 28 | * You should have received a copy of the GNU Lesser General Public License 29 | * along with Shark. If not, see . 30 | * 31 | */ 32 | 33 | #ifndef TIMER_HPP 34 | #define TIMER_HPP 35 | 36 | 37 | #ifdef _WIN32 38 | #define WIN32_LEAN_AND_MEAN 39 | #include 40 | #include 41 | #else 42 | #include 43 | #include 44 | #endif 45 | 46 | /// \brief Timer abstraction with microsecond resolution 47 | /// 48 | /// \par 49 | /// Use start() to start the timer and stop() to retrive the 50 | /// elapsed time in seconds (guaranteed/forced to be >= 0 ). 51 | /// Use now() to get the current time (may in rare cases give decreasing values). 52 | class Timer 53 | { 54 | public: 55 | Timer(bool measureWallclockTime = true) 56 | : m_lastLap( 0.0 ) 57 | , m_startTime( 0.0 ) 58 | , m_measureWallclockTime(measureWallclockTime) 59 | { start();} 60 | 61 | /// \brief Returns the current time in a microsecond resolution. Att: may in rare cases give decreasing values. 62 | static double now(bool measureWallclockTime = true) { 63 | #ifdef _WIN32 64 | if(measureWallclockTime){ 65 | return static_cast(std::clock()) / CLOCKS_PER_SEC; 66 | } 67 | else{ 68 | LARGE_INTEGER tick, tps; 69 | QueryPerformanceFrequency(&tps); 70 | QueryPerformanceCounter(&tick); 71 | return( static_cast( tick.QuadPart ) / static_cast( tps.QuadPart ) ); 72 | } 73 | #else 74 | if(measureWallclockTime){ 75 | timeval time; 76 | if (gettimeofday(&time,0)){ 77 | // Handle error 78 | return 0; 79 | } 80 | return time.tv_sec +1e-6 *time.tv_usec; 81 | } 82 | else 83 | { 84 | rusage res; 85 | getrusage(RUSAGE_SELF, &res); 86 | return(res.ru_utime.tv_sec + res.ru_stime.tv_sec) 87 | + 1e-6 * (res.ru_utime.tv_usec + res.ru_stime.tv_usec); 88 | } 89 | #endif 90 | } 91 | 92 | /// \brief Stores the current time in m_startTime. 93 | void start() { 94 | m_startTime = now(m_measureWallclockTime); 95 | } 96 | 97 | /// \brief Returns the difference between current time and the start time. 98 | /// 99 | /// The time is meeasured since the last time start() was called. Thus several consecutive 100 | /// calls to stop() will return ascending numbers. start() is called automatically at construction time. 101 | double stop() { 102 | double stop = now(m_measureWallclockTime); 103 | m_lastLap = stop - m_startTime; 104 | 105 | // avoid rare cases of non-increasing timer values (cf. eg. http://www.linuxmisc.com/8-freebsd/d4c6ddc8fbfbd523.htm) 106 | 107 | if ( m_lastLap < 0.0 ) { 108 | m_lastLap = 0.0; 109 | } 110 | 111 | return m_lastLap; 112 | } 113 | 114 | /// \brief Returns the last value of stop(). 
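///
/// Note: lastLap() performs no measurement itself; it only returns the
/// duration captured by the most recent call to stop().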
115 | double lastLap() { 116 | return m_lastLap; 117 | } 118 | 119 | private: 120 | double m_lastLap; 121 | double m_startTime; 122 | bool m_measureWallclockTime; 123 | }; 124 | #endif 125 | -------------------------------------------------------------------------------- /examples/Benchmarks/conv2d.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "Timer.hpp" 4 | #include 5 | using namespace remora; 6 | 7 | template 8 | void benchmark( 9 | matrix_expression const& image, 10 | vector_expression const& filter, 11 | std::size_t num_channels, 12 | std::size_t num_filters, 13 | std::size_t image_size1, 14 | std::size_t image_size2, 15 | std::size_t filter_size 16 | ){ 17 | std::size_t output_size1 = image_size1 - filter_size +1; 18 | std::size_t output_size2 = image_size2 - filter_size +1; 19 | typedef typename E1::value_type value_type; 20 | 21 | remora::matrix out(image().size1(), output_size1 * num_filters * output_size2, 0.0); 22 | double minOptTime = std::numeric_limits::max(); 23 | for(std::size_t i = 0; i != std::max(1, std::size_t(20/image().size1())); ++i){ 24 | Timer time; 25 | kernels::conv2d(image,filter,out, num_channels, num_filters, image_size1, image_size2, filter_size, filter_size,0,0); 26 | minOptTime = std::min(minOptTime,time.stop()); 27 | } 28 | 29 | double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels; 30 | double flops = image().size1() * mults /1024/1024/minOptTime; 31 | 32 | std::cout< 37 | void benchmark(std::size_t num_channels, std::size_t num_outputs, std::size_t num_images){ 38 | std::cout<<"Flops"< image(num_images, num_channels * sizeIm1 * sizeIm2); 47 | remora::vector filter(num_channels * num_outputs * filterSize * filterSize); 48 | 49 | for(std::size_t im = 0; im != num_images; ++im){ 50 | for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){ 51 | for(std::size_t j = 0; j != sizeIm2; ++j){ 52 | image(im, i * sizeIm2 + j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j; 53 | } 54 | } 55 | } 56 | for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){ 57 | for(std::size_t j = 0; j != filterSize; ++j){ 58 | filter(i * filterSize + j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j; 59 | } 60 | } 61 | 62 | benchmark(image,filter,num_channels,num_outputs, sizeIm1, sizeIm2, filterSize); 63 | } 64 | } 65 | } 66 | 67 | 68 | int main(int argc, char **argv) { 69 | std::cout<<"performance float"<(8,32,16); 71 | std::cout<<"performance double"<(8,32,16); 73 | } 74 | -------------------------------------------------------------------------------- /examples/Benchmarks/gemm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | matrix_expression const& B, 10 | matrix_expression & C 11 | ){ 12 | double minTime = std::numeric_limits::max(); 13 | for(std::size_t i = 0; i != 10; ++i){ 14 | Timer time; 15 | noalias(C) += prod(A,B); 16 | minTime = std::min(minTime,time.stop()); 17 | } 18 | return (A().size1()*A().size2()*B().size2())/minTime/1024/1024; 19 | } 20 | 21 | int main(int argc, char **argv) { 22 | std::size_t size = 100; 23 | std::cout<<"Flops"< Arow(size,middle); 27 | for(std::size_t i = 0; i != size; ++i){ 28 | for(std::size_t k = 0; k != middle; ++k){ 29 | Arow(i,k) = 0.1/size*i+0.1/size*k; 30 | } 31 | } 32 | 33 | 
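// Note: benchmark() above reports the best of ten runs of C += prod(A,B) as
// A.size1()*A.size2()*B.size2() / time / 1024^2, i.e. multiply-accumulate
// operations per second in units of 2^20, for each combination of row- and
// column-major operand layouts constructed below.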
matrix Brow(middle,size); 34 | for(std::size_t k = 0; k != middle; ++k){ 35 | for(std::size_t j = 0; j != size; ++j){ 36 | Brow(k,j) = 0.1/size*j+0.1/size*k; 37 | } 38 | } 39 | matrix Acol = Arow; 40 | matrix Bcol = Brow; 41 | 42 | matrix Crow(size,size,0.0); 43 | matrix Ccol(size,size,0.0); 44 | std::cout< 2 | #include 3 | #include 4 | #include "Timer.hpp" 5 | #include 6 | using namespace remora; 7 | 8 | template 9 | void benchmark( 10 | matrix_expression const& image, 11 | vector_expression const& filter, 12 | std::size_t num_channels, 13 | std::size_t num_filters, 14 | std::size_t image_size1, 15 | std::size_t image_size2, 16 | std::size_t filter_size 17 | ){ 18 | std::size_t output_size1 = image_size1 - filter_size +1; 19 | std::size_t output_size2 = image_size2 - filter_size +1; 20 | typedef typename E1::value_type value_type; 21 | 22 | remora::matrix image_opencl = copy_to_opencl(image); 23 | remora::vector filter_opencl = copy_to_opencl(filter); 24 | remora::matrix out_opencl(image().size1(), output_size1 * num_filters * output_size2, 0.0); 25 | kernels::conv2d(image_opencl,filter_opencl,out_opencl, num_channels, num_filters, image_size1, image_size2, filter_size, filter_size,0,0); 26 | out_opencl.queue().finish(); 27 | double minOptTime = std::numeric_limits::max(); 28 | for(std::size_t i = 0; i != 10; ++i){ 29 | Timer time; 30 | kernels::conv2d(image_opencl,filter_opencl,out_opencl, num_channels, num_filters, image_size1, image_size2, filter_size, filter_size,0,0); 31 | out_opencl.queue().finish(); 32 | minOptTime = std::min(minOptTime,time.stop()); 33 | } 34 | 35 | double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels; 36 | double flops = image().size1() * mults /1024/1024/minOptTime; 37 | 38 | double storage = double(out_opencl.size1() * out_opencl.size2())/1024/1024; 39 | std::cout< 44 | void benchmark(std::size_t num_channels, std::size_t num_outputs, std::size_t num_images){ 45 | std::cout<<"im_size\tfiltpx\tincChan\tOutChan\tmemOut\tFlops"< image(num_images, num_channels * sizeIm1 * sizeIm2); 54 | remora::vector filter(num_channels * num_outputs * filterSize * filterSize); 55 | 56 | for(std::size_t im = 0; im != num_images; ++im){ 57 | for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){ 58 | for(std::size_t j = 0; j != sizeIm2; ++j){ 59 | image(im, i * sizeIm2 + j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j; 60 | } 61 | } 62 | } 63 | for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){ 64 | for(std::size_t j = 0; j != filterSize; ++j){ 65 | filter(i * filterSize + j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j; 66 | } 67 | } 68 | 69 | benchmark(image,filter,num_channels,num_outputs, sizeIm1, sizeIm2, filterSize); 70 | } 71 | } 72 | } 73 | 74 | 75 | int main(int argc, char **argv) { 76 | std::cout<<"performance float"<(3,16,4); 78 | } 79 | -------------------------------------------------------------------------------- /examples/Benchmarks/potrf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | Triang 10 | ){ 11 | double minTime = std::numeric_limits::max(); 12 | volatile double res = 0; 13 | for(std::size_t i = 0; i != 20; ++i){ 14 | typename matrix_temporary::type Acopy = A; 15 | Timer time; 16 | kernels::potrf(Acopy); 17 | minTime = std::min(minTime,time.stop()); 18 | res += 
max(Acopy); 19 | } 20 | return (1.0/3.0*A().size1()*A().size1()*A().size1())/minTime/1024/1024; 21 | } 22 | 23 | int main(int argc, char **argv) { 24 | std::size_t size = 128; 25 | std::cout<<"Mega Flops"< Arow(size,size); 28 | for(std::size_t i = 0; i != size; ++i){ 29 | for(std::size_t j = 0; j != size; ++j){ 30 | Arow(i,j) = 0.1/size*i+0.1/size*j; 31 | } 32 | Arow(i,i) += 1000.0; 33 | } 34 | matrix Acol = Arow; 35 | std::cout< 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | matrix_expression & C 10 | ){ 11 | double minTime = std::numeric_limits::max(); 12 | for(std::size_t i = 0; i != 10; ++i){ 13 | Timer time; 14 | kernels::syrk(A,C,2.0); 15 | minTime = std::min(minTime,time.stop()); 16 | } 17 | return (0.5*A().size1()*A().size2()*A().size1())/minTime/1024/1024; 18 | } 19 | 20 | int main(int argc, char **argv) { 21 | std::size_t size = 100; 22 | std::cout<<"Mega Flops"< Arow(size,size); 25 | for(std::size_t i = 0; i != size; ++i){ 26 | for(std::size_t k = 0; k != size; ++k){ 27 | Arow(i,k) = 0.1/size*i+0.1/size*k; 28 | } 29 | } 30 | matrix Acol = Arow; 31 | 32 | matrix Crow(size,size,0.0); 33 | matrix Ccol(size,size,0.0); 34 | std::cout<(Arow,Crow)<<"\t"<< benchmark(Acol,Crow)<(Arow,Crow)<<"\t"<< benchmark(Acol,Crow)<(Arow,Ccol)<<"\t"<< benchmark(Acol,Ccol)<(Arow,Ccol)<<"\t"<< benchmark(Acol,Ccol)< 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | matrix_expression const& B, 10 | matrix_expression & C 11 | ){ 12 | double minTime = std::numeric_limits::max(); 13 | for(std::size_t i = 0; i != 10; ++i){ 14 | Timer time; 15 | noalias(C) += triangular_prod(A,B); 16 | minTime = std::min(minTime,time.stop()); 17 | } 18 | return (0.5*A().size1()*A().size2()*B().size2())/minTime/1024/1024; 19 | } 20 | 21 | int main(int argc, char **argv) { 22 | std::size_t size = 100; 23 | std::cout<<"Mega Flops"< Arow(size,size); 26 | for(std::size_t i = 0; i != size; ++i){ 27 | for(std::size_t k = 0; k != size; ++k){ 28 | Arow(i,k) = 0.1/size*i+0.1/size*k; 29 | } 30 | } 31 | 32 | matrix Brow(size,size); 33 | for(std::size_t k = 0; k != size; ++k){ 34 | for(std::size_t j = 0; j != size; ++j){ 35 | Brow(k,j) = 0.1/size*j+0.1/size*k; 36 | } 37 | } 38 | matrix Acol = Arow; 39 | matrix Bcol = Brow; 40 | 41 | matrix Crow(size,size,0.0); 42 | matrix Ccol(size,size,0.0); 43 | std::cout<(Arow,Brow,Crow)<<"\t"<< benchmark(Acol,Brow,Crow) 44 | <<"\t"<< benchmark(Arow,Bcol,Crow) <<"\t" <(Acol,Bcol,Crow) <(Arow,Brow,Crow)<<"\t"<< benchmark(Acol,Brow,Crow) 46 | <<"\t"<< benchmark(Arow,Bcol,Crow) <<"\t" <(Acol,Bcol,Crow) <(Arow,Brow,Ccol)<<"\t"<< benchmark(Acol,Brow,Ccol) 48 | <<"\t"<< benchmark(Arow,Bcol,Ccol) <<"\t" <(Acol,Bcol,Ccol) <(Arow,Brow,Ccol)<<"\t"<< benchmark(Acol,Brow,Ccol) 50 | <<"\t"<< benchmark(Arow,Bcol,Ccol) <<"\t" <(Acol,Bcol,Ccol) < 2 | #include 3 | 4 | //###begin 5 | #include 6 | using namespace remora; 7 | //###end 8 | 9 | 10 | int main(){ 11 | //Step 0: Theory 12 | // The goal of linear regression is to find a linear function f(x) = w^Tx + b 13 | // that fits best a given set of point-label pairs (x1,y1),(x2,y2),...,(xN,yN). 
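	// Here each x_i is a vector in R^d and y_i is a real-valued label. Below, X
	// stacks the points x_i as rows, and (X|1) denotes X with a column of ones
	// appended; the extra column absorbs the offset b into the weight vector w.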
14 | // This is measured by the squared error: 15 | // E(w) = 1/(2N) sum_i (f(x_i)-y_i)^2 16 | // It turns out that the optimal solution can be written in simple matrix algebra, 17 | // when X is the data matrix where points are stored row-wise and y is the vector 18 | // of labels: 19 | // w=((X|1)^T (X|1))^{-1} (X|1)^Ty 20 | 21 | //Step 1: Generate some random data 22 | //###begin 23 | std::size_t num_data_points = 100; 24 | std::size_t num_dims = 50; 25 | matrix X(num_data_points, num_dims); 26 | vector y(num_data_points); 27 | //###end 28 | //###begin 29 | std::random_device rd; 30 | std::mt19937 gen(rd()); 31 | std::normal_distribution<> normal(0,2); 32 | for(std::size_t i = 0; i != num_data_points; ++i){ 33 | for(std::size_t j = 0; j != num_dims; ++j){ 34 | X(i,j) = normal(gen); //set element (i,j) of X to a rnadomly generated number 35 | } 36 | } 37 | //###end 38 | //###begin 39 | std::normal_distribution<> normal_noise(0,0.1); 40 | for(std::size_t i = 0; i != num_data_points; ++i){ 41 | //label is chosen to be just the sum of entries plus some noise 42 | y(i) = sum(row(X,i)) + normal_noise(gen) + 1; 43 | } 44 | //###end 45 | // Step 2: compute the linear regression 46 | // formula is w=((X|1)^T (X|1))^{-1} (X|1)^Ty 47 | // we need to tell the algebra how to solve the system of equations, 48 | // in this case we tell it that the matrix is symmetric positive definite. 49 | // but we have to be aware that our matrix is not always full rank, 50 | // e..g when we have more variables than data or when some variable 51 | // is constant 0. 52 | //###begin 53 | vector w = inv(trans(X|1) % (X|1), symm_semi_pos_def()) % trans(X|1) % y; 54 | //###end 55 | // Step 3: evaluate solution 56 | // we compute: E(w) = 1/(2N) sum_i (f(x_i)-y_i)^2 57 | //###begin 58 | double error = 0.5 * sum(sqr((X|1) % w - y)) / num_data_points; 59 | //###end 60 | // Step 4: For ensuring correctness, we will check that 61 | // the derivative of E(w) at the solution is small (on the order of 1.e-14) 62 | //###begin 63 | vector derE = trans(X|1) % ((X|1) % w - y) / num_data_points; 64 | double error_derivative = norm_inf(derE); 65 | //###end 66 | std::cout<<"final error of fit: "<< error< 5 | 6 | #ifndef NDEBUG 7 | #define REMORA_RANGE_CHECK(cond) assert(cond) 8 | #define REMORA_SIZE_CHECK(cond) assert(cond) 9 | #else 10 | #define REMORA_RANGE_CHECK(cond) do { (void)sizeof(cond); } while (false) 11 | #define REMORA_SIZE_CHECK(cond) do { (void)sizeof(cond); } while (false) 12 | #endif 13 | 14 | #endif -------------------------------------------------------------------------------- /include/remora/detail/evaluation_tags.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief Tags representing different type of expression evaluation categories 6 | * 7 | * \author O. Krause 8 | * \date 2016 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see . 29 | * 30 | */ 31 | //=========================================================================== 32 | 33 | #ifndef REMORA_DETAIL_EVALUATION_TAGS_HPP 34 | #define REMORA_DETAIL_EVALUATION_TAGS_HPP 35 | 36 | namespace remora{ 37 | 38 | // Evaluation type tags: 39 | // dense_tag -> dense storage scheme an dense interface supported 40 | // continuous_dense_tag -> dense storage scheme where stride between all elements is 1 41 | // sparse_tag -> sparse storage scheme and supports sparse interface. 42 | // packed_tag ->BLAS packed format and supports packed interface 43 | // unknown_tag -> no known storage scheme, only supports basic interface(probably blockwise evaluation) 44 | struct unknown_tag{}; 45 | struct sparse_tag:public unknown_tag{}; 46 | struct dense_tag: public unknown_tag{}; 47 | struct continuous_dense_tag: public dense_tag{}; 48 | struct packed_tag: public unknown_tag{}; 49 | 50 | struct elementwise_tag{}; 51 | struct blockwise_tag{}; 52 | 53 | //evaluation categories 54 | template 55 | struct elementwise: public elementwise_tag{ 56 | typedef Tag tag; 57 | }; 58 | template 59 | struct blockwise: public blockwise_tag{ 60 | typedef Tag tag; 61 | }; 62 | 63 | 64 | template 65 | struct evaluation_tag_restrict_traits{ 66 | typedef Tag1 type; 67 | }; 68 | 69 | template 70 | struct evaluation_tag_restrict_traits { 71 | typedef dense_tag type; 72 | }; 73 | 74 | template<> 75 | struct evaluation_tag_restrict_traits { 76 | typedef sparse_tag type; 77 | }; 78 | 79 | namespace detail{ 80 | template 81 | struct evaluation_restrict_traits{ 82 | typedef blockwise::type> type; 85 | }; 86 | template 87 | struct evaluation_restrict_traits, elementwise >{ 88 | typedef elementwise::type> type; 89 | }; 90 | } 91 | template 92 | struct evaluation_restrict_traits: public detail::evaluation_restrict_traits< 93 | typename E1::evaluation_category, 94 | typename E2::evaluation_category 95 | >{}; 96 | 97 | } 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /include/remora/detail/proxy_optimizers_fwd.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Proxy Optimizations 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see . 26 | * 27 | */ 28 | #ifndef REMORA_DETAIL_PROXY_OPTIMIZERS_FWD_HPP 29 | #define REMORA_DETAIL_PROXY_OPTIMIZERS_FWD_HPP 30 | 31 | namespace remora{namespace detail{ 32 | 33 | //forward declarations 34 | template 35 | struct vector_range_optimizer; 36 | 37 | template 38 | struct matrix_transpose_optimizer; 39 | template 40 | struct matrix_row_optimizer; 41 | template 42 | struct matrix_range_optimizer; 43 | 44 | template 45 | struct matrix_rows_optimizer; 46 | 47 | template 48 | struct linearized_matrix_optimizer; 49 | 50 | template 51 | struct vector_to_matrix_optimizer; 52 | 53 | template 54 | struct matrix_diagonal_optimizer; 55 | 56 | template 57 | struct triangular_proxy_optimizer; 58 | 59 | template 60 | struct vector_scalar_multiply_optimizer; 61 | 62 | template 63 | struct matrix_scalar_multiply_optimizer; 64 | 65 | }} 66 | #endif 67 | -------------------------------------------------------------------------------- /include/remora/detail/vector_set.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Classes used for matrix expressions. 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_VECTOR_PROXY_SET_CLASSES_HPP 29 | #define REMORA_VECTOR_PROXY_SET_CLASSES_HPP 30 | 31 | #include "traits.hpp" 32 | #include "../expression_types.hpp" 33 | 34 | namespace remora{ 35 | 36 | template<class O, class E> 37 | class vector_set:public vector_set_expression<vector_set<O, E>, typename E::device_type >{ 38 | public: 39 | typedef typename closure<E>::type expression_closure_type; 40 | typedef typename E::size_type size_type; 41 | typedef typename E::value_type value_type; 42 | typedef typename E::const_reference const_reference; 43 | typedef typename reference<E>::type reference; 44 | 45 | typedef vector_set const_closure_type; 46 | typedef vector_set closure_type; 47 | typedef O point_orientation; 48 | typedef typename E::evaluation_category evaluation_category; 49 | typedef typename E::device_type device_type; 50 | 51 | // Construction 52 | explicit vector_set(expression_closure_type const& e):m_expression(e){} 53 | 54 | // Accessors 55 | size_type size() const{ 56 | return point_orientation::index_M(m_expression.size1(), m_expression.size2()); 57 | } 58 | size_type point_size() const{ 59 | return point_orientation::index_m(m_expression.size1(), m_expression.size2()); 60 | } 61 | 62 | expression_closure_type const& expression() const{ 63 | return m_expression; 64 | } 65 | typename device_traits<device_type>::queue_type& queue()const{ 66 | return m_expression.queue(); 67 | } 68 | 69 | // Computation Kernels 70 | template<class MatX> 71 | void assign_to(matrix_expression<MatX, device_type>& X, typename MatX::value_type alpha)const{ 72 | assign(X, m_expression, alpha); 73 | } 74 | template<class MatX> 75 | void plus_assign_to(matrix_expression<MatX, device_type>& X, typename MatX::value_type alpha)const{ 76 | plus_assign(X, m_expression, alpha); 77 | } 78 | private: 79 | expression_closure_type m_expression; 80 | }; 81 | 82 | } 83 | #endif 84 | -------------------------------------------------------------------------------- /include/remora/device_copy.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief expression templates for copying from cpu to device and back 3 | * 4 | * \author O. Krause 5 | * \date 2013 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *
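To make the index_M/index_m accessors above concrete: for O = row_major, a vector_set wrapping an n x m matrix expression is a set of n points of dimension m (its rows); for O = column_major it is m points of dimension n. The helper below is not part of the library, just the same selection logic spelled out:

#include <cstddef>

// index_M picks the "major" extent (number of points), index_m the "minor" one
// (dimension of each point), depending on the point orientation O.
std::size_t size_of_set(bool row_points, std::size_t size1, std::size_t size2){
	return row_points ? size1 : size2; // rows as points -> size1 points
}
std::size_t size_of_point(bool row_points, std::size_t size1, std::size_t size2){
	return row_points ? size2 : size1; // each row-point has size2 entries
}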

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_DEVICE_COPY_HPP 29 | #define REMORA_DEVICE_COPY_HPP 30 | 31 | #include "expression_types.hpp" 32 | 33 | namespace remora{ 34 | 35 | template<class E> 36 | E const& copy_to_cpu(vector_expression<E, cpu_tag> const& e){ 37 | return e(); 38 | } 39 | 40 | 41 | template<class E> 42 | E const& copy_to_cpu(matrix_expression<E, cpu_tag> const& e){ 43 | return e(); 44 | } 45 | 46 | template<class E> 47 | E const& copy_to_device(vector_expression<E, cpu_tag> const& e, cpu_tag){ 48 | return e(); 49 | } 50 | 51 | 52 | template<class E> 53 | E const& copy_to_device(matrix_expression<E, cpu_tag> const& e, cpu_tag){ 54 | return e(); 55 | } 56 | 57 | } 58 | 59 | #ifdef REMORA_USE_OPENCL 60 | #include "opencl/copy.hpp" 61 | #endif 62 | 63 | #if defined(__HCC__) || defined(__NVCC__) 64 | #include "hip/copy.hpp" 65 | #endif 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /include/remora/hip/exception.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief Error handling for the HIP runtime 6 | * 7 | * \author O. Krause 8 | * \date 2018 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
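The overloads above are the no-op halves of the device-copy interface: on CPU targets both directions are identity functions, so generic code can call them unconditionally. A usage sketch, assuming remora's dense CPU vector type:

#include <remora/remora.hpp> // assumed umbrella header for this sketch

void copy_sketch(){
	remora::vector<double> v(10, 1.0);
	// on cpu targets both calls are the identity and perform no copy
	auto const& on_cpu = remora::copy_to_cpu(v);
	auto const& on_dev = remora::copy_to_device(v, remora::cpu_tag());
	(void)on_cpu; (void)on_dev;
}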

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | 33 | #ifndef REMORA_HIP_EXCEPTION_HPP 34 | #define REMORA_HIP_EXCEPTION_HPP 35 | 36 | #include <hip/hip_runtime.h> 37 | #include <system_error> 38 | #include <string> 39 | namespace remora{namespace hip{ 40 | 41 | class hip_error_category: public std::error_category{ 42 | public: 43 | const char* name() const noexcept{ 44 | return "HIP"; 45 | } 46 | std::string message( int error ) const{ 47 | return hipGetErrorString(static_cast<hipError_t>(error)); 48 | } 49 | static hip_error_category& category(){ 50 | static hip_error_category cat; 51 | return cat; 52 | } 53 | }; 54 | 55 | class hip_exception:public std::system_error{ 56 | public: 57 | hip_exception(hipError_t code): std::system_error(code, hip_error_category::category()){} 58 | }; 59 | 60 | inline void check_hip(hipError_t code){ 61 | if(code != hipSuccess) 62 | throw hip_exception(code); 63 | } 64 | 65 | }} 66 | #endif -------------------------------------------------------------------------------- /include/remora/kernels/atlas/potrf.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2011 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
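A sketch of the error path defined above: a failing runtime call wrapped in check_hip surfaces as a std::system_error whose category name is "HIP". hipErrorUnknown is used purely as an example error code here.

#include <iostream>

void error_sketch(){
	try{
		remora::hip::check_hip(hipErrorUnknown); // any code != hipSuccess throws
	}catch(remora::hip::hip_exception const& e){
		// prints the category name and the hipGetErrorString message
		std::cerr << e.code().category().name() << ": " << e.what() << '\n';
	}
}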

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | 33 | #ifndef REMORA_KERNELS_ATLAS_POTRF_H 34 | #define REMORA_KERNELS_ATLAS_POTRF_H 35 | 36 | #include "../cblas/cblas_inc.hpp" 37 | extern "C"{ 38 | #include <clapack.h> 39 | } 40 | 41 | namespace remora { 42 | namespace bindings { 43 | 44 | inline int potrf( 45 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 46 | int const N, float *A, int const lda 47 | ) { 48 | return clapack_spotrf(Order, Uplo, N, A, lda); 49 | } 50 | 51 | inline int potrf( 52 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 53 | int const N, double *A, int const lda 54 | ) { 55 | return clapack_dpotrf(Order, Uplo, N, A, lda); 56 | } 57 | 58 | inline int potrf( 59 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 60 | int const N, std::complex<float>* A, int const lda 61 | ) { 62 | return clapack_cpotrf(Order, Uplo, N, static_cast<void*>(A), lda); 63 | } 64 | 65 | inline int potrf( 66 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 67 | int const N, std::complex<double>* A, int const lda 68 | ) { 69 | return clapack_zpotrf(Order, Uplo, N, static_cast<void*>(A), lda); 70 | } 71 | 72 | template<class Triangular, class MatA> 73 | inline int potrf( 74 | matrix_container<MatA, cpu_tag>& A, 75 | std::true_type 76 | ) { 77 | CBLAS_UPLO const uplo = Triangular::is_upper ? CblasUpper : CblasLower; 78 | CBLAS_ORDER const stor_ord = 79 | (CBLAS_ORDER)storage_order<typename MatA::orientation>::value; 80 | 81 | std::size_t n = A().size1(); 82 | REMORA_SIZE_CHECK(n == A().size2()); 83 | 84 | auto storageA = A().raw_storage(); 85 | return potrf( 86 | stor_ord, uplo, (int)n, 87 | storageA.values, 88 | storageA.leading_dimension 89 | ); 90 | } 91 | 92 | template<class Tag, class T> 93 | struct optimized_potrf_detail { 94 | typedef std::false_type type; 95 | }; 96 | template<> 97 | struct optimized_potrf_detail < 98 | dense_tag, 99 | double 100 | > { 101 | typedef std::true_type type; 102 | }; 103 | template<> 104 | struct optimized_potrf_detail < 105 | dense_tag, 106 | float 107 | > { 108 | typedef std::true_type type; 109 | }; 110 | template<> 111 | struct optimized_potrf_detail < 112 | dense_tag, 113 | std::complex<float> 114 | > { 115 | typedef std::true_type type; 116 | }; 117 | 118 | template<> 119 | struct optimized_potrf_detail < 120 | dense_tag, 121 | std::complex<double> 122 | > { 123 | typedef std::true_type type; 124 | }; 125 | 126 | template<class M> 127 | struct has_optimized_potrf 128 | : public optimized_potrf_detail < 129 | typename M::storage_type::storage_tag, 130 | typename M::value_type 131 | > {}; 132 | }} 133 | #endif 134 | -------------------------------------------------------------------------------- /include/remora/kernels/cblas/cblas_inc.hpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | * 3 | * 4 | * \brief - 5 | * 6 | * \author - 7 | * \date - 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_CBLAS_CBLAS_INC_HPP 32 | #define REMORA_KERNELS_CBLAS_CBLAS_INC_HPP 33 | 34 | #ifdef __APPLE__ 35 | 36 | #ifdef __ASSERTMACROS__ //is AssertMacros already included? 37 | //AssertMacros automatically defines __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES as 1 38 | //if not already included 39 | #if __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 40 | #warning "AssertMacros.h already included by some file. Disabling macros as otherwise compilation will fail" 41 | 42 | //incomplete list (probably the worst offenders that will fail compilation) 43 | #ifdef check 44 | #undef check 45 | #endif 46 | #ifdef require 47 | #undef require 48 | #endif 49 | #ifdef verify 50 | #undef verify 51 | #endif 52 | 53 | #endif 54 | #else 55 | //no one has included it yet, so we can just prevent these macros... 56 | #define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0 57 | #endif 58 | 59 | // included to make Accelerate work with boost on MacOS 60 | #include 61 | 62 | // Accelerate framework support added by TG 19.06.2015 63 | extern "C" { 64 | #include <Accelerate/Accelerate.h> 65 | } 66 | #undef nil 67 | 68 | #else 69 | 70 | extern "C" { 71 | #include <cblas.h> 72 | } 73 | 74 | #endif 75 | 76 | //all atlas using functions need this anyway... 77 | //so we prevent multiple includes in all atlas using functions 78 | //which should decrease compile time a small bit 79 | #include <complex> 80 | #include "../../detail/traits.hpp" 81 | 82 | namespace remora {namespace bindings { 83 | 84 | template<class Orientation> struct storage_order {}; 85 | template<> struct storage_order<row_major> { 86 | enum ename { value = CblasRowMajor }; 87 | }; 88 | template<> struct storage_order<column_major> { 89 | enum ename { value = CblasColMajor }; 90 | }; 91 | 92 | template<class T> 93 | struct allowed_cblas_type{ 94 | typedef std::false_type type; 95 | }; 96 | 97 | template<> 98 | struct allowed_cblas_type<float>{ 99 | typedef std::true_type type; 100 | }; 101 | template<> 102 | struct allowed_cblas_type<double>{ 103 | typedef std::true_type type; 104 | }; 105 | template<> 106 | struct allowed_cblas_type<std::complex<float> >{ 107 | typedef std::true_type type; 108 | }; 109 | template<> 110 | struct allowed_cblas_type<std::complex<double> >{ 111 | typedef std::true_type type; 112 | }; 113 | 114 | }} 115 | 116 | #ifndef OPENBLAS_CONST 117 | typedef void cblas_float_complex_type; 118 | typedef void cblas_double_complex_type; 119 | #else 120 | typedef float cblas_float_complex_type; 121 | typedef double cblas_double_complex_type; 122 | #endif 123 | 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /include/remora/kernels/cblas/syrk.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. 
Krause 8 | * \date 2010 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
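How the cblas_inc.hpp traits above are meant to be queried, as a compile-time sketch: allowed_cblas_type whitelists the four element types CBLAS can handle; everything else falls back to the generic kernels.

#include <complex>

static_assert(remora::bindings::allowed_cblas_type<double>::type::value,
	"double maps to the cblas_d* routines");
static_assert(remora::bindings::allowed_cblas_type<std::complex<float> >::type::value,
	"complex<float> maps to the cblas_c* routines");
static_assert(!remora::bindings::allowed_cblas_type<int>::type::value,
	"no cblas routine for int; generic fallback is used");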

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CBLAS_SYRK_HPP 33 | #define REMORA_KERNELS_CBLAS_SYRK_HPP 34 | 35 | #include "cblas_inc.hpp" 36 | #include <type_traits> 37 | 38 | namespace remora{ namespace bindings { 39 | 40 | inline void syrk( 41 | CBLAS_ORDER const order, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, 42 | int N, int K, 43 | float alpha, float const *A, int lda, 44 | float beta, float *C, int ldc 45 | ){ 46 | cblas_ssyrk( 47 | order, uplo, trans, 48 | N, K, 49 | alpha, A, lda, 50 | beta, C, ldc 51 | ); 52 | } 53 | 54 | inline void syrk( 55 | CBLAS_ORDER const order, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, 56 | int N, int K, 57 | double alpha, double const *A, int lda, 58 | double beta, double *C, int ldc 59 | ){ 60 | cblas_dsyrk( 61 | order, uplo, trans, 62 | N, K, 63 | alpha, A, lda, 64 | beta, C, ldc 65 | ); 66 | } 67 | 68 | 69 | // C <- C + alpha * A * A^T 70 | template<bool Upper, class MatA, class MatC> 71 | void syrk( 72 | matrix_expression<MatA, cpu_tag> const& A, 73 | matrix_expression<MatC, cpu_tag>& C, 74 | typename MatC::value_type alpha, 75 | std::true_type 76 | ) { 77 | REMORA_SIZE_CHECK(A().size1() == C().size1()); 78 | REMORA_SIZE_CHECK(C().size1() == C().size2()); 79 | 80 | CBLAS_ORDER stor_ord = (CBLAS_ORDER) storage_order<typename MatC::orientation>::value; 81 | CBLAS_UPLO uplo = Upper?CblasUpper: CblasLower; 82 | CBLAS_TRANSPOSE trans = std::is_same<typename MatA::orientation, typename MatC::orientation>::value?CblasNoTrans:CblasTrans; 83 | std::size_t n = C().size1(); 84 | std::size_t k = A().size2(); 85 | 86 | 87 | auto storageA = A().raw_storage(); 88 | auto storageC = C().raw_storage(); 89 | syrk(stor_ord, uplo, trans, 90 | (int)n, (int)k, alpha, 91 | storageA.values, 92 | (int)storageA.leading_dimension, 93 | typename MatC::value_type(1), 94 | storageC.values, 95 | (int)storageC.leading_dimension 96 | ); 97 | } 98 | 99 | template<class MatA, class MatC> 100 | struct has_optimized_syrk: std::integral_constant<bool, 101 | allowed_cblas_type<typename MatA::value_type>::type::value 102 | && std::is_same<typename MatA::value_type, typename MatC::value_type>::value 103 | && std::is_base_of<dense_tag, typename MatA::storage_type::storage_tag>::value 104 | && std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value 105 | >{}; 106 | 107 | }} 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/gemv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
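For reference, the semantics of the syrk binding above written as plain loops (lower-triangular case, Upper = false): only the lower triangle of C is touched, which is what makes the rank-k update roughly half the cost of a general matrix product.

#include <cstddef>

// Reference-only sketch: C <- C + alpha * A * A^T on the lower triangle.
// Matrix is any type with size1()/size2() and operator()(i,j).
template<class Matrix>
void syrk_reference(Matrix const& A, Matrix& C, double alpha){
	for(std::size_t i = 0; i != C.size1(); ++i)
		for(std::size_t j = 0; j <= i; ++j)           // lower triangle only
			for(std::size_t k = 0; k != A.size2(); ++k)
				C(i, j) += alpha * A(i, k) * A(j, k);
}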

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_GEMV_HPP 33 | #define REMORA_KERNELS_CLBLAST_GEMV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // v <- v + alpha * A * x 41 | template<class MatA, class VecX, class VecV> 42 | void gemv( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | vector_expression<VecX, opencl_tag> const& x, 45 | vector_expression<VecV, opencl_tag>& v, 46 | typename VecV::value_type const& alpha 47 | ) { 48 | REMORA_SIZE_CHECK(A().size1() == v().size()); 49 | REMORA_SIZE_CHECK(A().size2() == x().size()); 50 | 51 | static_assert(std::is_same<typename MatA::value_type, typename VecX::value_type>::value, "[gemv] Arguments do not have same element type"); 52 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[gemv] Arguments do not have same element type"); 53 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[gemv] A is not dense"); 54 | static_assert(std::is_same<typename VecX::evaluation_category::tag, dense_tag>::value, "[gemv] x is not dense"); 55 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[gemv] v does not have dense storage layout"); 56 | 57 | //pre-evaluate A and x into a temporary if necessary 58 | auto const& Aeval = eval_expression(A); 59 | auto const& xeval = eval_expression(x); 60 | 61 | 62 | using namespace clblast; 63 | 64 | //obtain geometry information 65 | auto layout = std::is_same<typename MatA::orientation, row_major>::value? Layout::kRowMajor: Layout::kColMajor; 66 | std::size_t m = A().size1(); 67 | std::size_t n = A().size2(); 68 | 69 | //obtain raw storage 70 | auto storageA = Aeval.raw_storage(); 71 | auto storagex = xeval.raw_storage(); 72 | auto storagev = v().raw_storage(); 73 | 74 | cl_event* event = nullptr;//todo: store events for out-of-order queues 75 | auto code = Gemv(layout, Transpose::kNo, 76 | m, n, alpha, 77 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 78 | storagex.buffer.get(), storagex.offset, storagex.stride, 79 | typename VecV::value_type(1), 80 | storagev.buffer.get(), storagev.offset, storagev.stride, 81 | &v().queue().get(), event 82 | ); 83 | assert(code == StatusCode::kSuccess); 84 | } 85 | 86 | }} 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/syrk.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2016 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_SYRK_HPP 33 | #define REMORA_KERNELS_CLBLAST_SYRK_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // C <- C + alpha * A * A^T 41 | template<bool Upper, class MatA, class MatC> 42 | void syrk( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | matrix_expression<MatC, opencl_tag>& C, 45 | typename MatC::value_type const& alpha 46 | ) { 47 | REMORA_SIZE_CHECK(A().size1() == C().size1()); 48 | REMORA_SIZE_CHECK(C().size1() == C().size2()); 49 | 50 | static_assert(std::is_same<typename MatA::value_type, typename MatC::value_type>::value, "[syrk] Arguments do not have same element type"); 51 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[syrk] A is not dense"); 52 | static_assert(std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value, "[syrk] C does not have dense storage layout"); 53 | 54 | //pre-evaluate A into a temporary if necessary 55 | auto const& Aeval = eval_expression(A); 56 | 57 | using namespace clblast; 58 | 59 | //obtain geometry information 60 | auto transA = std::is_same<typename MatA::orientation, typename MatC::orientation>::value? Transpose::kNo : Transpose::kYes; 61 | auto layout = std::is_same<typename MatC::orientation, row_major>::value? Layout::kRowMajor: Layout::kColMajor; 62 | auto triangular = Upper? Triangle::kUpper : Triangle::kLower; 63 | std::size_t n = C().size1(); 64 | std::size_t k = A().size2(); 65 | 66 | //obtain matrix storage 67 | auto storageA = Aeval.raw_storage(); 68 | auto storageC = C().raw_storage(); 69 | 70 | //call 71 | cl_event* event = nullptr;//todo: store events for out-of-order queues 72 | auto code = Syrk(layout, triangular, transA, 73 | n, k, alpha, 74 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 75 | typename MatC::value_type(1), 76 | storageC.buffer.get(), storageC.offset, storageC.leading_dimension, 77 | &C().queue().get(), event 78 | ); 79 | 80 | assert(code == StatusCode::kSuccess); 81 | } 82 | 83 | }} 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/trmm.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_TRMM_HPP 33 | #define REMORA_KERNELS_CLBLAST_TRMM_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // C <- AC with A being triangular 41 | template<bool Upper, bool Unit, class MatA, class MatC> 42 | void trmm( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | matrix_expression<MatC, opencl_tag>& C 45 | ){ 46 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 47 | REMORA_SIZE_CHECK(A().size2() == C().size1()); 48 | 49 | static_assert(std::is_same<typename MatA::value_type, typename MatC::value_type>::value, "[trmm] Arguments do not have same element type"); 50 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmm] A is not dense"); 51 | static_assert(std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value, "[trmm] C does not have dense storage layout"); 52 | 53 | //pre-evaluate A into a temporary if necessary 54 | auto const& Aeval = eval_expression(A); 55 | 56 | using namespace clblast; 57 | 58 | //obtain geometry information 59 | auto transA = std::is_same<typename MatA::orientation, typename MatC::orientation>::value? Transpose::kNo : Transpose::kYes; 60 | auto layout = std::is_same<typename MatC::orientation, row_major>::value? Layout::kRowMajor : Layout::kColMajor; 61 | auto diagonal = Unit? Diagonal::kUnit : Diagonal::kNonUnit; 62 | auto triangular = Upper? Triangle::kUpper : Triangle::kLower; 63 | if(transA == Transpose::kYes){//when we transpose the matrix, we also have to change its Triangular type 64 | triangular = Upper? Triangle::kLower : Triangle::kUpper; 65 | } 66 | std::size_t m = C().size1(); 67 | std::size_t n = C().size2(); 68 | 69 | //obtain raw storage 70 | auto storageA = Aeval.raw_storage(); 71 | auto storageC = C().raw_storage(); 72 | 73 | cl_event* event = nullptr;//todo: store events for out-of-order queues 74 | auto code = Trmm(layout, Side::kLeft, triangular, transA, diagonal, 75 | m, n, typename MatC::value_type(1), 76 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 77 | storageC.buffer.get(), storageC.offset, storageC.leading_dimension, 78 | &C().queue().get(), event 79 | ); 80 | assert(code == StatusCode::kSuccess); 81 | } 82 | 83 | }} 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/trmv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
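The triangle flip above deserves a second look: handing a matrix to the backend as transposed turns an upper-triangular argument into a lower-triangular one, and vice versa. The same decision in isolation:

// trans(A) of an upper-triangular A is lower-triangular, e.g.
//   A = {{1, 2},        trans(A) = {{1, 0},
//        {0, 3}}                    {2, 3}}
// so the Triangle parameter reported to the backend must be flipped:
inline bool report_upper(bool is_upper, bool reported_as_transposed){
	return reported_as_transposed ? !is_upper : is_upper;
}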

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_TRMV_HPP 33 | #define REMORA_KERNELS_CLBLAST_TRMV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // v <- Av with A being triangular 41 | template<bool Upper, bool Unit, class MatA, class VecV> 42 | void trmv( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | vector_expression<VecV, opencl_tag>& v 45 | ){ 46 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 47 | REMORA_SIZE_CHECK(A().size2() == v().size()); 48 | 49 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[trmv] Arguments do not have same element type"); 50 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmv] A is not dense"); 51 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[trmv] v does not have dense storage layout"); 52 | 53 | //pre-evaluate A into a temporary if necessary 54 | auto const& Aeval = eval_expression(A); 55 | 56 | using namespace clblast; 57 | 58 | //obtain geometry information 59 | auto layout = std::is_same<typename MatA::orientation, row_major>::value? Layout::kRowMajor : Layout::kColMajor; 60 | auto triangular = Upper? Triangle::kUpper : Triangle::kLower; 61 | auto diagonal = Unit? Diagonal::kUnit : Diagonal::kNonUnit; 62 | std::size_t n = A().size1(); 63 | 64 | //obtain raw storage 65 | auto storageA = Aeval.raw_storage(); 66 | auto storagev = v().raw_storage(); 67 | 68 | cl_event* event = nullptr;//todo: store events for out-of-order queues 69 | auto code = Trmv(layout, triangular, Transpose::kNo, diagonal, 70 | n, 71 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 72 | storagev.buffer.get(), storagev.offset, storagev.stride, 73 | &v().queue().get(), event 74 | ); 75 | assert(code == StatusCode::kSuccess); 76 | } 77 | 78 | }} 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/trsv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_TRSV_HPP 33 | #define REMORA_KERNELS_CLBLAST_TRSV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // solve Ax = b or xA = b with A being triangular 41 | template<class Triangular, class Side, class MatA, class VecB> 42 | void trsv( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | vector_expression<VecB, opencl_tag>& b 45 | ){ 46 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 47 | REMORA_SIZE_CHECK(A().size1() == b().size()); 48 | 49 | static_assert(std::is_same<typename MatA::value_type, typename VecB::value_type>::value, "[trsv] Arguments do not have same element type"); 50 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trsv] A is not dense"); 51 | static_assert(std::is_base_of<dense_tag, typename VecB::storage_type::storage_tag>::value, "[trsv] b does not have dense storage layout"); 52 | 53 | //pre-evaluate A into a temporary if necessary 54 | auto const& Aeval = eval_expression(A); 55 | 56 | using namespace clblast; 57 | 58 | //obtain geometry information 59 | auto layout = std::is_same<typename MatA::orientation, row_major>::value? Layout::kRowMajor : Layout::kColMajor; 60 | auto triangular = Triangular::is_upper? Triangle::kUpper : Triangle::kLower; 61 | auto diagonal = Triangular::is_unit? Diagonal::kUnit : Diagonal::kNonUnit; 62 | //transpose if side is right 63 | if(!Side::is_left){ 64 | layout = (layout == Layout::kRowMajor) ? Layout::kColMajor : Layout::kRowMajor; 65 | triangular = Triangular::is_upper? Triangle::kLower : Triangle::kUpper; 66 | } 67 | std::size_t n = A().size1(); 68 | 69 | //obtain raw storage 70 | auto storageA = Aeval.raw_storage(); 71 | auto storageb = b().raw_storage(); 72 | 73 | cl_event* event = nullptr;//todo: store events for out-of-order queues 74 | auto code = Trsv(layout, triangular, Transpose::kNo, diagonal, 75 | n, 76 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 77 | storageb.buffer.get(), storageb.offset, storageb.stride, 78 | &b().queue().get(), event 79 | ); 80 | assert(code == StatusCode::kSuccess); 81 | } 82 | 83 | }} 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/remora/kernels/conv2d.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief 2d convolution kernel 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *
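For reference, what the lower-triangular, non-unit case of the solver above computes, written as plain forward substitution (in-place on b); the upper-triangular case is the mirror-image backward substitution.

#include <cstddef>

// Reference-only sketch of solving A x = b for lower-triangular A, x stored in b.
template<class Matrix, class Vector>
void trsv_reference(Matrix const& A, Vector& b){
	std::size_t n = b.size();
	for(std::size_t i = 0; i != n; ++i){
		for(std::size_t j = 0; j != i; ++j)
			b(i) -= A(i, j) * b(j);  // subtract already-solved components
		b(i) /= A(i, i);             // non-unit diagonal
	}
}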

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_CONV2D_HPP 32 | #define REMORA_KERNELS_CONV2D_HPP 33 | 34 | #include "default/conv2d.hpp" 35 | 36 | #ifdef REMORA_USE_CLBLAST 37 | #include "clBlast/conv2d.hpp" 38 | #endif 39 | 40 | namespace remora{namespace kernels{ 41 | 42 | 43 | ///\brief Computes the convolution of a set of multi-channel images with a set of filters. 44 | /// 45 | /// Computes the result of applying k filters to a set of images, where filters and images are allowed 46 | /// to have multiple channels (some would call this a 3d or even 4d convolution, but we refrain from that term as 47 | /// in two of the dimensions the filter and image sizes must agree, i.e. it does not behave like convolving a volume). 48 | /// The base for the convolution is the upper left corner and there is no boundary handling, i.e. only pixels within the image area 49 | /// are computed. 50 | /// 51 | /// The images are stored block-row-wise, i.e. an image of size n x m with k channels is stored as 52 | /// an (n*k) x m matrix where n consecutive rows form one channel. Each image is then stored as one row of the input matrix. 53 | /// Filters are stored similarly, only that in their case we have the format (n1*k*l) x m1 for a 54 | /// set of l filters of size n1 x m1 with k channels each. The n1 rows form a channel, k*n1 rows form 55 | /// a filter. 56 | /// The output is stored in the same way as the images, just with size (l*(n-n1+1)) x (m-m1+1). 57 | /// The caller must ensure that enough memory is available. 
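The block-row-wise layout described above can be summarized by one index formula. The helper below is not part of the kernel, just an illustration of where pixel (r, c) of channel ch ends up inside one image row of the input matrix; it is consistent with the size check images().size2() == num_channels * image_width * image_height further down.

#include <cstddef>

// Illustration only: flat column index of pixel (r, c) in channel ch for an
// image_height x image_width image stored block-row-wise with its channels.
inline std::size_t image_flat_index(
	std::size_t ch, std::size_t r, std::size_t c,
	std::size_t image_height, std::size_t image_width
){
	return (ch * image_height + r) * image_width + c;
}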
58 | template 59 | void conv2d( 60 | matrix_expression const& images, 61 | vector_expression const& filter, 62 | matrix_expression& outputs, 63 | std::size_t num_channels, 64 | std::size_t num_filters, 65 | std::size_t image_height, 66 | std::size_t image_width, 67 | std::size_t filter_height, 68 | std::size_t filter_width, 69 | std::size_t padding_height = 0, 70 | std::size_t padding_width = 0 71 | ){ 72 | std::size_t output_rows_per_filter = (image_height - filter_height +1 + padding_height) * (image_width - filter_width +1 + padding_width); 73 | std::size_t filter_size = filter_width * filter_height * num_channels; 74 | 75 | REMORA_SIZE_CHECK(outputs().size1() == images().size1()); 76 | REMORA_SIZE_CHECK(outputs().size2() == num_filters * output_rows_per_filter); 77 | REMORA_SIZE_CHECK(images().size2() == num_channels * image_width * image_height); 78 | REMORA_SIZE_CHECK(filter().size() == num_filters * filter_size); 79 | 80 | bindings::conv2d( 81 | images, filter, outputs, num_channels, num_filters, 82 | image_height, image_width, filter_height, filter_width, 83 | padding_height, padding_width 84 | ); 85 | } 86 | 87 | }} 88 | #endif 89 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/aligned_alloc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014-2015 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_ALIGNED_ALLOC_HPP 10 | #define BOOST_ALIGN_ALIGNED_ALLOC_HPP 11 | 12 | #include 13 | 14 | #if defined(BOOST_HAS_UNISTD_H) 15 | #include 16 | #endif 17 | 18 | #if defined(__APPLE__) || defined(__APPLE_CC__) || defined(macintosh) 19 | #include 20 | #endif 21 | 22 | #if defined(BOOST_ALIGN_USE_ALLOCATE) 23 | #include "detail/aligned_alloc.hpp" 24 | #elif defined(_MSC_VER) && !defined(UNDER_CE) 25 | #include "detail/aligned_alloc_msvc.hpp" 26 | #elif defined(__MINGW32__) && (__MSVCRT_VERSION__ >= 0x0700) 27 | #include "detail/aligned_alloc_msvc.hpp" 28 | #elif MAC_OS_X_VERSION_MIN_REQUIRED >= 1090 29 | #include "detail/aligned_alloc_posix.hpp" 30 | #elif MAC_OS_X_VERSION_MIN_REQUIRED >= 1060 31 | #include "detail/aligned_alloc_macos.hpp" 32 | #elif defined(__ANDROID__) 33 | #include "detail/aligned_alloc_android.hpp" 34 | #elif defined(__SunOS_5_11) || defined(__SunOS_5_12) 35 | #include "detail/aligned_alloc_posix.hpp" 36 | #elif defined(sun) || defined(__sun) 37 | #include "detail/aligned_alloc_sunos.hpp" 38 | #elif (_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600) 39 | #include "detail/aligned_alloc_posix.hpp" 40 | #else 41 | #include "detail/aligned_alloc.hpp" 42 | #endif 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/assume_aligned.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 
10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_ASSUME_ALIGNED_HPP 13 | #define BOOST_ALIGN_ASSUME_ALIGNED_HPP 14 | 15 | #include <boost/config.hpp> 16 | 17 | #if defined(BOOST_MSVC) 18 | #include "detail/assume_aligned_msvc.hpp" 19 | #elif defined(BOOST_CLANG) && defined(__has_builtin) 20 | #include "detail/assume_aligned_clang.hpp" 21 | #elif BOOST_GCC_VERSION >= 40700 22 | #include "detail/assume_aligned_gcc.hpp" 23 | #elif defined(__INTEL_COMPILER) 24 | #include "detail/assume_aligned_intel.hpp" 25 | #else 26 | #include "detail/assume_aligned.hpp" 27 | #endif 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014-2015 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_HPP 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | namespace boost { 18 | namespace alignment { 19 | 20 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 21 | BOOST_NOEXCEPT 22 | { 23 | BOOST_ASSERT(detail::is_alignment(alignment)); 24 | enum { 25 | min_align = std::alignment_of<void*>::value 26 | }; 27 | if (alignment < min_align) { 28 | alignment = min_align; 29 | } 30 | std::size_t n = size + alignment - min_align; 31 | void* r = 0; 32 | void* p = std::malloc(sizeof(void*) + n); 33 | if (p) { 34 | r = static_cast<char*>(p) + sizeof p; 35 | (void)std::align(alignment, size, r, n); 36 | *(static_cast<void**>(r) - 1) = p; 37 | } 38 | return r; 39 | } 40 | 41 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 42 | { 43 | if (ptr) { 44 | std::free(*(static_cast<void**>(ptr) - 1)); 45 | } 46 | } 47 | 48 | } /* .alignment */ 49 | } /* .boost */ 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_android.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_ANDROID_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_ANDROID_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include <boost/assert.hpp> 14 | #include <malloc.h> 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | return ::memalign(alignment, size); 24 | } 25 | 26 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 27 | { 28 | ::free(ptr); 29 | } 30 | 31 | } /* .alignment */ 32 | } /* .boost */ 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_macos.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 
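The generic allocator above over-allocates, stores the original malloc pointer in the word directly in front of the aligned block, and aligned_free reads it back to release the memory. Usage sketch:

#include <cassert>
#include <cstdint>

void aligned_alloc_sketch(){
	// alignment must be a power of two; 64 bytes covers a typical cache line
	void* p = boost::alignment::aligned_alloc(64, 1024);
	assert(p == nullptr || (reinterpret_cast<std::uintptr_t>(p) & 63) == 0);
	boost::alignment::aligned_free(p); // frees via the stored back-pointer
}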
7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MACOS_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MACOS_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | if (size == 0) { 24 | return 0; 25 | } 26 | if (alignment < sizeof(void*)) { 27 | alignment = sizeof(void*); 28 | } 29 | void* p; 30 | if (::posix_memalign(&p, alignment, size) != 0) { 31 | p = 0; 32 | } 33 | return p; 34 | } 35 | 36 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 37 | { 38 | ::free(ptr); 39 | } 40 | 41 | } /* .alignment */ 42 | } /* .boost */ 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_msvc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MSVC_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MSVC_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | return ::_aligned_malloc(size, alignment); 24 | } 25 | 26 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 27 | { 28 | ::_aligned_free(ptr); 29 | } 30 | 31 | } /* .alignment */ 32 | } /* .boost */ 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_posix.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_POSIX_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_POSIX_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | if (alignment < sizeof(void*)) { 24 | alignment = sizeof(void*); 25 | } 26 | void* p; 27 | if (::posix_memalign(&p, alignment, size) != 0) { 28 | p = 0; 29 | } 30 | return p; 31 | } 32 | 33 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 34 | { 35 | ::free(ptr); 36 | } 37 | 38 | } /* .alignment */ 39 | } /* .boost */ 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_sunos.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 
7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_SUNOS_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_SUNOS_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | return ::memalign(alignment, size); 24 | } 25 | 26 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 27 | { 28 | ::free(ptr); 29 | } 30 | 31 | } /* .alignment */ 32 | } /* .boost */ 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_HPP 14 | 15 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_clang.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_CLANG_HPP 10 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_CLANG_HPP 11 | 12 | #if __has_builtin(__builtin_assume_aligned) 13 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 14 | (p) = (__typeof__(p))(__builtin_assume_aligned((p), (n))) 15 | #else 16 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) 17 | #endif 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_gcc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_GCC_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_GCC_HPP 14 | 15 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 16 | (p) = (__typeof__(p))(__builtin_assume_aligned((p), (n))) 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_intel.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 
10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_INTEL_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_INTEL_HPP 14 | 15 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 16 | __assume_aligned((p), (n)) 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_msvc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | <glen.fernandes@gmail.com> 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_MSVC_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_MSVC_HPP 14 | 15 | #include <cstddef> 16 | 17 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 18 | __assume(((std::size_t)(p) & ((n) - 1)) == 0) 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/is_alignment.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_IS_ALIGNMENT_HPP 10 | #define BOOST_ALIGN_DETAIL_IS_ALIGNMENT_HPP 11 | 12 | #include <boost/config.hpp> 13 | #include <cstddef> 14 | 15 | namespace boost { 16 | namespace alignment { 17 | namespace detail { 18 | 19 | BOOST_CONSTEXPR inline bool is_alignment(std::size_t value) 20 | BOOST_NOEXCEPT 21 | { 22 | return (value > 0) && ((value & (value - 1)) == 0); 23 | } 24 | 25 | } /* .detail */ 26 | } /* .alignment */ 27 | } /* .boost */ 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/is_alignment_constant.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_IS_ALIGNMENT_CONSTANT_HPP 10 | #define BOOST_ALIGN_DETAIL_IS_ALIGNMENT_CONSTANT_HPP 11 | 12 | #include <cstddef> 13 | #include <type_traits> 14 | namespace boost { 15 | namespace alignment { 16 | namespace detail { 17 | 18 | template<std::size_t N> 19 | struct is_alignment_constant 20 | : std::integral_constant<bool, (N > 0) && ((N & (N - 1)) == 0)> { }; 21 | 22 | } /* .detail */ 23 | } /* .alignment */ 24 | } /* .boost */ 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/max_objects.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 
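is_alignment above is the classic power-of-two test: a power of two has exactly one set bit, so value & (value - 1) clears it to zero. Two sample evaluations, checked at compile time through the is_alignment_constant trait:

// 64 = 0b1000000, 63 = 0b0111111, 64 & 63 == 0       -> alignment is valid
// 24 = 0b0011000, 23 = 0b0010111, 24 & 23 == 16 != 0 -> not a power of two
static_assert(boost::alignment::detail::is_alignment_constant<64>::value,
	"64 is a power of two");
static_assert(!boost::alignment::detail::is_alignment_constant<24>::value,
	"24 is not a power of two");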
7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_MAX_OBJECTS_HPP 10 | #define BOOST_ALIGN_DETAIL_MAX_OBJECTS_HPP 11 | 12 | #include <cstddef> 13 | #include <type_traits> 14 | 15 | namespace boost { 16 | namespace alignment { 17 | namespace detail { 18 | 19 | template<class T> 20 | struct max_objects 21 | : std::integral_constant<std::size_t, 22 | ~static_cast<std::size_t>(0) / sizeof(T)> { }; 23 | 24 | } /* .detail */ 25 | } /* .alignment */ 26 | } /* .boost */ 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/max_size.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014-2015 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_MAX_SIZE_HPP 10 | #define BOOST_ALIGN_DETAIL_MAX_SIZE_HPP 11 | 12 | #include <cstddef> 13 | #include <type_traits> 14 | 15 | namespace boost { 16 | namespace alignment { 17 | namespace detail { 18 | 19 | template<std::size_t A, std::size_t B> 20 | struct max_size 21 | : std::integral_constant<std::size_t, (A > B) ? A : B> { }; 22 | 23 | } /* .detail */ 24 | } /* .alignment */ 25 | } /* .boost */ 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /include/remora/kernels/default/dot.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief - 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_DOT_HPP 31 | #define REMORA_KERNELS_DEFAULT_DOT_HPP 32 | 33 | #include "../../expression_types.hpp"//vector_expression 34 | #include "../../detail/traits.hpp"//storage tags 35 | 36 | namespace remora{namespace bindings{ 37 | 38 | // Dense case 39 | template<class E1, class E2, class result_type> 40 | void dot( 41 | vector_expression<E1, cpu_tag> const& v1, 42 | vector_expression<E2, cpu_tag> const& v2, 43 | result_type& result, 44 | dense_tag, 45 | dense_tag 46 | ) { 47 | result = result_type(); 48 | auto v1_end = v1().end(); 49 | auto v2_pos = v2().begin(); 50 | for(auto v1_pos = v1().begin(); v1_pos != v1_end; ++v1_pos, ++v2_pos){ 51 | result += (*v1_pos) * (*v2_pos); 52 | } 53 | } 54 | // Sparse case 55 | template<class E1, class E2, class result_type> 56 | void dot( 57 | vector_expression<E1, cpu_tag> const& v1, 58 | vector_expression<E2, cpu_tag> const& v2, 59 | result_type& result, 60 | sparse_tag, 61 | sparse_tag 62 | ) { 63 | typename E1::const_iterator iter1=v1().begin(); 64 | typename E1::const_iterator end1=v1().end(); 65 | typename E2::const_iterator iter2=v2().begin(); 66 | typename E2::const_iterator end2=v2().end(); 67 | result = result_type(); 68 | //be aware of empty vectors! 69 | while(iter1 != end1 && iter2 != end2) 70 | { 71 | std::size_t index1=iter1.index(); 72 | std::size_t index2=iter2.index(); 73 | if(index1==index2){ 74 | result += *iter1 * *iter2; 75 | ++iter1; 76 | ++iter2; 77 | } 78 | else if(index1 > index2){ 79 | ++iter2; 80 | } 81 | else { 82 | ++iter1; 83 | } 84 | } 85 | } 86 | 87 | // Dense-Sparse case 88 | template<class E1, class E2, class result_type> 89 | void dot( 90 | vector_expression<E1, cpu_tag> const& v1, 91 | vector_expression<E2, cpu_tag> const& v2, 92 | result_type& result, 93 | dense_tag, 94 | sparse_tag 95 | ) { 96 | typename E2::const_iterator iter2=v2().begin(); 97 | typename E2::const_iterator end2=v2().end(); 98 | result = result_type(); 99 | auto v1_elem = v1().elements(); 100 | for(;iter2 != end2;++iter2){ 101 | result += v1_elem(iter2.index()) * (*iter2); 102 | } 103 | } 104 | //Sparse-Dense case is reduced to Dense-Sparse using symmetry. 105 | template<class E1, class E2, class result_type> 106 | void dot( 107 | vector_expression<E1, cpu_tag> const& v1, 108 | vector_expression<E2, cpu_tag> const& v2, 109 | result_type& result, 110 | sparse_tag t1, 111 | dense_tag t2 112 | ) { 113 | //use commutativity! 114 | dot(v2,v1,result,t2,t1); 115 | } 116 | 117 | }} 118 | #endif -------------------------------------------------------------------------------- /include/remora/kernels/default/fold_rows.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Folds the rows of a row-major or column major matrix. 5 | * 6 | * \author O. Krause 7 | * \date 2018 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *
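The sparse-sparse case above is a classic two-pointer merge over the two sorted index sequences: only indices present in both vectors contribute. A standalone sketch of the same idea on (index, value) pairs:

#include <cstddef>
#include <utility>
#include <vector>

double sparse_dot(
	std::vector<std::pair<std::size_t, double> > const& a,
	std::vector<std::pair<std::size_t, double> > const& b
){
	double result = 0;
	std::size_t i = 0, j = 0;
	while(i != a.size() && j != b.size()){        // be aware of empty vectors!
		if(a[i].first == b[j].first)
			result += a[i++].second * b[j++].second; // matching index: multiply
		else if(a[i].first > b[j].first)
			++j;                                  // advance the smaller index
		else
			++i;
	}
	return result;
}
// sparse_dot({{1,2.0},{4,1.0}}, {{4,3.0},{7,5.0}}) == 3.0: only index 4 matches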

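 * \par
 * Both kernels below compute, per row i, v(i) += g(f(...f(f(A(i,0), A(i,1)), A(i,2))..., A(i,n-1))).
 * A rough scalar sketch of that contract (illustrative only; f and g are
 * hypothetical callables):
 * \code
 * for(std::size_t i = 0; i != rows; ++i){
 *     double s = A(i, 0);
 *     for(std::size_t j = 1; j != cols; ++j)
 *         s = f(s, A(i, j));
 *     v(i) += g(s);
 * }
 * \endcode
 * The column-major version processes the rows in blocks of 16 so that each
 * pass over a column updates a small, cache-resident accumulator array.
 *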
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_DEFAULT_FOLD_ROWS_HPP 32 | #define REMORA_KERNELS_DEFAULT_FOLD_ROWS_HPP 33 | 34 | #include "../../expression_types.hpp"//for vector/matrix_expression 35 | #include "../../detail/traits.hpp" 36 | 37 | namespace remora{namespace bindings{ 38 | 39 | template<class M, class V, class F, class G> 40 | void fold_rows( 41 | matrix_expression<M, cpu_tag> const& A, 42 | vector_expression<V, cpu_tag>& v, 43 | F f, 44 | G g, 45 | row_major 46 | ){ 47 | for(std::size_t i = 0; i != v().size(); ++i){ 48 | auto end = A().major_end(i); 49 | auto pos = A().major_begin(i); 50 | typename V::value_type s = *pos; 51 | ++pos; 52 | for(; pos != end; ++pos){ 53 | s = f(s,*pos); 54 | } 55 | v()(i) += g(s); 56 | } 57 | } 58 | 59 | template<class M, class V, class F, class G> 60 | void fold_rows( 61 | matrix_expression<M, cpu_tag> const& A, 62 | vector_expression<V, cpu_tag>& v, 63 | F f, 64 | G g, 65 | column_major 66 | ){ 67 | std::size_t n = v().size(); 68 | const std::size_t BLOCK_SIZE = 16; 69 | typename V::value_type storage[BLOCK_SIZE]; 70 | std::size_t numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; 71 | auto A_elem = A().elements(); 72 | for(std::size_t b = 0; b != numBlocks; ++b){ 73 | std::size_t start = b * BLOCK_SIZE; 74 | std::size_t cur_size = std::min(BLOCK_SIZE, n - start); 75 | for(std::size_t i = 0; i != cur_size; ++i){ 76 | storage[i] = A_elem(start + i, 0); 77 | } 78 | for(std::size_t j = 1; j != A().size2(); ++j){ 79 | for(std::size_t i = 0; i != cur_size; ++i){ 80 | storage[i] = f(storage[i], A_elem(start + i, j)); 81 | } 82 | } 83 | for(std::size_t i = 0; i != cur_size; ++i){ 84 | v()(start + i) += g(storage[i]); 85 | } 86 | } 87 | } 88 | 89 | //dispatcher for triangular matrix 90 | template<class M, class V, class F, class G, class Orientation, class Triangular> 91 | void fold_rows( 92 | matrix_expression<M, cpu_tag> const& A, 93 | vector_expression<V, cpu_tag>& v, 94 | F f, 95 | G g, 96 | triangular<Orientation, Triangular> 97 | ){ 98 | fold_rows(A,v, f, g, Orientation()); 99 | } 100 | 101 | }} 102 | 103 | #endif 104 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/gemv.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Default matrix-vector multiplication kernels 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

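 * \par
 * The row-major path computes one inner product per row, the column-major
 * path accumulates scaled columns. A minimal usage sketch, assuming
 * remora::matrix/remora::vector are the dense cpu containers with
 * (size, init) style constructors:
 * \code
 * remora::matrix<double> A(2, 2, 1.0);
 * remora::vector<double> x(2, 1.0), r(2, 0.0);
 * bindings::gemv_impl(A, x, r, 2.0, row_major()); // r += 2*A*x == (4,4)
 * \endcode
 *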
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_GEMV_HPP 31 | #define REMORA_KERNELS_DEFAULT_GEMV_HPP 32 | 33 | #include "../../expression_types.hpp" //matrix/vector_expression 34 | #include "../../proxy_expressions.hpp" //matrix row, transpose 35 | #include "../../detail/traits.hpp" //matrix orientations 36 | #include "../default/dot.hpp" //inner product 37 | #include "../vector_assign.hpp" //assignment of vectors 38 | #include <type_traits> //std::false_type marker for unoptimized 39 | 40 | namespace remora{namespace bindings { 41 | 42 | //row major can be further reduced to inner_prod() 43 | template<class ResultV, class MatA, class V> 44 | void gemv_impl( 45 | matrix_expression<MatA, cpu_tag> const& A, 46 | vector_expression<V, cpu_tag> const& x, 47 | vector_expression<ResultV, cpu_tag>& result, 48 | typename ResultV::value_type alpha, 49 | row_major 50 | ) { 51 | typedef typename ResultV::value_type value_type; 52 | value_type value; 53 | for(std::size_t i = 0; i != A().size1();++i){ 54 | bindings::dot(row(A,i), x, value, typename MatA::evaluation_category::tag(), typename V::evaluation_category::tag()); 55 | if(value != value_type())//handling of sparse results. 56 | result()(i) += alpha * value; 57 | } 58 | } 59 | 60 | //column major is implemented by computing a linear combination of matrix-rows 61 | template<class ResultV, class MatA, class V> 62 | void gemv_impl( 63 | matrix_expression<MatA, cpu_tag> const& A, 64 | vector_expression<V, cpu_tag> const& x, 65 | vector_expression<ResultV, cpu_tag>& result, 66 | typename ResultV::value_type alpha, 67 | column_major 68 | ) { 69 | typedef typename V::const_iterator iterator; 70 | typedef typename ResultV::value_type value_type; 71 | typedef typename device_traits<cpu_tag>::template multiply_and_add<value_type> MultAdd; 72 | iterator end = x().end(); 73 | for(iterator it = x().begin(); it != end; ++it) { 74 | //FIXME: for sparse result vectors, this might hurt. 75 | kernels::assign(result, column(A,it.index()), MultAdd(alpha * (*it))); 76 | } 77 | } 78 | 79 | //unknown orientation is dispatched to row_major 80 | template<class ResultV, class MatA, class V> 81 | void gemv_impl( 82 | matrix_expression<MatA, cpu_tag> const& A, 83 | vector_expression<V, cpu_tag> const& x, 84 | vector_expression<ResultV, cpu_tag>& result, 85 | typename ResultV::value_type alpha, 86 | unknown_orientation 87 | ) { 88 | gemv_impl(A,x,result,alpha,row_major()); 89 | } 90 | 91 | // result += alpha * A * x 92 | template<class ResultV, class MatA, class V> 93 | void gemv( 94 | matrix_expression<MatA, cpu_tag> const& A, 95 | vector_expression<V, cpu_tag> const& x, 96 | vector_expression<ResultV, cpu_tag>& result, 97 | typename ResultV::value_type alpha, 98 | std::false_type 99 | ) { 100 | typedef typename MatA::orientation orientation; 101 | 102 | gemv_impl(A, x, result, alpha, orientation()); 103 | } 104 | 105 | }} 106 | #endif 107 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/random.hpp: -------------------------------------------------------------------------------- 1 | /*!
2 | * 3 | * 4 | * \brief Generation of random variates on cpu 5 | * 6 | * \author O. Krause 7 | * \date 2017 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

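 * \par
 * Usage sketch (assuming the dense cpu vector type):
 * \code
 * std::mt19937 rng(42);
 * remora::vector<double> v(10);
 * bindings::generate_uniform(v, rng, 0.0, 1.0); // iid draws from U[0,1)
 * \endcode
 *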
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_RANDOM_HPP 31 | #define REMORA_KERNELS_DEFAULT_RANDOM_HPP 32 | 33 | #include <random> 34 | #include <cmath> 35 | 36 | namespace remora{ namespace bindings{ 37 | template<class V, class Rng> 38 | void generate_normal( 39 | vector_expression<V, cpu_tag>& v, 40 | Rng& rng, 41 | typename V::value_type mean, 42 | typename V::value_type variance 43 | ) { 44 | std::normal_distribution<typename V::value_type> dist(mean,std::sqrt(variance)); 45 | for(auto& val: v()) 46 | val = dist(rng); 47 | } 48 | 49 | template<class M, class Rng> 50 | void generate_normal( 51 | matrix_expression<M, cpu_tag>& m, 52 | Rng& rng, 53 | typename M::value_type mean, 54 | typename M::value_type variance 55 | ) { 56 | std::normal_distribution<typename M::value_type> dist(mean,std::sqrt(variance)); 57 | std::size_t size = M::orientation::index_M(m().size1(),m().size2()); 58 | for(std::size_t i = 0; i != size; ++i){ 59 | auto end = m().major_end(i); 60 | for(auto pos = m().major_begin(i);pos != end; ++pos){ 61 | *pos = dist(rng); 62 | } 63 | } 64 | } 65 | 66 | template<class V, class Rng> 67 | void generate_uniform( 68 | vector_expression<V, cpu_tag>& v, 69 | Rng& rng, 70 | typename V::value_type low, 71 | typename V::value_type high 72 | ) { 73 | std::uniform_real_distribution<typename V::value_type> dist(low,high); 74 | for(auto& val: v()) 75 | val = dist(rng); 76 | } 77 | 78 | template<class M, class Rng> 79 | void generate_uniform( 80 | matrix_expression<M, cpu_tag>& m, 81 | Rng& rng, 82 | typename M::value_type low, 83 | typename M::value_type high 84 | ) { 85 | std::uniform_real_distribution<typename M::value_type> dist(low,high); 86 | std::size_t size = M::orientation::index_M(m().size1(),m().size2()); 87 | for(std::size_t i = 0; i != size; ++i){ 88 | auto end = m().major_end(i); 89 | for(auto pos = m().major_begin(i);pos != end; ++pos){ 90 | *pos = dist(rng); 91 | } 92 | } 93 | } 94 | 95 | template<class V, class Rng> 96 | void generate_discrete( 97 | vector_expression<V, cpu_tag>& v, 98 | Rng& rng, 99 | int low, 100 | int high 101 | ) { 102 | std::uniform_int_distribution<int> dist(low,high); 103 | for(auto& val: v()) 104 | val = dist(rng); 105 | } 106 | 107 | template<class M, class Rng> 108 | void generate_discrete( 109 | matrix_expression<M, cpu_tag>& m, 110 | Rng& rng, 111 | int low, 112 | int high 113 | ) { 114 | std::uniform_int_distribution<int> dist(low,high); 115 | std::size_t size = M::orientation::index_M(m().size1(),m().size2()); 116 | for(std::size_t i = 0; i != size; ++i){ 117 | auto end = m().major_end(i); 118 | for(auto pos = m().major_begin(i);pos != end; ++pos){ 119 | *pos = dist(rng); 120 | } 121 | } 122 | } 123 | 124 | }} 125 | #endif
-------------------------------------------------------------------------------- /include/remora/kernels/default/simd.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Some macros and basic definitions for the use of SIMD block storage 5 | * 6 | * \author O.
Krause 7 | * \date 2016 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

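 * \par
 * block<T> describes the SIMD pack used for element type T; a sketch of what
 * it exposes (only members defined below are used):
 * \code
 * typedef remora::bindings::detail::block<float> b;
 * // b::vector_elements: floats per pack (1 without REMORA_USE_SIMD)
 * // b::type:            the pack type itself (T or a compiler vector of T)
 * // b::align:           required storage alignment in bytes (64)
 * static_assert(b::max_vector_elements * sizeof(float) == REMORA_VECTOR_LENGTH, "");
 * \endcode
 *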
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_DEFAULT_SIMD_HPP 32 | #define REMORA_KERNELS_DEFAULT_SIMD_HPP 33 | 34 | #include <boost/version.hpp> 35 | #include <boost/predef.h> 36 | 37 | //older boost versions have some issues 38 | #if (BOOST_VERSION >= 106300) 39 | #include <boost/align/assume_aligned.hpp> 40 | #include <boost/align/aligned_allocator.hpp> 41 | #else//subset of boost/align 1.63 42 | #include "boost_align/assume_aligned.hpp" 43 | #include "boost_align/aligned_allocator.hpp" 44 | #endif 45 | 46 | 47 | 48 | 49 | #ifdef __AVX__ 50 | #define REMORA_VECTOR_LENGTH 32 51 | #else 52 | #define REMORA_VECTOR_LENGTH 16 53 | #endif 54 | 55 | namespace remora{namespace bindings{namespace detail{ 56 | template<class T> 57 | struct block{ 58 | static const std::size_t max_vector_elements = REMORA_VECTOR_LENGTH/sizeof(T); 59 | #ifdef REMORA_USE_SIMD 60 | static const std::size_t vector_elements = REMORA_VECTOR_LENGTH/sizeof(T); 61 | #ifdef BOOST_COMP_CLANG_DETECTION 62 | typedef T type __attribute__((ext_vector_type (vector_elements))); 63 | #else 64 | typedef T type __attribute__((vector_size (REMORA_VECTOR_LENGTH))); 65 | #endif 66 | #else 67 | static const std::size_t vector_elements = 1; 68 | typedef T type; 69 | #endif 70 | static const std::size_t align = 64; 71 | }; 72 | }}} 73 | #endif 74 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/vector_fold.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for folding vector expressions 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

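 * \par
 * Usage sketch: F is a functor class with a result_type typedef, e.g. the
 * library's add functor (functor name assumed from device_traits):
 * \code
 * typedef remora::device_traits<remora::cpu_tag>::add<double> F;
 * remora::vector<double> v(5, 2.0);
 * double sum = 0.0;
 * remora::bindings::vector_fold<F>(v, sum, remora::dense_tag()); // sum == 10
 * \endcode
 *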
11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_DEFAULT_VECTOR_FOLD_HPP 29 | #define REMORA_KERNELS_DEFAULT_VECTOR_FOLD_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | 33 | namespace remora{namespace bindings{ 34 | template<class F, class V> 35 | void vector_fold(vector_expression<V, cpu_tag> const& v, typename F::result_type& value, dense_tag) { 36 | F f; 37 | auto end = v().end(); 38 | for(auto pos = v().begin(); pos != end; ++pos){ 39 | value = f(value,*pos); 40 | } 41 | } 42 | 43 | template<class F, class V> 44 | void vector_fold(vector_expression<V, cpu_tag> const& v, typename F::result_type& value, sparse_tag) { 45 | F f; 46 | std::size_t nnz = 0; 47 | auto iter = v().begin(); 48 | auto end = v().end(); 49 | for(;iter != end;++iter,++nnz){ 50 | value = f(value,*iter); 51 | } 52 | //apply final operator f(0,v) 53 | if(nnz != v().size()) 54 | value = f(value, 0); 55 | } 56 | 57 | }} 58 | #endif 59 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/vector_max.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Default kernel finding the index of the maximal element of a vector 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_VECTOR_MAX_HPP 31 | #define REMORA_KERNELS_DEFAULT_VECTOR_MAX_HPP 32 | 33 | #include "../../detail/traits.hpp" 34 | #include <algorithm> 35 | namespace remora{namespace bindings{ 36 | 37 | template<class E, class Tag> 38 | std::size_t vector_max(vector_expression<E, cpu_tag> const& v,Tag) { 39 | return std::max_element(v().begin(),v().end()).index(); 40 | } 41 | 42 | 43 | }} 44 | #endif
-------------------------------------------------------------------------------- /include/remora/kernels/fold_rows.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Folds the rows of a row-major or column major matrix. 5 | * 6 | * \author O. Krause 7 | * \date 2018 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

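 * \par
 * Usage sketch for the front-end kernel below: with f = add and g = identity
 * (functor names assumed from device_traits) it accumulates row sums:
 * \code
 * remora::matrix<double> A(4, 3, 1.0);
 * remora::vector<double> b(4, 0.0);
 * remora::kernels::fold_rows(A, b,
 *     remora::device_traits<remora::cpu_tag>::add<double>(),
 *     remora::device_traits<remora::cpu_tag>::identity<double>()
 * ); // every b(i) is now 3
 * \endcode
 *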
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_FOLD_ROWS_HPP 32 | #define REMORA_KERNELS_FOLD_ROWS_HPP 33 | 34 | #include "default/fold_rows.hpp" 35 | #ifdef REMORA_USE_OPENCL 36 | #include "opencl/fold_rows.hpp" 37 | #endif 38 | #if defined(__HCC__) || defined(__NVCC__) 39 | #include "hip/fold_rows.hpp" 40 | #endif 41 | 42 | 43 | namespace remora {namespace bindings{ 44 | template<class M, class V, class F, class G, class Device> 45 | void fold_rows( 46 | matrix_expression<M, Device> const & A, 47 | vector_expression<V, Device>& b, 48 | F f, 49 | G g, 50 | unknown_orientation 51 | ){ 52 | fold_rows(A, b, f, g, row_major()); 53 | } 54 | } 55 | 56 | namespace kernels{ 57 | ///\brief Folds each row of a matrix with a function f and transforms the result with another function g 58 | /// 59 | /// output v_i is computed as v_i += g( f(A_i0, f(A_i1, ... f(A_i,n-2, A_i,n-1)...))). That is, the result is the same 60 | /// as folding each row separately as if it were a collection of numbers. 61 | template<class M, class V, class F, class G, class Device> 62 | void fold_rows( 63 | matrix_expression<M, Device> const & A, 64 | vector_expression<V, Device>& b, 65 | F f, 66 | G g 67 | ){ 68 | REMORA_SIZE_CHECK(A().size1() == b().size()); 69 | if(A().size1() == 0) return; //undefined 70 | bindings::fold_rows( 71 | A, b, f, g, typename M::orientation() 72 | ); 73 | } 74 | 75 | }} 76 | 77 | #endif 78 |
-------------------------------------------------------------------------------- /include/remora/kernels/gemv.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief matrix-vector multiplication kernel 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

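 * \par
 * Usage sketch (dense cpu containers with (size, init) constructors assumed):
 * \code
 * remora::matrix<double> A(3, 4, 1.0);
 * remora::vector<double> x(4, 1.0), r(3, 0.0);
 * remora::kernels::gemv(A, x, r, 2.0); // r += 2*A*x, so every r(i) == 8
 * \endcode
 *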
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_GEMV_HPP 31 | #define REMORA_KERNELS_GEMV_HPP 32 | 33 | #include "default/gemv.hpp" 34 | 35 | #ifdef REMORA_USE_CBLAS 36 | #include "cblas/gemv.hpp" 37 | #else 38 | // if no bindings are included, we have to provide the default has_optimized_gemv 39 | // otherwise the binding will take care of this 40 | namespace remora{ namespace bindings{ 41 | template<class M, class E1, class E2> 42 | struct has_optimized_gemv 43 | : public std::false_type{}; 44 | }} 45 | #endif 46 | 47 | #include <type_traits> 48 | 49 | namespace remora{namespace kernels{ 50 | 51 | ///\brief Well known GEneral Matrix-Vector product kernel m += alpha*e1*e2. 52 | /// 53 | /// If bindings are included and the matrix/vector combination allows for a specific binding 54 | /// to be applied, the binding is called automatically from {binding}/gemv.hpp, 55 | /// otherwise default/gemv.hpp is used, which is fully implemented for all dense/sparse combinations. 56 | /// If a combination is optimized, bindings::has_optimized_gemv<M, E1, E2>::type evaluates to std::true_type. 57 | /// The kernels themselves are implemented in bindings::gemv. 58 | template<class M, class E1, class E2> 59 | void gemv( 60 | matrix_expression<E1, cpu_tag> const& e1, 61 | vector_expression<E2, cpu_tag> const& e2, 62 | vector_expression<M, cpu_tag>& m, 63 | typename M::value_type alpha 64 | ) { 65 | assert(m().size() == e1().size1()); 66 | assert(e1().size2() == e2().size()); 67 | 68 | bindings::gemv( 69 | e1, e2, m,alpha, 70 | typename bindings::has_optimized_gemv<M, E1, E2>::type() 71 | ); 72 | } 73 | 74 | }} 75 | 76 | #ifdef REMORA_USE_CLBLAST 77 | #include "clBlast/gemv.hpp" 78 | #elif defined REMORA_USE_OPENCL 79 | #include "opencl/gemv.hpp" 80 | #endif 81 | #if defined(__HCC__) || defined(__NVCC__) 82 | #include "hip/gemv.hpp" 83 | #endif 84 | #endif
-------------------------------------------------------------------------------- /include/remora/kernels/getrf.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Dispatches the GETRF algorithm 5 | * 6 | * \author O. Krause 7 | * \date 2016 8 | * 9 | * 10 | * \par Copyright 1995-2014 Shark Development Team 11 | * 12 | *

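 * \par
 * Usage sketch (the pivot vector element type is assumed to be int):
 * \code
 * std::size_t n = 4;
 * remora::matrix<double> A(n, n);   // fill with an invertible matrix first
 * remora::vector<int> P(n);
 * remora::kernels::getrf(A, P);     // in place: A now holds L (unit diagonal) and U
 * \endcode
 *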
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_GETRF_HPP 32 | #define REMORA_KERNELS_GETRF_HPP 33 | 34 | 35 | #include "default/getrf.hpp" 36 | 37 | namespace remora{namespace kernels { 38 | 39 | ///\brief Implements the GEneral TRiangular matrix Factorisation GETRF. 40 | /// 41 | /// It is better known as the LU decomposition with partial row-pivoting for dense matrices. 42 | /// The algorithm works in place and does not require additional memory. 43 | /// 44 | /// The algorithm computes 45 | /// A = P * L * U 46 | /// 47 | /// where L is lower unit-triangular and U upper triangular. 48 | /// 49 | /// The unit diagonal part of L is not stored explicitly. P is a permutation matrix 50 | /// where P(i) stores the index of the row that row i is swapped with. 51 | template<class MatA, class VecP> 52 | void getrf( 53 | matrix_expression<MatA, cpu_tag>& A, 54 | vector_expression<VecP, cpu_tag>& P 55 | ) { 56 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 57 | REMORA_SIZE_CHECK(P().size() == A().size1()); 58 | return bindings::getrf(A,P); 59 | } 60 | 61 | }} 62 | #endif 63 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/fold_rows.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Folds the rows of a row-major or column major matrix. 5 | * 6 | * \author O. Krause 7 | * \date 2018 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_HIP_FOLD_ROWS_HPP 32 | #define REMORA_KERNELS_HIP_FOLD_ROWS_HPP 33 | 34 | #include "../../expression_types.hpp" 35 | #include "../../detail/traits.hpp" 36 | 37 | namespace remora{ 38 | 39 | namespace hip{ 40 | template<class MatA, class VecV, class F, class G> 41 | __global__ void fold_rows_kernel(hipLaunchParm lp, MatA A, size_t size1, size_t size2, VecV v, F f, G g){ 42 | typedef typename std::remove_reference<decltype(A(0,0))>::type value_type; 43 | __shared__ value_type folds[64]; 44 | size_t rowid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; 45 | size_t colid = hipThreadIdx_y; 46 | value_type& entry = folds[hipThreadIdx_y]; 47 | if(colid < size2){ 48 | entry = A(rowid,colid); 49 | colid += hipBlockDim_y; 50 | for(;colid < size2; colid += hipBlockDim_y){ 51 | entry = f(entry, A(rowid,colid)); 52 | } 53 | } 54 | __threadfence_block(); 55 | if(hipThreadIdx_y == 0){ 56 | value_type acc = folds[0]; 57 | for(size_t i = 1 ; i < min(size_t(hipBlockDim_y), size2); ++i){ 58 | acc = f(acc, folds[i]); 59 | } 60 | v(rowid) += g(acc); 61 | } 62 | } 63 | } 64 | 65 | namespace bindings{ 66 | 67 | template<class MatA, class VecV, class F, class G, class Orientation> 68 | void fold_rows( 69 | matrix_expression<MatA, hip_tag> const& A, 70 | vector_expression<VecV, hip_tag>& v, 71 | F f, 72 | G g, 73 | Orientation 74 | ){ 75 | std::size_t blockSize1 = 1; 76 | std::size_t blockSize2 = std::min<std::size_t>(64, A().queue().warp_size()); 77 | std::size_t numBlocks1 = A().size1(); 78 | std::size_t numBlocks2 = 1; 79 | auto stream = hip::get_stream(A().queue()).handle(); 80 | hipLaunchKernel( 81 | hip::fold_rows_kernel<decltype(A().elements()), decltype(v().elements()), F, G>, 82 | dim3(numBlocks1, numBlocks2), dim3(blockSize1, blockSize2), 0, stream, 83 | A().elements(), A().size1(), A().size2(), 84 | v().elements(), f, g 85 | ); 86 | } 87 | 88 | 89 | }} 90 | 91 | #endif 92 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/gemv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP GEMV kernel frontend using cuBLAS or rocBLAS backends 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_GEMV_HPP 33 | #define REMORA_KERNELS_HIP_GEMV_HPP 34 | 35 | #include "../../proxy_expressions.hpp" 36 | #include "../../hip/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | 42 | namespace remora{ 43 | namespace kernels{ 44 | 45 | // v <- v + alpha * A * x 46 | template<class MatA, class VecX, class VecV> 47 | void gemv( 48 | matrix_expression<MatA, hip_tag> const& A, 49 | vector_expression<VecX, hip_tag> const& x, 50 | vector_expression<VecV, hip_tag>& v, 51 | typename VecV::value_type const& alpha 52 | ) { 53 | REMORA_SIZE_CHECK(A().size1() == v().size()); 54 | REMORA_SIZE_CHECK(A().size2() == x().size()); 55 | 56 | static_assert(std::is_same<typename MatA::value_type, typename VecX::value_type>::value, "[gemv] Arguments do not have same element type"); 57 | static_assert(std::is_same<typename VecX::value_type, typename VecV::value_type>::value, "[gemv] Arguments do not have same element type"); 58 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[gemv] A is not dense"); 59 | static_assert(std::is_same<typename VecX::evaluation_category::tag, dense_tag>::value, "[gemv] x is not dense"); 60 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[gemv] v does not have dense storage layout"); 61 | 62 | //pre-evaluate A and x into a temporary if necessary 63 | auto const& Aeval = eval_expression(A); 64 | auto const& xeval = eval_expression(x); 65 | 66 | //obtain geometry information 67 | bool transA = std::is_same<typename MatA::orientation, row_major>::value; 68 | std::size_t m = A().size1(); 69 | std::size_t n = A().size2(); 70 | if(transA) 71 | std::swap(m,n); 72 | 73 | //obtain matrix storage 74 | auto storageA = Aeval.raw_storage(); 75 | auto storagex = xeval.raw_storage(); 76 | auto storagev = v().raw_storage(); 77 | 78 | hip::get_blas(A().queue()).gemv( 79 | transA, 80 | m, n, 81 | alpha, 82 | storageA.values, storageA.leading_dimension, 83 | storagex.values, storagex.stride, 84 | typename VecV::value_type(1), 85 | storagev.values, storagev.stride, 86 | hip::get_stream(A().queue()) 87 | ); 88 | } 89 | 90 | }} 91 | 92 | #endif 93 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/syrk.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP SYRK kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2016 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

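 * \par
 * Layout note: cuBLAS assumes column-major storage, so a row-major C buffer is
 * treated as the column-major view of C^T. Since C^T = (A*A^T)^T = A*A^T, the
 * same update can be written into that view by flipping which triangle is
 * written and whether A enters the call transposed, which is what the flag
 * flipping in the function below implements.
 *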
14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_SYRK_HPP 33 | #define REMORA_KERNELS_HIP_SYRK_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | namespace remora{ namespace kernels{ 42 | 43 | // C <- C + alpha * A * A^T 44 | template<bool Upper, class MatA, class MatC> 45 | void syrk( 46 | matrix_expression<MatA, hip_tag> const& A, 47 | matrix_expression<MatC, hip_tag>& C, 48 | typename MatC::value_type const& alpha 49 | ) { 50 | REMORA_SIZE_CHECK(A().size1() == C().size1()); 51 | REMORA_SIZE_CHECK(C().size1()== C().size2()); 52 | 53 | static_assert(std::is_same<typename MatA::value_type, typename MatC::value_type>::value, "[syrk] Arguments do not have same element type"); 54 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[syrk] A is not dense"); 55 | static_assert(std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value, "[syrk] C does not have dense storage layout"); 56 | 57 | //pre-evaluate A into a temporary if necessary 58 | auto const& Aeval = eval_expression(A); 59 | 60 | //obtain geometry information 61 | bool transA = !std::is_same<typename MatA::orientation, typename MatC::orientation>::value; 62 | bool is_column_majorC = std::is_same<typename MatC::orientation, column_major>::value; 63 | bool upperA = Upper; 64 | if(!is_column_majorC){ 65 | transA = !transA; 66 | upperA = !upperA; 67 | } 68 | 69 | 70 | std::size_t n = C().size1(); 71 | std::size_t k = A().size2(); 72 | 73 | //obtain matrix storage 74 | auto storageA = Aeval.raw_storage(); 75 | auto storageC = C().raw_storage(); 76 | 77 | hip::get_blas(C().queue()).syrk( 78 | upperA, transA, 79 | n, k, alpha, 80 | storageA.values, storageA.leading_dimension, 81 | typename MatC::value_type(1), 82 | storageC.values, storageC.leading_dimension, 83 | hip::get_stream(C().queue()) 84 | ); 85 | } 86 | 87 | }} 88 | 89 | #endif 90 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/trmm.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP TRMM kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_TRMM_HPP 33 | #define REMORA_KERNELS_HIP_TRMM_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | 42 | namespace remora{ namespace kernels{ 43 | 44 | // B <- A B with A being triangular 45 | template<bool Upper, bool Unit, class MatA, class MatB> 46 | void trmm_impl( 47 | matrix_expression<MatA, hip_tag> const& A, 48 | matrix_expression<MatB, hip_tag>& B, 49 | column_major 50 | ){ 51 | 52 | //obtain geometry information 53 | auto transA = !std::is_same<typename MatB::orientation, column_major>::value; 54 | 55 | //obtain raw storage 56 | auto storageA = A().raw_storage(); 57 | auto storageB = B().raw_storage(); 58 | 59 | if(!transA){ 60 | hip::get_blas(B().queue()).trmm( 61 | Left, Upper, transA, Unit, 62 | B().size1(), B().size2(), 63 | typename MatB::value_type(1), 64 | storageA.values, storageA.leading_dimension, 65 | storageB.values, storageB.leading_dimension, 66 | hip::get_stream(B().queue()) 67 | ); 68 | }else{ 69 | hip::get_blas(B().queue()).trmm( 70 | !Left, Upper, transA, Unit, 71 | B().size2(), B().size1(), 72 | typename MatB::value_type(1), 73 | storageA.values, storageA.leading_dimension, 74 | storageB.values, storageB.leading_dimension, 75 | hip::get_stream(B().queue()) 76 | ); 77 | } 78 | } 79 | 80 | template<bool Upper, bool Unit, class MatA, class MatB> 81 | void trmm_impl( 82 | matrix_expression<MatA, hip_tag> const& A, 83 | matrix_expression<MatB, hip_tag>& B, 84 | row_major 85 | ) { 86 | auto transB = trans(B); 87 | trmm_impl<!Upper, Unit>(trans(A), transB, column_major()); 88 | } 89 | 90 | template<bool Upper, bool Unit, class MatA, class MatB> 91 | void trmm( 92 | matrix_expression<MatA, hip_tag> const& A, 93 | matrix_expression<MatB, hip_tag>& B 94 | ){ 95 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 96 | REMORA_SIZE_CHECK(A().size2() == B().size1()); 97 | 98 | static_assert(std::is_same<typename MatA::value_type, typename MatB::value_type>::value, "[trmm] Arguments do not have same element type"); 99 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmm] A is not dense"); 100 | static_assert(std::is_base_of<dense_tag, typename MatB::storage_type::storage_tag>::value, "[trmm] B does not have dense storage layout"); 101 | 102 | //pre-evaluate A into a temporary if necessary 103 | auto const& Aeval = eval_expression(A); 104 | 105 | trmm_impl<Upper, Unit>(Aeval, B, typename MatA::orientation()); 106 | } 107 | 108 | }} 109 | 110 | #endif 111 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/trmv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP TRMV kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_TRMV_HPP 33 | #define REMORA_KERNELS_HIP_TRMV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | namespace remora{ namespace kernels{ 42 | 43 | // v <- Av with A being triangular 44 | template<bool Upper, bool Unit, class MatA, class VecV> 45 | void trmv( 46 | matrix_expression<MatA, hip_tag> const& A, 47 | vector_expression<VecV, hip_tag>& v 48 | ){ 49 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 50 | REMORA_SIZE_CHECK(A().size2() == v().size()); 51 | 52 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[trmv] Arguments do not have same element type"); 53 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmv] A is not dense"); 54 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[trmv] v does not have dense storage layout"); 55 | 56 | //pre-evaluate A into a temporary if necessary 57 | auto const& Aeval = eval_expression(A); 58 | 59 | //obtain geometry information 60 | auto transA = std::is_same<typename MatA::orientation, row_major>::value; 61 | bool triangular = transA? !Upper : Upper; 62 | std::size_t n = A().size1(); 63 | 64 | //obtain raw storage 65 | auto storageA = Aeval.raw_storage(); 66 | auto storagev = v().raw_storage(); 67 | 68 | hip::get_blas(v().queue()).trmv( 69 | triangular, transA, Unit, 70 | n, 71 | storageA.values, storageA.leading_dimension, 72 | storagev.values, storagev.stride, 73 | hip::get_stream(v().queue()) 74 | ); 75 | } 76 | }} 77 | 78 | #endif 79 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/trsv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP TRSV kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

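 * \par
 * Usage sketch (tag names assumed from the library's triangular and side
 * markers; A and b are device containers):
 * \code
 * // solve A x = b in place for a lower, non-unit triangular A;
 * // b is overwritten with the solution x:
 * remora::kernels::trsv<remora::lower, remora::left>(A, b);
 * \endcode
 *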
14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_TRSV_HPP 33 | #define REMORA_KERNELS_HIP_TRSV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | namespace remora{ namespace kernels{ 42 | 43 | // solve Ax = b or xA=b with A being triangular 44 | template<class Triangular, class Side, class MatA, class VecV> 45 | void trsv( 46 | matrix_expression<MatA, hip_tag> const& A, 47 | vector_expression<VecV, hip_tag>& v 48 | ){ 49 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 50 | REMORA_SIZE_CHECK(A().size2() == v().size()); 51 | 52 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[trsv] Arguments do not have same element type"); 53 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trsv] A is not dense"); 54 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[trsv] v does not have dense storage layout"); 55 | 56 | //pre-evaluate A into a temporary if necessary 57 | auto const& Aeval = eval_expression(A); 58 | 59 | //obtain geometry information 60 | auto transA = !std::is_same<typename MatA::orientation, column_major>::value; 61 | bool upperA = transA? !Triangular::is_upper : Triangular::is_upper; 62 | //transpose if side is right 63 | if(!Side::is_left){ 64 | transA = !transA; 65 | } 66 | 67 | std::size_t n = A().size1(); 68 | 69 | 70 | //obtain raw storage 71 | auto storageA = Aeval.raw_storage(); 72 | auto storagev = v().raw_storage(); 73 | 74 | hip::get_blas(v().queue()).trsv( 75 | upperA, transA, Triangular::is_unit, 76 | n, 77 | storageA.values, storageA.leading_dimension, 78 | storagev.values, storagev.stride, 79 | hip::get_stream(v().queue()) 80 | ); 81 | } 82 | }} 83 | 84 | #endif 85 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/vector_fold.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for folding vectors with HIP 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_HIP_VECTOR_FOLD_HPP 29 | #define REMORA_KERNELS_HIP_VECTOR_FOLD_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | #include "../../hip/buffer.hpp" 34 | namespace remora{ 35 | namespace hip{ 36 | template<class VecV, class R, class F> 37 | __global__ void vector_fold_kernel(hipLaunchParm lp, VecV v, size_t size, R* resultp, F f){ 38 | __shared__ R folds[64]; 39 | R& entry = folds[hipThreadIdx_x]; 40 | size_t i = hipThreadIdx_x; 41 | if(i < size){ 42 | entry = v(i); 43 | i += hipBlockDim_x; 44 | for(;i < size; i += hipBlockDim_x){ 45 | entry = f(entry, v(i)); 46 | } 47 | } 48 | __threadfence(); 49 | 50 | if(hipThreadIdx_x == 0){ 51 | for(size_t i = 0 ; i < min(size_t(hipBlockDim_x), size); ++i){ 52 | *resultp = f(*resultp, folds[i]); 53 | } 54 | } 55 | } 56 | } 57 | namespace bindings{ 58 | template<class F, class V> 59 | void vector_fold(vector_expression<V, hip_tag> const& v, typename F::result_type& value, dense_tag){ 60 | if(v().size() == 0) return; 61 | typedef typename F::result_type value_type; 62 | hip::buffer<value_type> result(1, v().queue()); 63 | 64 | hipMemcpy(result.get(), &value, sizeof(value), hipMemcpyHostToDevice); 65 | 66 | std::size_t blockSize = std::min<std::size_t>(64, v().queue().warp_size()); 67 | std::size_t numBlocks = 1; 68 | auto stream = hip::get_stream(v().queue()).handle(); 69 | hipLaunchKernel( 70 | hip::vector_fold_kernel<decltype(v().elements()), value_type, F>, 71 | dim3(numBlocks), dim3(blockSize), 0, stream, 72 | v().elements(), v().size(), result.get(), F() 73 | ); 74 | hipMemcpy(&value, result.get(), sizeof(value), hipMemcpyDeviceToHost); 75 | } 76 | 77 | 78 | }} 79 | #endif 80 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/vector_max.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for getting the maximum element of a vector with HIP 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_HIP_VECTOR_MAX_HPP 29 | #define REMORA_KERNELS_HIP_VECTOR_MAX_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | #include "../../hip/buffer.hpp" 34 | namespace remora{ 35 | namespace hip{ 36 | template<class VecV> 37 | __global__ void vector_max_kernel(hipLaunchParm lp, VecV v, size_t size, size_t* max){ 38 | typedef typename std::remove_const< 39 | typename std::remove_reference<decltype(v(0))>::type 40 | > ::type value_type; 41 | __shared__ value_type max_value[64]; 42 | __shared__ std::size_t max_index[64]; 43 | value_type& thread_max = max_value[hipThreadIdx_x]; 44 | std::size_t& thread_index = max_index[hipThreadIdx_x]; 45 | thread_max = 1.e-30; //initialization assumes the maximal element is larger than this 46 | thread_index = 0; 47 | for(size_t i = hipThreadIdx_x; i < size; i += hipBlockDim_x){ 48 | if(thread_max < v(i)){ 49 | thread_max = v(i); 50 | thread_index = i; 51 | } 52 | } 53 | __threadfence(); 54 | 55 | if(hipThreadIdx_x == 0){ 56 | for(size_t i = 1 ; i < min(size_t(hipBlockDim_x), size); ++i){ 57 | if(thread_max < max_value[i]){ 58 | thread_max = max_value[i]; 59 | thread_index = max_index[i]; 60 | } 61 | } 62 | *max = thread_index; 63 | } 64 | } 65 | } 66 | namespace bindings{ 67 | template<class V> 68 | std::size_t vector_max(vector_expression<V, hip_tag> const& v, dense_tag){ 69 | if(v().size() == 0) return 0; 70 | hip::buffer<std::size_t> result(1, v().queue()); 71 | 72 | std::size_t blockSize = std::min<std::size_t>(64, v().queue().warp_size()); 73 | std::size_t numBlocks = (v().size() + blockSize - 1) / blockSize; 74 | auto stream = hip::get_stream(v().queue()).handle(); 75 | hipLaunchKernel( 76 | hip::vector_max_kernel<decltype(v().elements())>, 77 | dim3(numBlocks), dim3(blockSize), 0, stream, 78 | v().elements(), v().size(), result.get() 79 | ); 80 | std::size_t index; 81 | hipMemcpy(&index, result.get(), sizeof(index), hipMemcpyDeviceToHost); 82 | return index; 83 | } 84 | 85 | 86 | }} 87 | #endif 88 |
-------------------------------------------------------------------------------- /include/remora/kernels/lapack/fortran.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Defines Fortran naming conventions when binding to lapack routines 3 | * 4 | * \author O. Krause 5 | * \date 2012 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | * This is based on boost::numeric::bindings, written by Toon Knapen and Kresimir Fresl 11 | * 12 | *

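 * \par
 * FORTRAN_ID is used when declaring C prototypes for Fortran routines; a
 * sketch with LAPACK's dpotrf:
 * \code
 * extern "C" void FORTRAN_ID(dpotrf)(
 *     const char* uplo, const int* n, double* a, const int* lda, int* info
 * ); // expands to dpotrf_ with gcc/clang, dpotrf with MSVC
 * \endcode
 *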
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_LAPACK_FORTRAN_H 32 | #define REMORA_KERNELS_LAPACK_FORTRAN_H 33 | 34 | #if defined(BIND_FORTRAN_LOWERCASE_UNDERSCORE) || defined(BIND_FORTRAN_LOWERCASE) 35 | // Allow manual override of the defaults, e.g. if you want to use a fortran 36 | // lib compiled with gcc from MSVC 37 | #else 38 | 39 | // First we need to know what the conventions for linking 40 | // C with Fortran is on this platform/toolset 41 | #if defined(__GNUC__) || defined(__ICC) || defined(__sgi) || defined(__COMO__) || defined(__KCC) 42 | #define BIND_FORTRAN_LOWERCASE_UNDERSCORE 43 | #elif defined(__IBMCPP__) || defined(_MSC_VER) 44 | #define BIND_FORTRAN_LOWERCASE 45 | #else 46 | #error do not know how to link with fortran for the given platform 47 | #endif 48 | 49 | #endif 50 | 51 | // Next we define macros to convert our symbols to 52 | // the current convention 53 | #if defined(BIND_FORTRAN_LOWERCASE_UNDERSCORE) 54 | #define FORTRAN_ID( id ) id##_ 55 | #elif defined(BIND_FORTRAN_LOWERCASE) 56 | #define FORTRAN_ID( id ) id 57 | #else 58 | #error do not know how to bind to fortran calling convention 59 | #endif 60 | 61 | #endif 62 |
-------------------------------------------------------------------------------- /include/remora/kernels/lapack/syev.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief Contains the lapack bindings for the symmetric eigenvalue problem syev. 6 | * 7 | * \author O. Krause 8 | * \date 2010 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

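 * \par
 * Usage sketch: A is overwritten with the eigenvectors and the eigenvalues
 * are returned largest-first after the reversal at the end of this file:
 * \code
 * std::size_t n = 4;
 * remora::matrix<double> A(n, n);   // fill with a symmetric matrix first
 * remora::vector<double> lambda(n);
 * remora::bindings::syev(A, lambda);
 * \endcode
 *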
14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_LAPACK_SYEV_HPP 33 | #define REMORA_KERNELS_LAPACK_SYEV_HPP 34 | 35 | #include "fortran.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #define REMORA_LAPACK_DSYEV FORTRAN_ID(dsyev) 39 | 40 | extern "C"{ 41 | void REMORA_LAPACK_DSYEV( 42 | const char* jobz, const char* uplo, const int *n, 43 | double* a, const int * lda, double* w, 44 | double* work, const int * lwork, int* info 45 | ); 46 | } 47 | 48 | 49 | 50 | namespace remora {namespace bindings { 51 | 52 | inline void syev( 53 | int n, bool upper, 54 | double* A, int lda, 55 | double* eigenvalues 56 | ){ 57 | if(n == 0) return; 58 | int lwork = std::min(130,4*n)*n; 59 | double* work = new double[lwork]; 60 | int info; 61 | char job = 'V'; 62 | char uplo = upper?'U':'L'; 63 | REMORA_LAPACK_DSYEV(&job, &uplo, &n, A, &lda,eigenvalues,work,&lwork,&info); 64 | delete[] work; 65 | 66 | } 67 | 68 | 69 | template <class MatA, class VectorB> 70 | void syev( 71 | matrix_expression<MatA, cpu_tag>& A, 72 | vector_expression<VectorB, cpu_tag>& eigenValues 73 | ) { 74 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 75 | REMORA_SIZE_CHECK(A().size1() == eigenValues().size()); 76 | 77 | std::size_t n = A().size1(); 78 | bool upper = false; 79 | //lapack is column major storage. 80 | if(std::is_same<typename MatA::orientation, row_major>::value){ 81 | upper = !upper; 82 | } 83 | auto storageA = A().raw_storage(); 84 | auto storageEig = eigenValues().raw_storage(); 85 | syev( 86 | n, upper, 87 | storageA.values, 88 | storageA.leading_dimension, 89 | storageEig.values 90 | ); 91 | 92 | A() = trans(A); 93 | 94 | //reverse eigenvectors and eigenvalues 95 | for (int i = 0; i < (int)n-i-1; i++) 96 | { 97 | int l = n-i-1; 98 | std::swap(eigenValues()( l ),eigenValues()( i )); 99 | } 100 | for (int j = 0; j < (int)n; j++) { 101 | for (int i = 0; i < (int)n-i-1; i++) 102 | { 103 | int l = n-i-1; 104 | std::swap(A()( j , l ), A()( j , i )); 105 | } 106 | } 107 | } 108 | 109 | }} 110 | 111 | #undef REMORA_LAPACK_DSYEV 112 | 113 | #endif 114 |
-------------------------------------------------------------------------------- /include/remora/kernels/opencl/vector_assign.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Assignment kernels for vector expressions 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_CLBLAS_VECTOR_ASSIGN_HPP 29 | #define REMORA_KERNELS_CLBLAS_VECTOR_ASSIGN_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | 34 | namespace remora{namespace bindings{ 35 | 36 | template<class F, class V> 37 | void apply(vector_expression<V, opencl_tag>& v_unreg, F const& f_unreg) { 38 | if(v_unreg().size() == 0) return; 39 | opencl::detail::meta_kernel k("blas_vector_apply_dense"); 40 | 41 | auto v = k.register_args(v_unreg().elements()); 42 | auto f = k.register_args(f_unreg); 43 | 44 | //create source 45 | k<< v(k.expr<cl_uint>("get_global_id(0)")) <<" = "<< f(v(k.expr<cl_uint>("get_global_id(0)")))<<";\n"; 46 | //enqueue kernel 47 | boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context()); 48 | std::size_t global_work_size[1] = {v_unreg().size()}; 49 | v_unreg().queue().enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, nullptr); 50 | } 51 | 52 | template<class F, class V> 53 | void assign(vector_expression<V, opencl_tag>& v, typename V::value_type t) { 54 | static_assert(std::is_base_of<dense_tag, typename V::storage_type::storage_tag>::value, "target must have dense storage for assignment"); 55 | auto f = device_traits<opencl_tag>::make_bind_second(F(), t); 56 | apply(v,f); 57 | } 58 | 59 | //////////////////////////////////////////// 60 | //assignment with functor 61 | //////////////////////////////////////////// 62 | 63 | // Dense-Dense case 64 | template<class V, class E, class F> 65 | void vector_assign_functor( 66 | vector_expression<V, opencl_tag>& v_unreg, 67 | vector_expression<E, opencl_tag> const& e_unreg, 68 | F f_unreg, 69 | dense_tag, dense_tag 70 | ) { 71 | if(v_unreg().size() == 0) return; 72 | 73 | opencl::detail::meta_kernel k("blas_vector_assign_functor_dense"); 74 | 75 | auto v = k.register_args(v_unreg().elements()); 76 | auto e = k.register_args(e_unreg().elements()); 77 | auto f = k.register_args(f_unreg); 78 | 79 | //create source 80 | k<< v(k.expr<cl_uint>("get_global_id(0)")) <<" = "<< f(v(k.expr<cl_uint>("get_global_id(0)")), e(k.expr<cl_uint>("get_global_id(0)")))<<";\n"; 81 | //enqueue kernel 82 | boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context()); 83 | std::size_t global_work_size[1] = {v_unreg().size()}; 84 | v_unreg().queue().enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, nullptr); 85 | } 86 | 87 | //////////////////////////////////////////// 88 | //direct assignment of two vectors 89 | //////////////////////////////////////////// 90 | 91 | 92 | template<class V, class E> 93 | void vector_assign( 94 | vector_expression<V, opencl_tag>& v, vector_expression<E, opencl_tag> const& e, 95 | dense_tag t, dense_tag 96 | ) { 97 | vector_assign_functor(v, e, typename device_traits<opencl_tag>::template right_arg<typename V::value_type>(), t, t); 98 | } 99 | 100 | 101 | 102 | 103 | }} 104 | #endif 105 |
-------------------------------------------------------------------------------- /include/remora/kernels/opencl/vector_fold.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for folding vectors with OpenCL 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP 29 | #define REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | #include <boost/compute/container/array.hpp> 34 | #include <boost/compute/algorithm/copy_n.hpp> 35 | namespace remora{namespace bindings{ 36 | 37 | template<class F, class V> 38 | void vector_fold(vector_expression<V, opencl_tag> const& v_unreg, typename F::result_type& value, dense_tag) { 39 | if(v_unreg().size() == 0) return; 40 | auto& queue = v_unreg().queue(); 41 | typedef typename F::result_type value_type; 42 | opencl::detail::meta_kernel k("blas_vector_fold"); 43 | std::size_t size_index = k.add_arg<std::size_t>("size"); 44 | auto v = k.register_args(v_unreg().elements()); 45 | auto f = k.register_args(F()); 46 | 47 | boost::compute::array<value_type, 1> device_result; 48 | boost::compute::copy_n(&value, 1, device_result.begin(), queue); 49 | auto exprSubFold = k.expr<value_type>("subfold[get_local_id(0)]"); 50 | k << "__local " << k.decl<value_type>("subfold") << "[TILE_DIM];\n"; 51 | k << exprSubFold<<" = "<< v(k.expr<cl_uint>("min(size-1,get_local_id(0))"))<<";\n"; 52 | k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n "; 53 | k << exprSubFold << '=' << f(exprSubFold,v(k.expr<cl_uint>("i")))<<";\n"; 54 | k << "}\n"; 55 | k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing 56 | //sum up the rows 57 | k << "if(get_local_id(0) == 0){\n"; 58 | k << " for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n"; 59 | k << " subfold[0] =" << f(k.expr<value_type>("subfold[0]"),k.expr<value_type>("subfold[i]"))<<";\n"; 60 | k << " }\n "; 61 | k << device_result.begin()[0]<< "= subfold[0];\n"; 62 | k << "}\n"; 63 | 64 | std::size_t TILE_DIM = 32; 65 | boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32"); 66 | kernel.set_arg(size_index, v_unreg().size()); 67 | 68 | std::size_t global_work_size[1] = {TILE_DIM}; 69 | std::size_t local_work_size[1] = {TILE_DIM}; 70 | queue.enqueue_nd_range_kernel(kernel, 1,nullptr, global_work_size, local_work_size); 71 | boost::compute::copy_n(device_result.begin(), 1, &value, queue); 72 | } 73 | 74 | 75 | }} 76 | #endif 77 |
-------------------------------------------------------------------------------- /include/remora/kernels/opencl/vector_max.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief OpenCL kernel computing the index of the maximal element of a vector 5 | * 6 | * \author O. Krause 7 | * \date 2016 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_OPENCL_VECTOR_MAX_HPP
31 | #define REMORA_KERNELS_OPENCL_VECTOR_MAX_HPP
32 | 
33 | #include "../../detail/traits.hpp"
34 | #include "../../expression_types.hpp"
35 | namespace remora {namespace bindings{
36 | 
37 | template<class E>
38 | std::size_t vector_max(vector_expression<E, opencl_tag> const& v_unreg, dense_tag) {
39 | 	if(v_unreg().size() == 0) return 0;
40 | 	auto& queue = v_unreg().queue();
41 | 	typedef typename E::value_type value_type;
42 | 	opencl::detail::meta_kernel k("blas_vector_max");
43 | 	std::size_t size_index = k.add_arg<std::size_t>("size");
44 | 	auto v = k.register_args(v_unreg().elements());
45 | 
46 | 	boost::compute::array<cl_uint, 1> device_result;
47 | 	auto exprMax = k.expr<value_type>("maximum[get_local_id(0)]");
48 | 	k << "__local " << k.decl<value_type>("maximum") << "[TILE_DIM];\n";
49 | 	k << "__local uint maximum_index[TILE_DIM];\n";
50 | 	k << exprMax << " = " << v(k.expr<cl_uint>("min(size-1,get_local_id(0))")) << ";\n";
51 | 	k << "maximum_index[get_local_id(0)] = get_local_id(0);\n";
52 | 	k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n";
53 | 	k << "    if( " << exprMax << '<' << v(k.expr<cl_uint>("i")) << "){\n ";
54 | 	k << exprMax << '=' << v(k.expr<cl_uint>("i")) << ";\n";
55 | 	k << "        maximum_index[get_local_id(0)] = i;\n";
56 | 	k << "    }\n";
57 | 	k << "}\n";
58 | 	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing
59 | 	//find the maximum of the per-thread subresults
60 | 	k << "if(get_local_id(0) == 0){\n";
61 | 	k << "    for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
62 | 	k << "        if( " << exprMax << '<' << k.expr<value_type>("maximum[i]") << "){\n";
63 | 	k << "            maximum_index[0] = maximum_index[i];\n";
64 | 	k << "            maximum[0] = maximum[i];\n";
65 | 	k << "        }\n";
66 | 	k << "    }\n";
67 | 	k << device_result.begin()[0] << "= maximum_index[0];\n";
68 | 	k << "}\n";
69 | 
70 | 	std::size_t TILE_DIM = 32;
71 | 	boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32");
72 | 	kernel.set_arg(size_index, v_unreg().size());
73 | 
74 | 	std::size_t global_work_size[1] = {TILE_DIM};
75 | 	std::size_t local_work_size[1] = {TILE_DIM};
76 | 	queue.enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, local_work_size);
77 | 	cl_uint result = 0;
78 | 	boost::compute::copy_n(device_result.begin(), 1, &result, queue);
79 | 	return result;
80 | }
81 | 
82 | 
83 | }}
84 | #endif
--------------------------------------------------------------------------------
/include/remora/kernels/potrf.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Dispatches the POTRF algorithm
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2014 Shark Development Team
11 | *
12 | * <BR>
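*
* \par Example
* A minimal usage sketch, for illustration only. It assumes a dense
* remora::matrix<double> and that remora::lower is the library's
* lower-triangular tag; the matrix built here is made diagonally
* dominant so that it is symmetric positive definite.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/potrf.hpp>
*
* remora::matrix<double> A(3, 3, 0.1);                // symmetric: all off-diagonals 0.1
* for(std::size_t i = 0; i != 3; ++i) A(i, i) = 2.0;  // diagonally dominant => s.p.d.
* std::size_t info = remora::kernels::potrf<remora::lower>(A);
* // info == 0 on success; the lower triangle of A now holds the factor L
* \endcode
*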

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_POTRF_HPP
32 | #define REMORA_KERNELS_POTRF_HPP
33 | 
34 | #include <type_traits>
35 | #ifdef REMORA_USE_ATLAS_LAPACK
36 | #include "atlas/potrf.hpp"
37 | #else
38 | 
39 | // if no bindings are included, we have to provide the default has_optimized_potrf
40 | // otherwise the binding will take care of this
41 | namespace remora {namespace bindings {
42 | template<class M>
43 | struct has_optimized_potrf
44 | 	: public std::false_type {};
45 | }}
46 | #endif
47 | 
48 | #include "default/potrf.hpp"
49 | 
50 | namespace remora {namespace kernels {
51 | 
52 | ///\brief Implements the POsitive definite TRiangular matrix Factorisation (POTRF).
53 | ///
54 | /// It is better known as the Cholesky decomposition for dense matrices.
55 | /// The algorithm works in place and does not require additional memory.
56 | template<class Triangular, class MatA>
57 | std::size_t potrf(
58 | 	matrix_container<MatA, cpu_tag>& A
59 | ) {
60 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
61 | 	return bindings::potrf<Triangular>(A, typename bindings::has_optimized_potrf<MatA>::type());
62 | }
63 | 
64 | }}
65 | 
66 | #ifdef REMORA_USE_OPENCL
67 | #include "opencl/potrf.hpp"
68 | #endif
69 | 
70 | #if defined(__HCC__) || defined(__NVCC__)
71 | #include "hip/potrf.hpp"
72 | #endif
73 | 
74 | 
75 | #endif
76 | 
--------------------------------------------------------------------------------
/include/remora/kernels/pstrf.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Dispatches the PSTRF algorithm
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2014 Shark Development Team
11 | *
12 | * <BR>
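*
* \par Example
* A usage sketch, for illustration only; it assumes remora::lower is the
* triangular tag and that the pivot vector is a remora::permutation_matrix
* as declared in permutation.hpp (its constructor argument is an assumption).
* \code
* #include <remora/dense.hpp>
* #include <remora/permutation.hpp>
* #include <remora/kernels/pstrf.hpp>
*
* remora::matrix<double> A(4, 4, 1.0);  // all-ones matrix: symmetric, p.s.d., rank 1
* remora::permutation_matrix P(4);
* std::size_t rank = remora::kernels::pstrf<remora::lower>(A, P); // rank == 1
* // the first column of A holds the nonzero part of L, P the pivots
* \endcode
*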

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_PSTRF_HPP
32 | #define REMORA_KERNELS_PSTRF_HPP
33 | 
34 | #include "default/pstrf.hpp"
35 | 
36 | namespace remora {
37 | namespace kernels {
38 | 
39 | /*!
40 | * \brief Cholesky decomposition with full pivoting performed in place.
41 | *
42 | * Given an \f$ m \times m \f$ symmetric positive semi-definite matrix
43 | * \f$A\f$, computes the matrix \f$L\f$ and permutation matrix \f$P\f$ such that
44 | * \f$P^TAP = LL^T \f$. If the matrix A has rank(A) = k, the first k columns of A hold the full
45 | * decomposition, while the rest of the matrix is zero.
46 | * This method is slower than the Cholesky decomposition without pivoting but numerically more
47 | * stable. The diagonal elements are ordered such that i > j => L(i,i) >= L(j,j).
48 | *
49 | * The implementation used here is described in the working paper
50 | * "LAPACK-Style Codes for Level 2 and 3 Pivoted Cholesky Factorizations"
51 | * http://www.netlib.org/lapack/lawnspdf/lawn161.pdf
52 | *
53 | * The computation is carried out in place; this means A is destroyed and replaced by L.
54 | *
55 | *
56 | * \param A \f$ m \times m \f$ matrix, which must be symmetric and positive semi-definite. It is replaced by L in the end.
57 | * \param P The pivoting vector of dimension \f$ m \f$
58 | * \return The rank of the matrix A
59 | */
60 | template<class Triangular, class MatA, class VecP>
61 | std::size_t pstrf(
62 | 	matrix_expression<MatA, cpu_tag>& A,
63 | 	vector_expression<VecP, cpu_tag>& P
64 | ){
65 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
66 | 	REMORA_SIZE_CHECK(P().size() == A().size1());
67 | 	return bindings::pstrf(A, P, Triangular());
68 | }
69 | 
70 | 
71 | }}
72 | 
73 | #endif
74 | 
--------------------------------------------------------------------------------
/include/remora/kernels/random.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Generation of random variates
5 | *
6 | * \author O. Krause
7 | * \date 2017
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
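*
* \par Example
* A usage sketch, for illustration only; any standard random engine such
* as std::mt19937 is assumed to satisfy the Rng parameter.
* \code
* #include <random>
* #include <remora/dense.hpp>
* #include <remora/kernels/random.hpp>
*
* std::mt19937 rng(42);
* remora::vector<double> v(100);
* remora::kernels::generate_normal(v, rng, 0.0, 1.0);   // mean 0, variance 1
* remora::kernels::generate_uniform(v, rng, -1.0, 1.0); // draws in [-1,1]
* \endcode
*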

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_RANDOM_HPP
31 | #define REMORA_KERNELS_RANDOM_HPP
32 | 
33 | #include "default/random.hpp"
34 | #ifdef REMORA_USE_OPENCL
35 | #include "opencl/random.hpp"
36 | #endif
37 | #if defined(__HCC__) || defined(__NVCC__)
38 | #include "hip/random.hpp"
39 | #endif
40 | 
41 | 
42 | namespace remora{namespace kernels{
43 | 
44 | template<class V, class Rng, class Device>
45 | void generate_normal(
46 | 	vector_expression<V, Device>& v,
47 | 	Rng& rng,
48 | 	typename V::value_type mean,
49 | 	typename V::value_type variance
50 | ) {
51 | 	bindings::generate_normal(v, rng, mean, variance);
52 | }
53 | 
54 | template<class M, class Rng, class Device>
55 | void generate_normal(
56 | 	matrix_expression<M, Device>& m,
57 | 	Rng& rng,
58 | 	typename M::value_type mean,
59 | 	typename M::value_type variance
60 | ) {
61 | 	bindings::generate_normal(m, rng, mean, variance);
62 | }
63 | 
64 | template<class V, class Rng, class Device>
65 | void generate_uniform(
66 | 	vector_expression<V, Device>& v,
67 | 	Rng& rng,
68 | 	typename V::value_type low,
69 | 	typename V::value_type high
70 | ) {
71 | 	bindings::generate_uniform(v, rng, low, high);
72 | }
73 | 
74 | template<class M, class Rng, class Device>
75 | void generate_uniform(
76 | 	matrix_expression<M, Device>& m,
77 | 	Rng& rng,
78 | 	typename M::value_type low,
79 | 	typename M::value_type high
80 | ) {
81 | 	bindings::generate_uniform(m, rng, low, high);
82 | }
83 | 
84 | template<class V, class Rng, class Device>
85 | void generate_discrete(
86 | 	vector_expression<V, Device>& v,
87 | 	Rng& rng,
88 | 	int low,
89 | 	int high
90 | ) {
91 | 	bindings::generate_discrete(v, rng, low, high);
92 | }
93 | 
94 | template<class M, class Rng, class Device>
95 | void generate_discrete(
96 | 	matrix_expression<M, Device>& m,
97 | 	Rng& rng,
98 | 	int low,
99 | 	int high
100 | ) {
101 | 	bindings::generate_discrete(m, rng, low, high);
102 | }
103 | 
104 | }}
105 | #endif
--------------------------------------------------------------------------------
/include/remora/kernels/syev.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Symmetric eigenvalue decomposition
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
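*
* \par Example
* A usage sketch, for illustration only, on a small symmetric matrix built
* to be diagonally dominant (a dense remora::matrix<double> is assumed).
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/syev.hpp>
*
* remora::matrix<double> A(3, 3, 0.5);               // symmetric off-diagonals
* for(std::size_t i = 0; i != 3; ++i) A(i, i) = 2.0;
* remora::vector<double> lambda(3);
* remora::kernels::syev(A, lambda); // A now holds the eigenvectors Q, lambda the eigenvalues
* \endcode
*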

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_SYEV_HPP
31 | #define REMORA_KERNELS_SYEV_HPP
32 | 
33 | 
34 | #ifdef REMORA_USE_LAPACK
35 | #include "lapack/syev.hpp"
36 | #else
37 | #include "default/syev.hpp"
38 | #endif
39 | 
40 | namespace remora{ namespace kernels{
41 | 
42 | ///\brief Well known SYmmetric EigenValue function (SYEV).
43 | ///
44 | /// A given matrix A is decomposed as
45 | /// A=QDQ^T
46 | /// where Q is an orthogonal (or unitary) matrix with QQ^T=Q^TQ=I and D is a diagonal matrix
47 | /// holding the eigenvalues of A. As A is symmetric, only its lower part is accessed for reading.
48 | /// In the end, the whole matrix contains the eigenvectors of A and thus
49 | /// A is replaced by Q.
50 | /// Additionally the eigenvalues are stored in the second argument.
51 | template<class MatA, class VecEig>
52 | void syev(
53 | 	matrix_expression<MatA, cpu_tag>& matA,
54 | 	vector_expression<VecEig, cpu_tag>& eigenValues
55 | ) {
56 | 	bindings::syev(matA, eigenValues);
57 | }
58 | 
59 | 
60 | }}
61 | #endif
--------------------------------------------------------------------------------
/include/remora/kernels/syrk.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief matrix-matrix multiplication kernel for symmetric Rank-K updates
5 | *
6 | * \author O. Krause
7 | * \date 2016
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
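*
* \par Example
* A usage sketch, for illustration only; Upper=false is assumed to select
* the lower triangular part of M, matching the bool flag of this header.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/syrk.hpp>
*
* remora::matrix<double> A(3, 2, 1.0); // 3x2, all ones
* remora::matrix<double> M(3, 3, 0.0);
* remora::kernels::syrk<false>(A, M, 2.0); // lower triangle of M += 2 * A A^T
* \endcode
*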

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_SYRK_HPP
32 | #define REMORA_KERNELS_SYRK_HPP
33 | 
34 | #include "default/syrk.hpp"
35 | 
36 | #ifdef REMORA_USE_CBLAS
37 | #include "cblas/syrk.hpp"
38 | #else
39 | //if no bindings are included, we have to provide the default has_optimized_syrk otherwise the binding will take care of this
40 | namespace remora{ namespace bindings{
41 | template<class M, class E>
42 | struct has_optimized_syrk
43 | 	: public std::false_type{};
44 | }}
45 | #endif
46 | 
47 | namespace remora{namespace kernels{
48 | 
49 | ///\brief Well known SYmmetric Rank-K update kernel M+=alpha*A*A^T.
50 | ///
51 | /// Note that it assumes M to be symmetric and it will only touch the upper or lower triangular area.
52 | /// If bindings are included and the matrix combination allows for a specific binding
53 | /// to be applied, the binding is called automatically from {binding}/syrk.h;
54 | /// otherwise default/syrk.h is used.
55 | template<bool Upper, class M, class E>
56 | void syrk(
57 | 	matrix_expression<E, cpu_tag> const& e,
58 | 	matrix_expression<M, cpu_tag>& m,
59 | 	typename M::value_type alpha
60 | ) {
61 | 	REMORA_SIZE_CHECK(m().size1() == m().size2());
62 | 	REMORA_SIZE_CHECK(m().size1() == e().size1());
63 | 
64 | 	bindings::syrk<Upper>(e, m, alpha,
65 | 		typename bindings::has_optimized_syrk<M, E>::type()
66 | 	);
67 | }
68 | 
69 | }}
70 | 
71 | #ifdef REMORA_USE_CLBLAST
72 | #include "clBlast/syrk.hpp"
73 | #elif defined REMORA_USE_OPENCL
74 | #include "opencl/syrk.hpp"
75 | #endif
76 | #if defined(__HCC__) || defined(__NVCC__)
77 | #include "hip/syrk.hpp"
78 | #endif
79 | 
80 | #endif
81 | 
--------------------------------------------------------------------------------
/include/remora/kernels/tpmv.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular packed matrix-vector multiplication
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
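*
* \par Example
* A usage sketch, for illustration only; the template arguments of
* remora::triangular_matrix shown here are an assumption about the packed
* triangular type from triangular_matrix.hpp.
* \code
* #include <remora/dense.hpp>
* #include <remora/triangular_matrix.hpp>
* #include <remora/kernels/tpmv.hpp>
*
* remora::triangular_matrix<double, remora::row_major, remora::lower> A(3);
* remora::vector<double> b(3, 1.0);
* remora::kernels::tpmv(A, b); // b = A*b using the packed storage of A
* \endcode
*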

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TPMV_HPP
32 | #define REMORA_KERNELS_TPMV_HPP
33 | 
34 | #ifdef REMORA_USE_CBLAS
35 | #include "cblas/tpmv.hpp"
36 | #else
37 | // if no bindings are included, we have to provide the default has_optimized_tpmv
38 | // otherwise the binding will take care of this
39 | namespace remora{ namespace bindings{
40 | template<class M, class V>
41 | struct has_optimized_tpmv
42 | 	: public std::false_type{};
43 | }}
44 | #endif
45 | 
46 | #include "default/tpmv.hpp"
47 | 
48 | namespace remora{namespace kernels{
49 | 
50 | ///\brief Implements the TRiangular Packed Matrix-Vector multiplication (TPMV).
51 | ///
52 | /// It computes b=A*b where A is a lower or upper packed triangular matrix.
53 | template<class MatA, class VecB>
54 | void tpmv(
55 | 	matrix_expression<MatA, cpu_tag> const& A,
56 | 	vector_expression<VecB, cpu_tag>& b
57 | ){
58 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
59 | 	REMORA_SIZE_CHECK(A().size1() == b().size());
60 | 
61 | 	bindings::tpmv(A, b, typename bindings::has_optimized_tpmv<MatA, VecB>::type());
62 | }
63 | 
64 | }}
65 | 
66 | #endif
67 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trmm.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular Matrix-Matrix multiplication kernel
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2014 Shark Development Team
11 | *
12 | * <BR>
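*
* \par Example
* A usage sketch, for illustration only; the two bool flags are
* <Upper, Unit> as in this header, so <false, false> means lower
* triangular with a non-unit diagonal.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trmm.hpp>
*
* remora::matrix<double> A(3, 3, 0.0);
* for(std::size_t i = 0; i != 3; ++i)
* 	for(std::size_t j = 0; j <= i; ++j) A(i, j) = 1.0; // fill the lower triangle
* remora::matrix<double> B(3, 4, 1.0);
* remora::kernels::trmm<false, false>(A, B); // B = A*B in place
* \endcode
*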

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRMM_HPP
32 | #define REMORA_KERNELS_TRMM_HPP
33 | 
34 | #ifdef REMORA_USE_CBLAS
35 | #include "cblas/trmm.hpp"
36 | #else
37 | // if no bindings are included, we have to provide the default has_optimized_trmm
38 | // otherwise the binding will take care of this
39 | namespace remora{ namespace bindings{
40 | template<class M1, class M2>
41 | struct has_optimized_trmm
42 | 	: public std::false_type{};
43 | }}
44 | #endif
45 | 
46 | #include "default/trmm.hpp"
47 | 
48 | namespace remora{namespace kernels{
49 | 
50 | ///\brief Implements the TRiangular Matrix Matrix multiply.
51 | ///
52 | /// It computes B=A*B in place, where A is a triangular matrix and B a dense matrix
53 | template<bool Upper, bool Unit, class MatA, class MatB>
54 | void trmm(
55 | 	matrix_expression<MatA, cpu_tag> const& A,
56 | 	matrix_expression<MatB, cpu_tag>& B
57 | ){
58 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
59 | 	REMORA_SIZE_CHECK(A().size1() == B().size1());
60 | 
61 | 	bindings::trmm<Upper, Unit>(A, B, typename bindings::has_optimized_trmm<MatA, MatB>::type());
62 | }
63 | 
64 | }}
65 | 
66 | #ifdef REMORA_USE_CLBLAST
67 | #include "clBlast/trmm.hpp"
68 | #elif defined REMORA_USE_OPENCL
69 | #include "opencl/trmm.hpp"
70 | #endif
71 | #if defined(__HCC__) || defined(__NVCC__)
72 | #include "hip/trmm.hpp"
73 | #endif
74 | 
75 | #endif
76 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trmv.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular matrix-vector multiplication kernel
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
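*
* \par Example
* A usage sketch, for illustration only; only the lower triangle of A is
* read when Upper=false, and Unit=false means the diagonal is used as stored.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trmv.hpp>
*
* remora::matrix<double> A(3, 3, 1.0); // only its lower part is accessed
* remora::vector<double> b(3, 1.0);
* remora::kernels::trmv<false, false>(A, b); // b = A*b in place
* \endcode
*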

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRMV_HPP
32 | #define REMORA_KERNELS_TRMV_HPP
33 | 
34 | #ifdef REMORA_USE_CBLAS
35 | #include "cblas/trmv.hpp"
36 | #else
37 | // if no bindings are included, we have to provide the default has_optimized_trmv
38 | // otherwise the binding will take care of this
39 | namespace remora{ namespace bindings{
40 | template<class M, class V>
41 | struct has_optimized_trmv
42 | 	: public std::false_type{};
43 | }}
44 | #endif
45 | 
46 | #include "default/trmv.hpp"
47 | 
48 | namespace remora{namespace kernels{
49 | 
50 | ///\brief Implements the TRiangular Matrix-Vector multiplication (TRMV).
51 | ///
52 | /// It computes b=A*b in place, where A is a square lower or upper triangular matrix.
53 | /// It can optionally assume that the diagonal is 1 and won't access the diagonal elements.
54 | template<bool Upper, bool Unit, class MatA, class VecB>
55 | void trmv(
56 | 	matrix_expression<MatA, cpu_tag> const& A,
57 | 	vector_expression<VecB, cpu_tag>& b
58 | ){
59 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
60 | 	REMORA_SIZE_CHECK(A().size1() == b().size());
61 | 
62 | 	bindings::trmv<Upper, Unit>(A, b, typename bindings::has_optimized_trmv<MatA, VecB>::type());
63 | }
64 | 
65 | }}
66 | 
67 | #ifdef REMORA_USE_CLBLAST
68 | #include "clBlast/trmv.hpp"
69 | #elif defined REMORA_USE_OPENCL
70 | #include "opencl/trmv.hpp"
71 | #endif
72 | #if defined(__HCC__) || defined(__NVCC__)
73 | #include "hip/trmv.hpp"
74 | #endif
75 | #endif
76 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trsm.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular solve kernel for matrix expressions.
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
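*
* \par Example
* A usage sketch, for illustration only; remora::lower and remora::left
* are assumed to be the triangular and side tags used by this kernel.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trsm.hpp>
*
* remora::matrix<double> A(3, 3, 0.0);
* for(std::size_t i = 0; i != 3; ++i){
* 	for(std::size_t j = 0; j != i; ++j) A(i, j) = 0.5; // lower triangle
* 	A(i, i) = 2.0;                                     // nonzero diagonal
* }
* remora::matrix<double> B(3, 2, 1.0);
* remora::kernels::trsm<remora::lower, remora::left>(A, B); // B := A^-1 B in place
* \endcode
*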

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRSM_HPP
32 | #define REMORA_KERNELS_TRSM_HPP
33 | 
34 | #include <type_traits> //std::false_type marker for unoptimized
35 | #ifdef REMORA_USE_CBLAS
36 | #include "cblas/trsm.hpp"
37 | #else
38 | // if no bindings are included, we have to provide the default has_optimized_trsm
39 | // otherwise the binding will take care of this
40 | namespace remora{ namespace bindings{
41 | template<class M1, class M2>
42 | struct has_optimized_trsm
43 | 	: public std::false_type{};
44 | }}
45 | #endif
46 | 
47 | #include "default/trsm.hpp"
48 | 
49 | namespace remora{namespace kernels{
50 | 
51 | ///\brief Implements the TRiangular Solver for Matrices (TRSM).
52 | ///
53 | /// It solves systems of the form AX = B (Side = left) or XA = B (Side = right), where A is a
54 | /// square lower or upper triangular matrix; it can optionally assume a unit diagonal, in which case the diagonal elements are not accessed.
55 | template<class Triangular, class Side, class MatA, class MatB>
56 | void trsm(
57 | 	matrix_expression<MatA, cpu_tag> const& A,
58 | 	matrix_expression<MatB, cpu_tag>& B
59 | ){
60 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
61 | 	REMORA_SIZE_CHECK(!Side::is_left || A().size2() == B().size1());
62 | 	REMORA_SIZE_CHECK(Side::is_left || A().size2() == B().size2());
63 | 
64 | 	bindings::trsm<Triangular, Side>(A, B, typename bindings::has_optimized_trsm<MatA, MatB>::type());
65 | }
66 | 
67 | }}
68 | 
69 | #ifdef REMORA_USE_CLBLAST
70 | #include "clBlast/trsm.hpp"
71 | #elif defined REMORA_USE_OPENCL
72 | #include "opencl/trsm.hpp"
73 | #endif
74 | #if defined(__HCC__) || defined(__NVCC__)
75 | #include "hip/trsm.hpp"
76 | #endif
77 | 
78 | #endif
79 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trsv.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular solve kernel for vector expressions.
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
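*
* \par Example
* A usage sketch, for illustration only; remora::lower and remora::left
* are assumed to be the triangular and side tags used by this kernel.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trsv.hpp>
*
* remora::matrix<double> A(2, 2, 0.0);
* A(0, 0) = 2.0; A(1, 0) = 1.0; A(1, 1) = 4.0; // lower triangular
* remora::vector<double> b(2, 1.0);
* remora::kernels::trsv<remora::lower, remora::left>(A, b); // b := A^-1 b in place
* \endcode
*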

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRSV_HPP
32 | #define REMORA_KERNELS_TRSV_HPP
33 | 
34 | #include <type_traits>
35 | #ifdef REMORA_USE_CBLAS
36 | #include "cblas/trsv.hpp"
37 | #else
38 | // if no bindings are included, we have to provide the default has_optimized_trsv
39 | // otherwise the binding will take care of this
40 | namespace remora {namespace bindings{
41 | template<class M, class V>
42 | struct has_optimized_trsv
43 | 	: public std::false_type{};
44 | }}
45 | #endif
46 | 
47 | #include "default/trsv.hpp"
48 | 
49 | namespace remora{namespace kernels{
50 | 
51 | ///\brief Implements the TRiangular Solver for Vectors (TRSV).
52 | ///
53 | /// It solves systems of the form Ax = b where A is a square lower or upper triangular matrix.
54 | /// It can optionally assume that the diagonal is 1 and won't access the diagonal elements.
55 | template<class Triangular, class Side, class MatA, class VecB>
56 | void trsv(
57 | 	matrix_expression<MatA, cpu_tag> const& A,
58 | 	vector_expression<VecB, cpu_tag>& b
59 | ){
60 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
61 | 	REMORA_SIZE_CHECK(A().size1() == b().size());
62 | 
63 | 	bindings::trsv<Triangular, Side>(A, b, typename bindings::has_optimized_trsv<MatA, VecB>::type());
64 | }
65 | 
66 | }}
67 | 
68 | #ifdef REMORA_USE_CLBLAST
69 | #include "clBlast/trsv.hpp"
70 | #elif defined REMORA_USE_OPENCL
71 | #include "opencl/trsv.hpp"
72 | #endif
73 | #if defined(__HCC__) || defined(__NVCC__)
74 | #include "hip/trsv.hpp"
75 | #endif
76 | 
77 | #endif
78 | 
--------------------------------------------------------------------------------
/include/remora/kernels/vector_assign.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | * \brief Assignment kernels for vector expressions
3 | *
4 | * \author O. Krause
5 | * \date 2015
6 | *
7 | *
8 | * \par Copyright 1995-2015 Shark Development Team
9 | *
10 | * <BR>
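*
* \par Example
* A usage sketch of the scalar-assignment dispatcher, for illustration
* only; the functor name device_traits<cpu_tag>::add<double> is an
* assumption about remora's device functor naming.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/vector_assign.hpp>
*
* remora::vector<double> v(10, 1.0);
* // elementwise v_i = v_i + 2.0, dispatched by storage tag and device:
* remora::kernels::assign<remora::device_traits<remora::cpu_tag>::add<double> >(v, 2.0);
* \endcode
*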

11 | * This file is part of Shark.
12 | *
13 | *
14 | * Shark is free software: you can redistribute it and/or modify
15 | * it under the terms of the GNU Lesser General Public License as published
16 | * by the Free Software Foundation, either version 3 of the License, or
17 | * (at your option) any later version.
18 | *
19 | * Shark is distributed in the hope that it will be useful,
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU Lesser General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU Lesser General Public License
25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26 | *
27 | */
28 | #ifndef REMORA_KERNELS_VECTOR_ASSIGN_HPP
29 | #define REMORA_KERNELS_VECTOR_ASSIGN_HPP
30 | 
31 | #include "../detail/traits.hpp"
32 | #include "default/vector_assign.hpp"
33 | #ifdef REMORA_USE_OPENCL
34 | #include "opencl/vector_assign.hpp"
35 | #endif
36 | #if defined(__HCC__) || defined(__NVCC__)
37 | #include "hip/vector_assign.hpp"
38 | #endif
39 | 
40 | 
41 | namespace remora{namespace kernels {
42 | 
43 | 
44 | template<class F, class V, class Device>
45 | void apply(vector_expression<V, Device>& v, F const& f) {
46 | 	bindings::apply(v, f);
47 | }
48 | template<class F, class V, class Device>
49 | void assign(vector_expression<V, Device>& v, typename V::value_type t) {
50 | 	bindings::assign<F>(v, t);
51 | }
52 | 
53 | /////////////////////////////////////////////////////////
54 | //direct assignment of two vectors
55 | ////////////////////////////////////////////////////////
56 | 
57 | //dispatcher
58 | template< class V, class E, class Device>
59 | void assign(vector_expression<V, Device>& v, vector_expression<E, Device> const& e) {
60 | 	REMORA_SIZE_CHECK(v().size() == e().size());
61 | 	typedef typename V::evaluation_category::tag TagV;
62 | 	typedef typename E::evaluation_category::tag TagE;
63 | 	bindings::vector_assign(v, e, TagV(), TagE());
64 | }
65 | 
66 | ////////////////////////////////////////////
67 | //assignment with functor
68 | ////////////////////////////////////////////
69 | 
70 | 
71 | // Dispatcher
72 | template<class V, class E, class F, class Device>
73 | void assign(vector_expression<V, Device>& v, vector_expression<E, Device> const& e, F f) {
74 | 	REMORA_SIZE_CHECK(v().size() == e().size());
75 | 	typedef typename V::evaluation_category::tag TagV;
76 | 	typedef typename E::evaluation_category::tag TagE;
77 | 	bindings::vector_assign_functor(v(), e(), f, TagV(), TagE());
78 | }
79 | 
80 | }}
81 | #endif
82 | 
--------------------------------------------------------------------------------
/include/remora/kernels/vector_fold.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | * \brief Algorithm to reduce a vector to a scalar value
3 | *
4 | * \author O. Krause
5 | * \date 2016
6 | *
7 | *
8 | * \par Copyright 1995-2015 Shark Development Team
9 | *
10 | * <BR>
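*
* \par Example
* A usage sketch, for illustration only; device_traits<cpu_tag>::add<double>
* is an assumption about the addition functor, and F::result_type must
* match the type of the accumulator passed in.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/vector_fold.hpp>
*
* remora::vector<double> v(5, 2.0);
* double sum = 0.0; // initial value of the fold
* remora::kernels::vector_fold<remora::device_traits<remora::cpu_tag>::add<double> >(v, sum);
* // sum == 10.0
* \endcode
*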

11 | * This file is part of Shark.
12 | *
13 | *
14 | * Shark is free software: you can redistribute it and/or modify
15 | * it under the terms of the GNU Lesser General Public License as published
16 | * by the Free Software Foundation, either version 3 of the License, or
17 | * (at your option) any later version.
18 | *
19 | * Shark is distributed in the hope that it will be useful,
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU Lesser General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU Lesser General Public License
25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26 | *
27 | */
28 | #ifndef REMORA_KERNELS_VECTOR_FOLD_HPP
29 | #define REMORA_KERNELS_VECTOR_FOLD_HPP
30 | 
31 | #include "../detail/traits.hpp"
32 | #include "default/vector_fold.hpp"
33 | #ifdef REMORA_USE_OPENCL
34 | #include "opencl/vector_fold.hpp"
35 | #endif
36 | #if defined(__HCC__) || defined(__NVCC__)
37 | #include "hip/vector_fold.hpp"
38 | #endif
39 | 
40 | namespace remora{namespace kernels {
41 | 
42 | 
43 | ///\brief Applies F in any order to the elements of v and a given initial value.
44 | ///
45 | /// The result is the same as value = f(v_1, f(v_2, ... f(v_n, value))) assuming f is commutative
46 | /// and associative.
47 | template<class F, class V, class Device>
48 | void vector_fold(vector_expression<V, Device> const& v, typename F::result_type& value) {
49 | 	typedef typename V::evaluation_category::tag TagV;
50 | 	bindings::vector_fold<F>(v(), value, TagV());
51 | }
52 | 
53 | }}
54 | #endif
55 | 
--------------------------------------------------------------------------------
/include/remora/kernels/vector_max.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Kernel for calculating the index of the maximum element of a vector
5 | *
6 | * \author O. Krause
7 | * \date 2016
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
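*
* \par Example
* A usage sketch, for illustration only, on a dense cpu vector.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/vector_max.hpp>
*
* remora::vector<double> v(4, 0.0);
* v(2) = 3.0;
* std::size_t imax = remora::kernels::vector_max(v); // imax == 2
* \endcode
*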

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_VECTOR_MAX_HPP
31 | #define REMORA_KERNELS_VECTOR_MAX_HPP
32 | 
33 | #include "default/vector_max.hpp"
34 | #ifdef REMORA_USE_OPENCL
35 | #include "opencl/vector_max.hpp"
36 | #endif
37 | #if defined(__HCC__) || defined(__NVCC__)
38 | #include "hip/vector_max.hpp"
39 | #endif
40 | 
41 | namespace remora { namespace kernels{
42 | 
43 | ///\brief Computes the index of the maximum element of a vector
44 | template<class E, class Device>
45 | std::size_t vector_max(
46 | 	vector_expression<E, Device> const& e
47 | ) {
48 | 	REMORA_SIZE_CHECK(e().size() != 0);
49 | 	return bindings::vector_max(e, typename E::evaluation_category::tag());
50 | }
51 | 
52 | }}
53 | #endif
--------------------------------------------------------------------------------
/include/remora/remora.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief includes all Remora files needed by Shark linear algebra
5 | *
6 | *
7 | * \author O. Krause
8 | * \date 2012
9 | *
10 | *
11 | * \par Copyright 1995-2015 Shark Development Team
12 | *
13 | * <BR>
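*
* \par Example
* A high-level usage sketch once this aggregate header is included, for
* illustration only; the tag types symm_pos_def and left passed to solve()
* are assumptions about the solver interface in solve.hpp.
* \code
* #include <remora/remora.hpp>
*
* remora::matrix<double> A(3, 3, 0.1);
* for(std::size_t i = 0; i != 3; ++i) A(i, i) = 2.0; // s.p.d. system matrix
* remora::vector<double> b(3, 1.0);
* remora::vector<double> x = remora::solve(A, b, remora::symm_pos_def(), remora::left());
* \endcode
*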

14 | * This file is part of Shark.
15 | *
16 | *
17 | * Shark is free software: you can redistribute it and/or modify
18 | * it under the terms of the GNU Lesser General Public License as published
19 | * by the Free Software Foundation, either version 3 of the License, or
20 | * (at your option) any later version.
21 | *
22 | * Shark is distributed in the hope that it will be useful,
23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 | * GNU Lesser General Public License for more details.
26 | *
27 | * You should have received a copy of the GNU Lesser General Public License
28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
29 | *
30 | */
31 | 
32 | #ifndef SHARK_LINALG_BLAS_REMORA_HPP
33 | #define SHARK_LINALG_BLAS_REMORA_HPP
34 | 
35 | //expressions
36 | #include "vector_expression.hpp"
37 | #include "matrix_expression.hpp"
38 | #include "solve.hpp"
39 | //containers
40 | #include "dense.hpp"
41 | #include "sparse.hpp"
42 | 
43 | //misc
44 | #include "permutation.hpp"
45 | #include "io.hpp"
46 | #include "random.hpp"
47 | #include "device_copy.hpp"
48 | 
49 | #endif
50 | 
--------------------------------------------------------------------------------