├── .gitignore
├── CMakeLists.txt
├── README.md
├── Test
│   ├── CMakeLists.txt
│   ├── cholesky.cpp
│   ├── conv2d.cpp
│   ├── dense.cpp
│   ├── eigensymm.cpp
│   ├── expression_optimizer.cpp
│   ├── general_solve.cpp
│   ├── getrf.cpp
│   ├── hip_cholesky.cpp
│   ├── hip_copy.cpp
│   ├── hip_dense.cpp
│   ├── hip_matrix_assign.cpp
│   ├── hip_matrix_expression.cpp
│   ├── hip_prod.cpp
│   ├── hip_random.cpp
│   ├── hip_syrk.cpp
│   ├── hip_triangular_prod.cpp
│   ├── hip_triangular_solve.cpp
│   ├── hip_vector_assign.cpp
│   ├── hip_vector_expression.cpp
│   ├── hip_vector_set_expression.cpp
│   ├── iterators.cpp
│   ├── matrix_assign.cpp
│   ├── matrix_expression.cpp
│   ├── matrix_proxy.cpp
│   ├── opencl_cholesky.cpp
│   ├── opencl_conv2d.cpp
│   ├── opencl_copy.cpp
│   ├── opencl_dense.cpp
│   ├── opencl_matrix_assign.cpp
│   ├── opencl_matrix_expression.cpp
│   ├── opencl_prod.cpp
│   ├── opencl_random.cpp
│   ├── opencl_syrk.cpp
│   ├── opencl_triangular_prod.cpp
│   ├── opencl_triangular_solve.cpp
│   ├── opencl_vector_assign.cpp
│   ├── opencl_vector_expression.cpp
│   ├── opencl_vector_set_expression.cpp
│   ├── prod.cpp
│   ├── random.cpp
│   ├── sparse.cpp
│   ├── symm_solve.cpp
│   ├── syrk.cpp
│   ├── triangular_matrix.cpp
│   ├── triangular_prod.cpp
│   ├── triangular_solve.cpp
│   ├── vector_assign.cpp
│   ├── vector_expression.cpp
│   └── vector_set_expression.cpp
├── doc
│   ├── CMakeLists.txt
│   ├── sphinx_pages
│   │   ├── conf.py.in
│   │   ├── index.tut
│   │   └── quick_ref.rst
│   └── tutToRst
│       ├── CMakeLists.txt
│       └── tut2rst.cpp
├── examples
│   ├── Benchmarks
│   │   ├── Timer.hpp
│   │   ├── conv2d.cpp
│   │   ├── gemm.cpp
│   │   ├── opencl_conv2d.cpp
│   │   ├── potrf.cpp
│   │   ├── syrk.cpp
│   │   └── trmm.cpp
│   ├── CMakeLists.txt
│   ├── createExampleSource.cmake
│   └── linear_regression.tpp
└── include
    ├── CMakeLists.txt
    └── remora
        ├── assignment.hpp
        ├── cpu
        │   ├── dense.hpp
        │   ├── iterator.hpp
        │   ├── sparse.hpp
        │   ├── sparse_matrix.hpp
        │   └── traits.hpp
        ├── decompositions.hpp
        ├── dense.hpp
        ├── detail
        │   ├── check.hpp
        │   ├── evaluation_tags.hpp
        │   ├── expression_optimizers.hpp
        │   ├── matrix_expression_classes.hpp
        │   ├── proxy_optimizers_fwd.hpp
        │   ├── storage.hpp
        │   ├── structure.hpp
        │   ├── traits.hpp
        │   ├── vector_expression_classes.hpp
        │   └── vector_set.hpp
        ├── device_copy.hpp
        ├── expression_types.hpp
        ├── hip
        │   ├── buffer.hpp
        │   ├── copy.hpp
        │   ├── cublas_backend.hpp
        │   ├── dense.hpp
        │   ├── device.hpp
        │   ├── exception.hpp
        │   └── traits.hpp
        ├── io.hpp
        ├── kernels
        │   ├── atlas
        │   │   └── potrf.hpp
        │   ├── cblas
        │   │   ├── cblas_inc.hpp
        │   │   ├── dense_gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── syrk.hpp
        │   │   ├── tpmv.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   └── trsv.hpp
        │   ├── clBlast
        │   │   ├── conv2d.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── syrk.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   └── trsv.hpp
        │   ├── conv2d.hpp
        │   ├── default
        │   │   ├── boost_align
        │   │   │   ├── aligned_alloc.hpp
        │   │   │   ├── aligned_allocator.hpp
        │   │   │   ├── assume_aligned.hpp
        │   │   │   └── detail
        │   │   │       ├── aligned_alloc.hpp
        │   │   │       ├── aligned_alloc_android.hpp
        │   │   │       ├── aligned_alloc_macos.hpp
        │   │   │       ├── aligned_alloc_msvc.hpp
        │   │   │       ├── aligned_alloc_posix.hpp
        │   │   │       ├── aligned_alloc_sunos.hpp
        │   │   │       ├── assume_aligned.hpp
        │   │   │       ├── assume_aligned_clang.hpp
        │   │   │       ├── assume_aligned_gcc.hpp
        │   │   │       ├── assume_aligned_intel.hpp
        │   │   │       ├── assume_aligned_msvc.hpp
        │   │   │       ├── is_alignment.hpp
        │   │   │       ├── is_alignment_constant.hpp
        │   │   │       ├── max_objects.hpp
        │   │   │       └── max_size.hpp
        │   │   ├── conv2d.hpp
        │   │   ├── dense_gemm.hpp
        │   │   ├── dot.hpp
        │   │   ├── fold_rows.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── getrf.hpp
        │   │   ├── matrix_assign.hpp
        │   │   ├── mgemm.hpp
        │   │   ├── potrf.hpp
        │   │   ├── pstrf.hpp
        │   │   ├── random.hpp
        │   │   ├── simd.hpp
        │   │   ├── syev.hpp
        │   │   ├── syrk.hpp
        │   │   ├── tpmv.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   ├── trsv.hpp
        │   │   ├── vector_assign.hpp
        │   │   ├── vector_fold.hpp
        │   │   └── vector_max.hpp
        │   ├── fold_rows.hpp
        │   ├── gemm.hpp
        │   ├── gemv.hpp
        │   ├── getrf.hpp
        │   ├── hip
        │   │   ├── fold_rows.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── matrix_assign.hpp
        │   │   ├── potrf.hpp
        │   │   ├── random.hpp
        │   │   ├── syrk.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   ├── trsv.hpp
        │   │   ├── vector_assign.hpp
        │   │   ├── vector_fold.hpp
        │   │   └── vector_max.hpp
        │   ├── lapack
        │   │   ├── fortran.hpp
        │   │   └── syev.hpp
        │   ├── matrix_assign.hpp
        │   ├── opencl
        │   │   ├── fold_rows.hpp
        │   │   ├── gemm.hpp
        │   │   ├── gemv.hpp
        │   │   ├── matrix_assign.hpp
        │   │   ├── potrf.hpp
        │   │   ├── random.hpp
        │   │   ├── syrk.hpp
        │   │   ├── trmm.hpp
        │   │   ├── trmv.hpp
        │   │   ├── trsm.hpp
        │   │   ├── trsv.hpp
        │   │   ├── vector_assign.hpp
        │   │   ├── vector_fold.hpp
        │   │   └── vector_max.hpp
        │   ├── potrf.hpp
        │   ├── pstrf.hpp
        │   ├── random.hpp
        │   ├── syev.hpp
        │   ├── syrk.hpp
        │   ├── tpmv.hpp
        │   ├── trmm.hpp
        │   ├── trmv.hpp
        │   ├── trsm.hpp
        │   ├── trsv.hpp
        │   ├── vector_assign.hpp
        │   ├── vector_fold.hpp
        │   └── vector_max.hpp
        ├── matrix_expression.hpp
        ├── opencl
        │   ├── copy.hpp
        │   ├── dense.hpp
        │   └── traits.hpp
        ├── permutation.hpp
        ├── proxy_expressions.hpp
        ├── random.hpp
        ├── remora.hpp
        ├── solve.hpp
        ├── sparse.hpp
        ├── triangular_matrix.hpp
        └── vector_expression.hpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Remora BLAS Library
2 | ================================
3 | 
4 | Remora is a general-purpose linear algebra library written in C++.
5 | It features:
6 | 
7 | * Dense and sparse matrix and vector operations
8 | * A basic set of optimized routines for matrix products, solving linear systems of equations, etc.
9 | * Bindings to highly optimized routines of BLAS packages
10 | * A powerful expression template syntax which features algebraic optimizations of operations
11 | * (experimental and very early) GPU support via OpenCL
12 | 
13 | Remora is used by the Shark machine learning library.
14 | 
15 | Installation
16 | ---------------------------------------
17 | 
18 | Remora is header-only: just download and copy the contents of the include/ folder to its
19 | target location. Remora depends on the Boost C++ libraries.
20 | When using Remora, the following defines can be supplied at compile time:
21 | 
22 | * REMORA_USE_SIMD: if defined, Remora uses the compiler's auto-vectorization capabilities
23 | to speed up its computational routines. Requires g++ or clang.
24 | * REMORA_USE_CBLAS: if defined, Remora binds to a CBLAS library.
25 | On macOS, this flag is interpreted as using the Accelerate framework.
26 | Make sure to add the appropriate compile and linker flags for the library.
27 | * REMORA_USE_GPU: if defined, Remora enables GPU support via the Boost.Compute
28 | library. Highly experimental.
29 | * REMORA_USE_CLBLAST: if defined, Remora uses CLBlast as the GPU/OpenCL backend.
30 | This should be more stable and give better performance on most devices.
31 | 
32 | Contributing
33 | ----------------------------------------------------------
34 | Contributing is easy via [Pull Requests][1]. We are open
35 | to all types of contribution, but we favour them in the following order:
36 | 
37 | 1. Bug fixes, test cases, documentation, benchmarks, examples...
38 | 2. Optimizing existing computational routines found in include/Remora/kernels/
39 | 3. Adding new computational routines in include/Remora/kernels/
40 | 4. 
extending the expression template system 41 | 42 | The reason for this order is that extending the expression template system often leads 43 | to a large increase of possible operation combinations to cover, some of which might not 44 | be implemented in the kernel routines and thus leading to compile errors or inefficient code. 45 | We would like to prevent this by favouring the quality and number of underlying computational 46 | routines, which are very often easier to implement (and it is okay if those routines are very 47 | specialized). 48 | 49 | 50 | [1]: https://github.com/Shark-ML/Remora/pulls 51 | -------------------------------------------------------------------------------- /Test/eigensymm.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_eigensymm 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | using namespace remora; 9 | 10 | BOOST_AUTO_TEST_SUITE (Remora_eigensymm) 11 | 12 | matrix createSymm(std::size_t dimensions, std::size_t rank = 0){ 13 | if(rank == 0) rank = dimensions; 14 | matrix R(dimensions,dimensions,0.0); 15 | 16 | for(std::size_t i = 0; i != dimensions; ++i){ 17 | for(std::size_t j = 0; j A = prod(R,trans(R)); 24 | if(rank != dimensions){ 25 | for(std::size_t i = 0; i != rank/2; ++i){ 26 | A.swap_rows(2*i,dimensions-i-1); 27 | A.swap_columns(2*i,dimensions-i-1); 28 | } 29 | } 30 | return A; 31 | } 32 | 33 | BOOST_AUTO_TEST_CASE( Remora_eigensymm_decomposition) 34 | { 35 | std::size_t Dimensions = 123; 36 | matrix A = createSymm(Dimensions); 37 | 38 | symm_eigenvalue_decomposition > solver(A); 39 | 40 | matrix Atest = solver.Q() % to_diagonal(solver.D()) % trans(solver.Q()); 41 | BOOST_CHECK_SMALL(norm_inf(Atest-A),norm_inf(A) * 1.e-12); 42 | 43 | } 44 | 45 | BOOST_AUTO_TEST_CASE( Remora_eigensymm_solve ) 46 | { 47 | std::size_t Dimensions = 153; 48 | std::size_t K = 35; 49 | //first generate a suitable eigenvalue problem matrix A 50 | matrix A = createSymm(Dimensions); 51 | 52 | symm_eigenvalue_decomposition > solver(A); 53 | cholesky_decomposition > solver_cholesky(A); 54 | 55 | matrix B(Dimensions,K); 56 | for(std::size_t i = 0; i != K; ++i){ 57 | for(std::size_t j = 0; j != Dimensions; ++j){ 58 | B(j,i) = (1.0 + j+K)/Dimensions; 59 | } 60 | } 61 | 62 | vector b(Dimensions); 63 | for(std::size_t j = 0; j != Dimensions; ++j){ 64 | b(j) = (1.0 + j)/Dimensions; 65 | } 66 | 67 | { 68 | vector sol=b; 69 | vector sol2=b; 70 | solver.solve(sol,left()); 71 | solver_cholesky.solve(sol2,left()); 72 | BOOST_CHECK_SMALL(norm_2(sol - sol2), norm_2(sol2)*1.e-8); 73 | } 74 | { 75 | vector sol=b; 76 | vector sol2=b; 77 | solver.solve(sol,right()); 78 | solver_cholesky.solve(sol2,right()); 79 | BOOST_CHECK_SMALL(norm_2(sol - sol2), norm_2(sol2)*1.e-8); 80 | } 81 | 82 | { 83 | matrix sol=B; 84 | matrix sol2=B; 85 | solver.solve(sol,left()); 86 | solver_cholesky.solve(sol2,left()); 87 | BOOST_CHECK_SMALL(norm_inf(sol - sol2), norm_inf(sol2)*1.e-8); 88 | } 89 | { 90 | matrix sol=trans(B); 91 | matrix sol2=trans(B); 92 | solver.solve(sol,right()); 93 | solver_cholesky.solve(sol2,right()); 94 | BOOST_CHECK_SMALL(norm_frobenius(sol - sol2), norm_frobenius(sol2)*1.e-8); 95 | } 96 | } 97 | BOOST_AUTO_TEST_SUITE_END() 98 | -------------------------------------------------------------------------------- /Test/getrf.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Getrf 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 
| #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | using namespace remora; 14 | 15 | //the matrix is designed such that permutation will always give the next row 16 | matrix createMatrix(std::size_t dimensions){ 17 | matrix L(dimensions,dimensions,0.0); 18 | matrix U(dimensions,dimensions,0.0); 19 | 20 | for(std::size_t i = 0; i != dimensions; ++i){ 21 | for(std::size_t j = 0; j A = prod(L,U); 29 | return A; 30 | } 31 | typedef boost::mpl::list result_orientations; 32 | 33 | 34 | BOOST_AUTO_TEST_SUITE (Remora_Cholesky) 35 | 36 | BOOST_AUTO_TEST_CASE_TEMPLATE(Remora_Potrf, Orientation,result_orientations) { 37 | std::size_t Dimensions = 123; 38 | //first generate a suitable eigenvalue problem matrix A 39 | matrix A = createMatrix(Dimensions); 40 | //calculate lu decomposition 41 | permutation_matrix P(Dimensions); 42 | matrix dec = A; 43 | kernels::getrf(dec,P); 44 | 45 | //copy upper matrix to temporary 46 | matrix upper(Dimensions,Dimensions,0.0); 47 | for (size_t row = 0; row < Dimensions; row++){ 48 | for (size_t col = row; col < Dimensions ; col++){ 49 | upper(row, col) = dec(row, col); 50 | } 51 | } 52 | 53 | //create reconstruction of A 54 | matrix testA = triangular_prod(dec,upper); 55 | swap_rows_inverted(P,testA); 56 | 57 | //test reconstruction error 58 | double error = max(abs(A - testA)); 59 | BOOST_CHECK_SMALL(error,1.e-12); 60 | BOOST_CHECK(!(boost::math::isnan)(norm_frobenius(testA)));//test for nans 61 | } 62 | 63 | BOOST_AUTO_TEST_SUITE_END() 64 | -------------------------------------------------------------------------------- /Test/hip_cholesky.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Cholesky 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | using namespace remora; 12 | 13 | //the matrix is designed such that a lot of permutations will be performed 14 | matrix createSymm(std::size_t dimensions, std::size_t rank = 0){ 15 | if(rank == 0) rank = dimensions; 16 | matrix R(dimensions,dimensions,0.0); 17 | 18 | for(std::size_t i = 0; i != dimensions; ++i){ 19 | for(std::size_t j = 0; j A = prod(R,trans(R)); 26 | if(rank != dimensions){ 27 | for(std::size_t i = 0; i != rank/2; ++i){ 28 | A.swap_rows(2*i,dimensions-i-1); 29 | A.swap_columns(2*i,dimensions-i-1); 30 | } 31 | } 32 | return A; 33 | } 34 | BOOST_AUTO_TEST_SUITE (Remora_Cholesky) 35 | 36 | template 37 | void potrf_test(Orientation) { 38 | std::size_t Dimensions = 123; 39 | //first generate a suitable eigenvalue problem matrix A 40 | matrix A = createSymm(Dimensions); 41 | //calculate Cholesky 42 | matrix lowDec_opencl = copy_to_device(A, hip_tag()); 43 | matrix upDec_opencl = copy_to_device(A, hip_tag()); 44 | kernels::potrf(lowDec_opencl); 45 | kernels::potrf(upDec_opencl); 46 | matrix lowDec = copy_to_cpu(lowDec_opencl); 47 | matrix upDec = copy_to_cpu(upDec_opencl); 48 | matrix lowDec_test = A; 49 | matrix upDec_test = A; 50 | kernels::potrf(lowDec_test); 51 | kernels::potrf(upDec_test); 52 | //check that upper diagonal elements are correct and set them to zero 53 | for (size_t row = 0; row < Dimensions; row++){ 54 | for (size_t col =0; col < Dimensions ; col++){ 55 | BOOST_CHECK_CLOSE(lowDec(row, col), lowDec_test(row,col),1.e-12); 56 | BOOST_CHECK_CLOSE(upDec(row, col), upDec_test(row,col),1.e-12); 57 | } 58 | } 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_CASE(Remora_Potrf) { 63 | potrf_test(row_major()); 64 | potrf_test(column_major()); 65 | } 66 | 
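// Note: kernels::potrf factorizes in place; the loops above compare every
// entry of the device factorization against the CPU reference, so deviations
// in either triangle of the output are caught.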
BOOST_AUTO_TEST_SUITE_END() 67 | -------------------------------------------------------------------------------- /Test/hip_matrix_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_HIP_MatrixAssign 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | using namespace remora; 11 | 12 | template 13 | void checkMatrixEqual(M1 const& m1_opencl, M2 const& m2_opencl){ 14 | BOOST_REQUIRE_EQUAL(m1_opencl.size1(),m2_opencl.size1()); 15 | BOOST_REQUIRE_EQUAL(m1_opencl.size2(),m2_opencl.size2()); 16 | 17 | matrix m1 = copy_to_cpu(m1_opencl); 18 | matrix m2 = copy_to_cpu(m2_opencl); 19 | for(std::size_t i = 0; i != m2.size1(); ++i){ 20 | for(std::size_t j = 0; j != m2.size2(); ++j){ 21 | BOOST_CHECK_EQUAL(m1(i,j),m2(i,j)); 22 | } 23 | } 24 | } 25 | 26 | BOOST_AUTO_TEST_SUITE (Remora_opencl_matrix_assign) 27 | 28 | BOOST_AUTO_TEST_CASE( Remora_Matrix_Assign_Dense ){ 29 | std::cout<<"testing dense-dense assignment"< source_cpu(100,237); 31 | matrix target_cpu(100,237); 32 | matrix result_add_cpu(100,237); 33 | matrix result_add_scalar_cpu(100,237); 34 | float scalar = 10; 35 | for(std::size_t i = 0; i != 100; ++i){ 36 | for(std::size_t j = 0; j != 237; ++j){ 37 | source_cpu(i,j) = 2*i+1+0.3*j; 38 | target_cpu(i,j) = 3*i+2+0.3*j; 39 | result_add_cpu(i,j) = source_cpu(i,j) + target_cpu(i,j); 40 | result_add_scalar_cpu(i,j) = target_cpu(i,j) + scalar; 41 | } 42 | } 43 | matrix source = copy_to_device(source_cpu, hip_tag()); 44 | matrix source_cm = copy_to_device(source_cpu, hip_tag()); 45 | matrix result_add = copy_to_device(result_add_cpu, hip_tag()); 46 | matrix result_add_scalar = copy_to_device(result_add_scalar_cpu, hip_tag()); 47 | { 48 | std::cout<<"testing direct assignment row-row"< target = copy_to_device(target_cpu, hip_tag()); 50 | kernels::assign(target,source); 51 | checkMatrixEqual(target,source); 52 | } 53 | { 54 | std::cout<<"testing functor assignment row-row"< target = copy_to_device(target_cpu, hip_tag()); 56 | kernels::assign(target,source, device_traits::add()); 57 | checkMatrixEqual(target,result_add); 58 | } 59 | { 60 | std::cout<<"testing direct assignment row-column"< target = copy_to_device(target_cpu, hip_tag()); 62 | kernels::assign(target,source_cm); 63 | checkMatrixEqual(target,source_cm); 64 | } 65 | { 66 | std::cout<<"testing functor assignment row-column"< target = copy_to_device(target_cpu, hip_tag()); 68 | kernels::assign(target,source_cm, device_traits::add()); 69 | checkMatrixEqual(target,result_add); 70 | } 71 | { 72 | std::cout<<"testing functor scalar assignment"< target = copy_to_device(target_cpu, hip_tag()); 74 | kernels::assign::add >(target,scalar); 75 | checkMatrixEqual(target,result_add_scalar); 76 | } 77 | 78 | } 79 | 80 | BOOST_AUTO_TEST_SUITE_END() 81 | -------------------------------------------------------------------------------- /Test/hip_syrk.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_HIP_Syrk 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | #include 15 | using namespace remora; 16 | 17 | template 18 | void checkSyrk(M const& arg_opencl, Result const& result_opencl,double init, double alpha, bool upper){ 19 | BOOST_REQUIRE_EQUAL(arg_opencl.size1(), result_opencl.size1()); 20 | BOOST_REQUIRE_EQUAL(result_opencl.size1(), result_opencl.size2()); 
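// Note: syrk computes C = alpha * A * A^T + C, writing only the requested
// triangle of C. The branches below therefore expect every entry of the
// untouched triangle to still equal the initial value `init`, and every entry
// of the written triangle to equal alpha * inner_prod(row(arg,i), row(arg,j)) + init.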
21 | 22 | matrix arg = copy_to_cpu(arg_opencl); 23 | matrix result = copy_to_cpu(result_opencl); 24 | 25 | if(upper){ 26 | for(std::size_t i = 0; i != result.size1(); ++i) { 27 | for(std::size_t j = 0; j != result.size2(); ++j) { 28 | if(j < i){ 29 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 30 | }else{ 31 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 32 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 33 | } 34 | } 35 | } 36 | }else{ 37 | for(std::size_t i = 0; i != result.size1(); ++i) { 38 | for(std::size_t j = 0; j != result.size2(); ++j) { 39 | if(j > i){ 40 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 41 | }else{ 42 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 43 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 44 | } 45 | } 46 | } 47 | } 48 | } 49 | 50 | BOOST_AUTO_TEST_SUITE (Remora_HIP_Syrk) 51 | 52 | template 53 | void syrk_test(Orientation) { 54 | std::size_t dims = 936;//chosen as not to be a multiple of the block size 55 | std::size_t K = 1039; 56 | 57 | //rhs 58 | matrix arg_cpu(dims, K, 1.0); 59 | for(std::size_t i = 0; i != dims; ++i) { 60 | for(std::size_t j = 0; j != K; ++j) { 61 | arg_cpu(i, j) = (1.0/ dims) * i + 0.2/K * j + 1; 62 | } 63 | } 64 | 65 | matrix argrm = copy_to_device(arg_cpu, hip_tag()); 66 | matrix argcm = copy_to_device(arg_cpu, hip_tag()); 67 | 68 | std::cout << "\nchecking syrk V+=AA^T" << std::endl; 69 | { 70 | std::cout<<"row major A, lower V"< result(dims,dims,3.0); 72 | kernels::syrk(argrm,result, 2.0); 73 | checkSyrk(argrm,result, 3.0, 2.0,false); 74 | } 75 | { 76 | std::cout<<"row major A, upper V"< result(dims,dims,3.0); 78 | kernels::syrk(argrm,result, 2.0); 79 | checkSyrk(argrm,result, 3.0, 2.0,true); 80 | } 81 | { 82 | std::cout<<"column major A, lower V"< result(dims,dims,3.0); 84 | kernels::syrk(argcm,result, 2.0); 85 | checkSyrk(argrm,result, 3.0, 2.0,false); 86 | } 87 | { 88 | std::cout<<"column major A, upper V"< result(dims,dims,3.0); 90 | kernels::syrk(argcm,result, 2.0); 91 | checkSyrk(argrm,result, 3.0, 2.0,true); 92 | } 93 | } 94 | 95 | BOOST_AUTO_TEST_CASE(HIP_syrk){ 96 | syrk_test(row_major()); 97 | syrk_test(column_major()); 98 | } 99 | 100 | BOOST_AUTO_TEST_SUITE_END() 101 | -------------------------------------------------------------------------------- /Test/hip_vector_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_HIP_VectorAssign 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | using namespace remora; 11 | 12 | template 13 | void checkVectorEqual(V1 const& v1_opencl, V2 const& v2_opencl){ 14 | BOOST_REQUIRE_EQUAL(v1_opencl.size(),v2_opencl.size()); 15 | 16 | vector v1 = copy_to_cpu(v1_opencl); 17 | vector v2 = copy_to_cpu(v2_opencl); 18 | for(std::size_t i = 0; i != v2.size(); ++i){ 19 | BOOST_CHECK_EQUAL(v1(i),v2(i)); 20 | } 21 | } 22 | 23 | BOOST_AUTO_TEST_SUITE (Remora_HIP_vector_assign) 24 | 25 | BOOST_AUTO_TEST_CASE( Remora_Vector_Assign_Dense ){ 26 | std::cout<<"testing dense-dense assignment"< source_cpu(1000); 28 | vector target_cpu(1000); 29 | vector result_add_cpu(1000); 30 | vector result_add_scalar_cpu(1000); 31 | unsigned int scalar = 10; 32 | for(std::size_t i = 0; i != 1000; ++i){ 33 | source_cpu(i) = 2*i+1; 34 | target_cpu(i) = 3*i+2; 35 | result_add_cpu(i) = source_cpu(i) + target_cpu(i); 36 | result_add_scalar_cpu(i) = target_cpu(i) + scalar; 37 | } 38 | vector source = copy_to_device(source_cpu, hip_tag()); 39 | vector 
result_add = copy_to_device(result_add_cpu, hip_tag()); 40 | vector result_add_scalar = copy_to_device(result_add_scalar_cpu, hip_tag()); 41 | { 42 | std::cout<<"testing direct assignment"< target = copy_to_device(target_cpu, hip_tag()); 44 | kernels::assign(target,source); 45 | checkVectorEqual(target,source); 46 | } 47 | { 48 | std::cout<<"testing functor assignment"< target = copy_to_device(target_cpu, hip_tag()); 50 | kernels::assign(target,source, device_traits::add()); 51 | checkVectorEqual(target,result_add); 52 | } 53 | { 54 | std::cout<<"testing functor scalar assignment"< target = copy_to_device(target_cpu, hip_tag()); 56 | kernels::assign::add >(target,scalar); 57 | checkVectorEqual(target,result_add_scalar); 58 | } 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_SUITE_END() 63 | -------------------------------------------------------------------------------- /Test/opencl_cholesky.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Cholesky 2 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | using namespace remora; 15 | 16 | //the matrix is designed such that a lot of permutations will be performed 17 | matrix createSymm(std::size_t dimensions, std::size_t rank = 0){ 18 | if(rank == 0) rank = dimensions; 19 | matrix R(dimensions,dimensions,0.0); 20 | 21 | for(std::size_t i = 0; i != dimensions; ++i){ 22 | for(std::size_t j = 0; j A = prod(R,trans(R)); 29 | if(rank != dimensions){ 30 | for(std::size_t i = 0; i != rank/2; ++i){ 31 | A.swap_rows(2*i,dimensions-i-1); 32 | A.swap_columns(2*i,dimensions-i-1); 33 | } 34 | } 35 | return A; 36 | } 37 | typedef boost::mpl::list result_orientations; 38 | 39 | 40 | BOOST_AUTO_TEST_SUITE (Remora_Cholesky) 41 | 42 | BOOST_AUTO_TEST_CASE_TEMPLATE(Remora_Potrf, Orientation,result_orientations) { 43 | std::size_t Dimensions = 123; 44 | //first generate a suitable eigenvalue problem matrix A 45 | matrix A = createSymm(Dimensions); 46 | //calculate Cholesky 47 | matrix lowDec_opencl = copy_to_opencl(A); 48 | matrix upDec_opencl = copy_to_opencl(A); 49 | kernels::potrf(lowDec_opencl); 50 | kernels::potrf(upDec_opencl); 51 | matrix lowDec = copy_to_cpu(lowDec_opencl); 52 | matrix upDec = copy_to_cpu(upDec_opencl); 53 | matrix lowDec_test = A; 54 | matrix upDec_test = A; 55 | kernels::potrf(lowDec_test); 56 | kernels::potrf(upDec_test); 57 | //check that upper diagonal elements are correct and set them to zero 58 | for (size_t row = 0; row < Dimensions; row++){ 59 | for (size_t col =0; col < Dimensions ; col++){ 60 | BOOST_CHECK_CLOSE(lowDec(row, col), lowDec_test(row,col),1.e-12); 61 | BOOST_CHECK_CLOSE(upDec(row, col), upDec_test(row,col),1.e-12); 62 | } 63 | } 64 | 65 | } 66 | 67 | BOOST_AUTO_TEST_SUITE_END() 68 | -------------------------------------------------------------------------------- /Test/opencl_conv2d.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_Conv2d 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | using namespace remora; 12 | 13 | void test( 14 | std::size_t image_size1, std::size_t image_size2, 15 | std::size_t filter_size1, std::size_t filter_size2, 16 | std::size_t num_channels, 17 | std::size_t num_filters, 18 | std::size_t num_images, 19 | std::size_t padding_height = 0, 20 | std::size_t 
padding_width = 0 21 | ){ 22 | //create filter on CPU 23 | vector filter(num_channels * num_filters * filter_size1 * filter_size2); 24 | { 25 | std::size_t lin_elem = 0; 26 | for(std::size_t f = 0; f != num_filters; ++f){ 27 | for(std::size_t i = 0; i != filter_size1; ++i){ 28 | for(std::size_t j = 0; j != filter_size2; ++j){ 29 | for(std::size_t c = 0; c != num_channels; ++c, ++lin_elem){ 30 | double val = 1.0/(num_channels * filter_size1)*i + 0.1 - (0.1/filter_size2)*j+0.01*f-0.01*c; 31 | filter(lin_elem) = val; 32 | } 33 | } 34 | } 35 | } 36 | } 37 | 38 | //Create images on CPU 39 | matrix image(num_images, num_channels * image_size1 * image_size2); 40 | //create images and ground truth 41 | for(std::size_t im = 0; im != num_images; ++im){ 42 | std::size_t lin_elem = 0; 43 | for(std::size_t i = 0; i != image_size1; ++i){ 44 | for(std::size_t j = 0; j != image_size2; ++j){ 45 | for(std::size_t c = 0; c != num_channels; ++c, ++lin_elem){ 46 | image(im,lin_elem) = 1.0/(num_channels * image_size1)*i + 0.1 - (0.1/image_size2)*j; 47 | } 48 | } 49 | } 50 | } 51 | 52 | //copy to Device 53 | vector filter_opencl = copy_to_opencl(filter); 54 | matrix image_opencl = copy_to_opencl(image); 55 | 56 | //Reserve enough space for output 57 | 58 | std::size_t output_size1 = image_size1 - filter_size1 + 1 + padding_height; 59 | std::size_t output_size2 = image_size2 - filter_size2 + 1 + padding_width; 60 | matrix out(num_images, output_size1 * output_size2 * num_filters, 0.0); 61 | matrix out_opencl(num_images, output_size1 * output_size2 * num_filters, 0.0); 62 | 63 | 64 | //compute baseline and opencl result 65 | kernels::conv2d( 66 | image,filter,out,num_channels, num_filters, 67 | image_size1, image_size2, filter_size1, filter_size2, 68 | padding_height, padding_width 69 | ); 70 | 71 | kernels::conv2d( 72 | image_opencl,filter_opencl,out_opencl,num_channels, num_filters, 73 | image_size1, image_size2, filter_size1, filter_size2, 74 | padding_height, padding_width 75 | ); 76 | 77 | //copy result back and test 78 | matrix out_cpu = copy_to_cpu(out_opencl); 79 | 80 | for(std::size_t im = 0; im != num_images; ++im){ 81 | for(std::size_t k = 0; k != out.size2(); ++k){ 82 | BOOST_CHECK_CLOSE(out(im,k),out_cpu(im,k),1.e-2); 83 | } 84 | } 85 | } 86 | 87 | 88 | 89 | BOOST_AUTO_TEST_SUITE(Remora_Conv2d) 90 | 91 | BOOST_AUTO_TEST_CASE(conv2d_test) { 92 | test(32,16,4,8,5,1,1); 93 | test(16,12,4,8,4,4,3); 94 | test(57,33,7,3,22,15,3); 95 | } 96 | 97 | BOOST_AUTO_TEST_SUITE_END() 98 | -------------------------------------------------------------------------------- /Test/opencl_matrix_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_MatrixAssign 2 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | using namespace remora; 12 | 13 | template 14 | void checkMatrixEqual(M1 const& m1_opencl, M2 const& m2_opencl){ 15 | BOOST_REQUIRE_EQUAL(m1_opencl.size1(),m2_opencl.size1()); 16 | BOOST_REQUIRE_EQUAL(m1_opencl.size2(),m2_opencl.size2()); 17 | 18 | matrix m1 = copy_to_cpu(m1_opencl); 19 | matrix m2 = copy_to_cpu(m2_opencl); 20 | for(std::size_t i = 0; i != m2.size1(); ++i){ 21 | for(std::size_t j = 0; j != m2.size2(); ++j){ 22 | BOOST_CHECK_EQUAL(m1(i,j),m2(i,j)); 23 | } 24 | } 25 | } 26 | 27 | BOOST_AUTO_TEST_SUITE (Remora_opencl_matrix_assign) 28 | 29 | BOOST_AUTO_TEST_CASE( Remora_Matrix_Assign_Dense ){ 30 | std::cout<<"testing dense-dense 
assignment"< source_cpu(100,237); 32 | matrix target_cpu(100,237); 33 | matrix result_add_cpu(100,237); 34 | matrix result_add_scalar_cpu(100,237); 35 | float scalar = 10; 36 | for(std::size_t i = 0; i != 100; ++i){ 37 | for(std::size_t j = 0; j != 237; ++j){ 38 | source_cpu(i,j) = 2*i+1+0.3*j; 39 | target_cpu(i,j) = 3*i+2+0.3*j; 40 | result_add_cpu(i,j) = source_cpu(i,j) + target_cpu(i,j); 41 | result_add_scalar_cpu(i,j) = target_cpu(i,j) + scalar; 42 | } 43 | } 44 | matrix source = copy_to_opencl(source_cpu); 45 | matrix source_cm = copy_to_opencl(source_cpu); 46 | matrix result_add = copy_to_opencl(result_add_cpu); 47 | matrix result_add_scalar = copy_to_opencl(result_add_scalar_cpu); 48 | { 49 | std::cout<<"testing direct assignment row-row"< target = copy_to_opencl(target_cpu); 51 | kernels::assign(target,source); 52 | checkMatrixEqual(target,source); 53 | } 54 | { 55 | std::cout<<"testing functor assignment row-row"< target = copy_to_opencl(target_cpu); 57 | kernels::assign(target,source, device_traits::add()); 58 | checkMatrixEqual(target,result_add); 59 | } 60 | { 61 | std::cout<<"testing direct assignment row-column"< target = copy_to_opencl(target_cpu); 63 | kernels::assign(target,source_cm); 64 | checkMatrixEqual(target,source_cm); 65 | } 66 | { 67 | std::cout<<"testing functor assignment row-column"< target = copy_to_opencl(target_cpu); 69 | kernels::assign(target,source_cm, device_traits::add()); 70 | checkMatrixEqual(target,result_add); 71 | } 72 | { 73 | std::cout<<"testing functor scalar assignment"< target = copy_to_opencl(target_cpu); 75 | kernels::assign::add >(target,scalar); 76 | target.queue().finish(); 77 | checkMatrixEqual(target,result_add_scalar); 78 | } 79 | 80 | } 81 | 82 | BOOST_AUTO_TEST_SUITE_END() 83 | -------------------------------------------------------------------------------- /Test/opencl_syrk.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_Syrk 2 | #include 3 | #include 4 | #include 5 | 6 | 7 | #define BOOST_COMPUTE_DEBUG_KERNEL_COMPILATION 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | #include 15 | using namespace remora; 16 | 17 | template 18 | void checkSyrk(M const& arg_opencl, Result const& result_opencl,double init, double alpha, bool upper){ 19 | BOOST_REQUIRE_EQUAL(arg_opencl.size1(), result_opencl.size1()); 20 | BOOST_REQUIRE_EQUAL(result_opencl.size1(), result_opencl.size2()); 21 | 22 | matrix arg = copy_to_cpu(arg_opencl); 23 | matrix result = copy_to_cpu(result_opencl); 24 | 25 | if(upper){ 26 | for(std::size_t i = 0; i != result.size1(); ++i) { 27 | for(std::size_t j = 0; j != result.size2(); ++j) { 28 | if(j < i){ 29 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 30 | }else{ 31 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 32 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 33 | } 34 | } 35 | } 36 | }else{ 37 | for(std::size_t i = 0; i != result.size1(); ++i) { 38 | for(std::size_t j = 0; j != result.size2(); ++j) { 39 | if(j > i){ 40 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-4); 41 | }else{ 42 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 43 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-4); 44 | } 45 | } 46 | } 47 | } 48 | } 49 | 50 | BOOST_AUTO_TEST_SUITE (Remora_Gpu_Syrk) 51 | 52 | typedef boost::mpl::list result_orientations; 53 | BOOST_AUTO_TEST_CASE_TEMPLATE(syrk_test, Orientation,result_orientations) { 54 | std::size_t dims = 936;//chosen as not to be a multiple of the block 
size 55 | std::size_t K = 1039; 56 | 57 | //rhs 58 | matrix arg_cpu(dims, K, 1.0); 59 | for(std::size_t i = 0; i != dims; ++i) { 60 | for(std::size_t j = 0; j != K; ++j) { 61 | arg_cpu(i, j) = (1.0/ dims) * i + 0.2/K * j + 1; 62 | } 63 | } 64 | 65 | matrix argrm = copy_to_opencl(arg_cpu); 66 | matrix argcm = copy_to_opencl(arg_cpu); 67 | 68 | std::cout << "\nchecking syrk V+=AA^T" << std::endl; 69 | { 70 | std::cout<<"row major A, lower V"< result(dims,dims,3.0); 72 | kernels::syrk(argrm,result, 2.0); 73 | checkSyrk(argrm,result, 3.0, 2.0,false); 74 | } 75 | { 76 | std::cout<<"row major A, upper V"< result(dims,dims,3.0); 78 | kernels::syrk(argrm,result, 2.0); 79 | checkSyrk(argrm,result, 3.0, 2.0,true); 80 | } 81 | { 82 | std::cout<<"column major A, lower V"< result(dims,dims,3.0); 84 | kernels::syrk(argcm,result, 2.0); 85 | checkSyrk(argrm,result, 3.0, 2.0,false); 86 | } 87 | { 88 | std::cout<<"column major A, upper V"< result(dims,dims,3.0); 90 | kernels::syrk(argcm,result, 2.0); 91 | checkSyrk(argrm,result, 3.0, 2.0,true); 92 | } 93 | } 94 | 95 | BOOST_AUTO_TEST_SUITE_END() 96 | -------------------------------------------------------------------------------- /Test/opencl_vector_assign.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_OPENCL_VectorAssign 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | using namespace remora; 11 | 12 | template 13 | void checkVectorEqual(V1 const& v1_opencl, V2 const& v2_opencl){ 14 | BOOST_REQUIRE_EQUAL(v1_opencl.size(),v2_opencl.size()); 15 | 16 | vector v1 = copy_to_cpu(v1_opencl); 17 | vector v2 = copy_to_cpu(v2_opencl); 18 | for(std::size_t i = 0; i != v2.size(); ++i){ 19 | BOOST_CHECK_EQUAL(v1(i),v2(i)); 20 | } 21 | } 22 | 23 | BOOST_AUTO_TEST_SUITE (Remora_opencl_vector_assign) 24 | 25 | BOOST_AUTO_TEST_CASE( Remora_Vector_Assign_Dense ){ 26 | std::cout<<"testing dense-dense assignment"< source_cpu(1000); 28 | vector target_cpu(1000); 29 | vector result_add_cpu(1000); 30 | vector result_add_scalar_cpu(1000); 31 | unsigned int scalar = 10; 32 | for(std::size_t i = 0; i != 1000; ++i){ 33 | source_cpu(i) = 2*i+1; 34 | target_cpu(i) = 3*i+2; 35 | result_add_cpu(i) = source_cpu(i) + target_cpu(i); 36 | result_add_scalar_cpu(i) = target_cpu(i) + scalar; 37 | } 38 | vector source = copy_to_opencl(source_cpu); 39 | vector result_add = copy_to_opencl(result_add_cpu); 40 | vector result_add_scalar = copy_to_opencl(result_add_scalar_cpu); 41 | { 42 | std::cout<<"testing direct assignment"< target = copy_to_opencl(target_cpu); 44 | kernels::assign(target,source); 45 | checkVectorEqual(target,source); 46 | } 47 | { 48 | std::cout<<"testing functor assignment"< target = copy_to_opencl(target_cpu); 50 | kernels::assign(target,source, device_traits::add()); 51 | checkVectorEqual(target,result_add); 52 | } 53 | { 54 | std::cout<<"testing functor scalar assignment"< target = copy_to_opencl(target_cpu); 56 | kernels::assign::add >(target,scalar); 57 | checkVectorEqual(target,result_add_scalar); 58 | } 59 | 60 | } 61 | 62 | BOOST_AUTO_TEST_SUITE_END() 63 | -------------------------------------------------------------------------------- /Test/syrk.cpp: -------------------------------------------------------------------------------- 1 | #define BOOST_TEST_MODULE Remora_Syrk 2 | #include 3 | #include 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | using namespace remora; 12 | 13 | template 14 | void checkSyrk(M1 
const& arg, Result const& result,double init, double alpha, bool upper){ 15 | BOOST_REQUIRE_EQUAL(arg.size1(), result.size1()); 16 | BOOST_REQUIRE_EQUAL(result.size1(), result.size2()); 17 | 18 | if(upper){ 19 | for(std::size_t i = 0; i != result.size1(); ++i) { 20 | for(std::size_t j = 0; j != result.size2(); ++j) { 21 | if(j < i){ 22 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-10); 23 | }else{ 24 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 25 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-10); 26 | } 27 | } 28 | } 29 | }else{ 30 | for(std::size_t i = 0; i != result.size1(); ++i) { 31 | for(std::size_t j = 0; j != result.size2(); ++j) { 32 | if(j > i){ 33 | BOOST_CHECK_CLOSE(result(i,j),init, 1.e-10); 34 | }else{ 35 | double test_result = alpha*inner_prod(row(arg,i),row(arg,j))+init; 36 | BOOST_CHECK_CLOSE(result(i,j), test_result, 1.e-10); 37 | } 38 | } 39 | } 40 | } 41 | } 42 | 43 | BOOST_AUTO_TEST_SUITE(Remora_SYRK) 44 | 45 | 46 | 47 | typedef boost::mpl::list result_orientations; 48 | BOOST_AUTO_TEST_CASE_TEMPLATE(syrk_test, Orientation,result_orientations) { 49 | std::size_t dims = 384;//chosen as not to be a multiple of the block size 50 | std::size_t K = 244; 51 | 52 | //rhs 53 | matrix argrm(dims, K, 1.0); 54 | matrix argcm(dims, K, 1.0); 55 | for(std::size_t i = 0; i != dims; ++i) { 56 | for(std::size_t j = 0; j != K; ++j) { 57 | argrm(i, j) = argcm(i, j) = (1.0/ dims) * i + 0.2/K * j + 1; 58 | } 59 | } 60 | 61 | std::cout << "\nchecking syrk V+=AA^T" << std::endl; 62 | { 63 | std::cout<<"row major A, lower V"< result(dims, dims, 3.0); 65 | kernels::syrk(argrm,result, 2.0); 66 | checkSyrk(argrm,result, 3.0, 2.0,false); 67 | } 68 | { 69 | std::cout<<"row major A, upper V"< result(dims, dims, 3.0); 71 | kernels::syrk(argrm,result, 2.0); 72 | checkSyrk(argrm,result, 3.0, 2.0,true); 73 | } 74 | { 75 | std::cout<<"column major A, lower V"< result(dims, dims, 3.0); 77 | kernels::syrk(argcm,result, 2.0); 78 | checkSyrk(argrm,result, 3.0, 2.0,false); 79 | } 80 | { 81 | std::cout<<"column major A, upper V"< result(dims, dims, 3.0); 83 | kernels::syrk(argcm,result, 2.0); 84 | checkSyrk(argrm,result, 3.0, 2.0,true); 85 | } 86 | 87 | } 88 | 89 | BOOST_AUTO_TEST_SUITE_END() 90 | -------------------------------------------------------------------------------- /doc/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) 2 | 3 | find_package( Doxygen REQUIRED ) 4 | find_package( PythonInterp REQUIRED ) 5 | 6 | ADD_SUBDIRECTORY(tutToRst) 7 | 8 | CONFIGURE_FILE ( 9 | "${CMAKE_CURRENT_SOURCE_DIR}/sphinx_pages/conf.py.in" 10 | "${CMAKE_CURRENT_SOURCE_DIR}/sphinx_pages/conf.py" 11 | ) 12 | 13 | set( SPHINX_EXECUTABLE sphinx-build ) 14 | set( SPHINX_PARAMETERS -b html ) 15 | 16 | add_custom_target(doc_creation) 17 | 18 | #find all .tut files 19 | file(GLOB_RECURSE TutFiles sphinx_pages *.tut) 20 | message(STATUS ${TutFiles}) 21 | foreach(tut ${TutFiles}) 22 | GET_FILENAME_COMPONENT(tutPath ${tut} PATH) 23 | GET_FILENAME_COMPONENT(tutName ${tut} NAME_WE) 24 | add_custom_command(TARGET doc_creation POST_BUILD COMMAND 25 | tut2rst ${tutPath}/${tutName} ${PROJECT_SOURCE_DIR}/examples 26 | ) 27 | endforeach() 28 | add_dependencies(doc_creation tut2rst) 29 | 30 | add_custom_target( doc ALL 31 | COMMAND ${SPHINX_EXECUTABLE} ${SPHINX_PARAMETERS} ${CMAKE_CURRENT_SOURCE_DIR}/sphinx_pages ${CMAKE_CURRENT_BINARY_DIR}/sphinx_pages/build/html 32 | ) 33 | 34 | 35 | add_dependencies(doc doc_creation) 
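# Note on the pipeline above: every .tut file found below sphinx_pages is first
# converted to .rst by the tut2rst tool (built in doc/tutToRst), which expands
# the code snippets referenced from the examples/ tree; the `doc` target then
# runs `sphinx-build -b html` on sphinx_pages and places the generated HTML in
# ${CMAKE_CURRENT_BINARY_DIR}/sphinx_pages/build/html.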
-------------------------------------------------------------------------------- /doc/tutToRst/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED( VERSION 2.8 ) 2 | 3 | ADD_EXECUTABLE( tut2rst 4 | tut2rst.cpp 5 | ) -------------------------------------------------------------------------------- /examples/Benchmarks/Timer.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Timer abstraction with microsecond resolution 5 | * 6 | * 7 | * 8 | * \author T. Voss, M. Tuma 9 | * \date 2010 10 | * 11 | * 12 | * \par Copyright 1995-2015 Shark Development Team 13 | * 14 | *

15 | * This file is part of Shark. 16 | * 17 | * 18 | * Shark is free software: you can redistribute it and/or modify 19 | * it under the terms of the GNU Lesser General Public License as published 20 | * by the Free Software Foundation, either version 3 of the License, or 21 | * (at your option) any later version. 22 | * 23 | * Shark is distributed in the hope that it will be useful, 24 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 25 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 26 | * GNU Lesser General Public License for more details. 27 | * 28 | * You should have received a copy of the GNU Lesser General Public License 29 | * along with Shark. If not, see . 30 | * 31 | */ 32 | 33 | #ifndef TIMER_HPP 34 | #define TIMER_HPP 35 | 36 | 37 | #ifdef _WIN32 38 | #define WIN32_LEAN_AND_MEAN 39 | #include 40 | #include 41 | #else 42 | #include 43 | #include 44 | #endif 45 | 46 | /// \brief Timer abstraction with microsecond resolution 47 | /// 48 | /// \par 49 | /// Use start() to start the timer and stop() to retrive the 50 | /// elapsed time in seconds (guaranteed/forced to be >= 0 ). 51 | /// Use now() to get the current time (may in rare cases give decreasing values). 52 | class Timer 53 | { 54 | public: 55 | Timer(bool measureWallclockTime = true) 56 | : m_lastLap( 0.0 ) 57 | , m_startTime( 0.0 ) 58 | , m_measureWallclockTime(measureWallclockTime) 59 | { start();} 60 | 61 | /// \brief Returns the current time in a microsecond resolution. Att: may in rare cases give decreasing values. 62 | static double now(bool measureWallclockTime = true) { 63 | #ifdef _WIN32 64 | if(measureWallclockTime){ 65 | return static_cast(std::clock()) / CLOCKS_PER_SEC; 66 | } 67 | else{ 68 | LARGE_INTEGER tick, tps; 69 | QueryPerformanceFrequency(&tps); 70 | QueryPerformanceCounter(&tick); 71 | return( static_cast( tick.QuadPart ) / static_cast( tps.QuadPart ) ); 72 | } 73 | #else 74 | if(measureWallclockTime){ 75 | timeval time; 76 | if (gettimeofday(&time,0)){ 77 | // Handle error 78 | return 0; 79 | } 80 | return time.tv_sec +1e-6 *time.tv_usec; 81 | } 82 | else 83 | { 84 | rusage res; 85 | getrusage(RUSAGE_SELF, &res); 86 | return(res.ru_utime.tv_sec + res.ru_stime.tv_sec) 87 | + 1e-6 * (res.ru_utime.tv_usec + res.ru_stime.tv_usec); 88 | } 89 | #endif 90 | } 91 | 92 | /// \brief Stores the current time in m_startTime. 93 | void start() { 94 | m_startTime = now(m_measureWallclockTime); 95 | } 96 | 97 | /// \brief Returns the difference between current time and the start time. 98 | /// 99 | /// The time is meeasured since the last time start() was called. Thus several consecutive 100 | /// calls to stop() will return ascending numbers. start() is called automatically at construction time. 101 | double stop() { 102 | double stop = now(m_measureWallclockTime); 103 | m_lastLap = stop - m_startTime; 104 | 105 | // avoid rare cases of non-increasing timer values (cf. eg. http://www.linuxmisc.com/8-freebsd/d4c6ddc8fbfbd523.htm) 106 | 107 | if ( m_lastLap < 0.0 ) { 108 | m_lastLap = 0.0; 109 | } 110 | 111 | return m_lastLap; 112 | } 113 | 114 | /// \brief Returns the last value of stop(). 
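///
/// Note: lastLap() performs no measurement itself; it only returns the
/// duration captured by the most recent call to stop().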
115 | double lastLap() { 116 | return m_lastLap; 117 | } 118 | 119 | private: 120 | double m_lastLap; 121 | double m_startTime; 122 | bool m_measureWallclockTime; 123 | }; 124 | #endif 125 | -------------------------------------------------------------------------------- /examples/Benchmarks/conv2d.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "Timer.hpp" 4 | #include 5 | using namespace remora; 6 | 7 | template 8 | void benchmark( 9 | matrix_expression const& image, 10 | vector_expression const& filter, 11 | std::size_t num_channels, 12 | std::size_t num_filters, 13 | std::size_t image_size1, 14 | std::size_t image_size2, 15 | std::size_t filter_size 16 | ){ 17 | std::size_t output_size1 = image_size1 - filter_size +1; 18 | std::size_t output_size2 = image_size2 - filter_size +1; 19 | typedef typename E1::value_type value_type; 20 | 21 | remora::matrix out(image().size1(), output_size1 * num_filters * output_size2, 0.0); 22 | double minOptTime = std::numeric_limits::max(); 23 | for(std::size_t i = 0; i != std::max(1, std::size_t(20/image().size1())); ++i){ 24 | Timer time; 25 | kernels::conv2d(image,filter,out, num_channels, num_filters, image_size1, image_size2, filter_size, filter_size,0,0); 26 | minOptTime = std::min(minOptTime,time.stop()); 27 | } 28 | 29 | double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels; 30 | double flops = image().size1() * mults /1024/1024/minOptTime; 31 | 32 | std::cout< 37 | void benchmark(std::size_t num_channels, std::size_t num_outputs, std::size_t num_images){ 38 | std::cout<<"Flops"< image(num_images, num_channels * sizeIm1 * sizeIm2); 47 | remora::vector filter(num_channels * num_outputs * filterSize * filterSize); 48 | 49 | for(std::size_t im = 0; im != num_images; ++im){ 50 | for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){ 51 | for(std::size_t j = 0; j != sizeIm2; ++j){ 52 | image(im, i * sizeIm2 + j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j; 53 | } 54 | } 55 | } 56 | for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){ 57 | for(std::size_t j = 0; j != filterSize; ++j){ 58 | filter(i * filterSize + j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j; 59 | } 60 | } 61 | 62 | benchmark(image,filter,num_channels,num_outputs, sizeIm1, sizeIm2, filterSize); 63 | } 64 | } 65 | } 66 | 67 | 68 | int main(int argc, char **argv) { 69 | std::cout<<"performance float"<(8,32,16); 71 | std::cout<<"performance double"<(8,32,16); 73 | } 74 | -------------------------------------------------------------------------------- /examples/Benchmarks/gemm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | matrix_expression const& B, 10 | matrix_expression & C 11 | ){ 12 | double minTime = std::numeric_limits::max(); 13 | for(std::size_t i = 0; i != 10; ++i){ 14 | Timer time; 15 | noalias(C) += prod(A,B); 16 | minTime = std::min(minTime,time.stop()); 17 | } 18 | return (A().size1()*A().size2()*B().size2())/minTime/1024/1024; 19 | } 20 | 21 | int main(int argc, char **argv) { 22 | std::size_t size = 100; 23 | std::cout<<"Flops"< Arow(size,middle); 27 | for(std::size_t i = 0; i != size; ++i){ 28 | for(std::size_t k = 0; k != middle; ++k){ 29 | Arow(i,k) = 0.1/size*i+0.1/size*k; 30 | } 31 | } 32 | 33 | 
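// Note: benchmark() above reports the best of ten runs of C += prod(A,B) as
// A.size1()*A.size2()*B.size2() / time / 1024^2, i.e. multiply-accumulate
// operations per second in units of 2^20, for each combination of row- and
// column-major operand layouts constructed below.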
matrix Brow(middle,size); 34 | for(std::size_t k = 0; k != middle; ++k){ 35 | for(std::size_t j = 0; j != size; ++j){ 36 | Brow(k,j) = 0.1/size*j+0.1/size*k; 37 | } 38 | } 39 | matrix Acol = Arow; 40 | matrix Bcol = Brow; 41 | 42 | matrix Crow(size,size,0.0); 43 | matrix Ccol(size,size,0.0); 44 | std::cout< 2 | #include 3 | #include 4 | #include "Timer.hpp" 5 | #include 6 | using namespace remora; 7 | 8 | template 9 | void benchmark( 10 | matrix_expression const& image, 11 | vector_expression const& filter, 12 | std::size_t num_channels, 13 | std::size_t num_filters, 14 | std::size_t image_size1, 15 | std::size_t image_size2, 16 | std::size_t filter_size 17 | ){ 18 | std::size_t output_size1 = image_size1 - filter_size +1; 19 | std::size_t output_size2 = image_size2 - filter_size +1; 20 | typedef typename E1::value_type value_type; 21 | 22 | remora::matrix image_opencl = copy_to_opencl(image); 23 | remora::vector filter_opencl = copy_to_opencl(filter); 24 | remora::matrix out_opencl(image().size1(), output_size1 * num_filters * output_size2, 0.0); 25 | kernels::conv2d(image_opencl,filter_opencl,out_opencl, num_channels, num_filters, image_size1, image_size2, filter_size, filter_size,0,0); 26 | out_opencl.queue().finish(); 27 | double minOptTime = std::numeric_limits::max(); 28 | for(std::size_t i = 0; i != 10; ++i){ 29 | Timer time; 30 | kernels::conv2d(image_opencl,filter_opencl,out_opencl, num_channels, num_filters, image_size1, image_size2, filter_size, filter_size,0,0); 31 | out_opencl.queue().finish(); 32 | minOptTime = std::min(minOptTime,time.stop()); 33 | } 34 | 35 | double mults = output_size1 * output_size2 * filter_size * filter_size * num_filters * num_channels; 36 | double flops = image().size1() * mults /1024/1024/minOptTime; 37 | 38 | double storage = double(out_opencl.size1() * out_opencl.size2())/1024/1024; 39 | std::cout< 44 | void benchmark(std::size_t num_channels, std::size_t num_outputs, std::size_t num_images){ 45 | std::cout<<"im_size\tfiltpx\tincChan\tOutChan\tmemOut\tFlops"< image(num_images, num_channels * sizeIm1 * sizeIm2); 54 | remora::vector filter(num_channels * num_outputs * filterSize * filterSize); 55 | 56 | for(std::size_t im = 0; im != num_images; ++im){ 57 | for(std::size_t i = 0; i != num_channels * sizeIm1; ++i){ 58 | for(std::size_t j = 0; j != sizeIm2; ++j){ 59 | image(im, i * sizeIm2 + j) = 1.0/(num_channels * sizeOut1)*i + 0.1 - (0.1/sizeOut2)*j; 60 | } 61 | } 62 | } 63 | for(std::size_t i = 0; i != num_channels * num_outputs * filterSize; ++i){ 64 | for(std::size_t j = 0; j != filterSize; ++j){ 65 | filter(i * filterSize + j) = 1.0/(num_channels * filterSize)*i + 0.1 - (0.1/filterSize)*j; 66 | } 67 | } 68 | 69 | benchmark(image,filter,num_channels,num_outputs, sizeIm1, sizeIm2, filterSize); 70 | } 71 | } 72 | } 73 | 74 | 75 | int main(int argc, char **argv) { 76 | std::cout<<"performance float"<(3,16,4); 78 | } 79 | -------------------------------------------------------------------------------- /examples/Benchmarks/potrf.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | Triang 10 | ){ 11 | double minTime = std::numeric_limits::max(); 12 | volatile double res = 0; 13 | for(std::size_t i = 0; i != 20; ++i){ 14 | typename matrix_temporary::type Acopy = A; 15 | Timer time; 16 | kernels::potrf(Acopy); 17 | minTime = std::min(minTime,time.stop()); 18 | res += 
max(Acopy); 19 | } 20 | return (1.0/3.0*A().size1()*A().size1()*A().size1())/minTime/1024/1024; 21 | } 22 | 23 | int main(int argc, char **argv) { 24 | std::size_t size = 128; 25 | std::cout<<"Mega Flops"< Arow(size,size); 28 | for(std::size_t i = 0; i != size; ++i){ 29 | for(std::size_t j = 0; j != size; ++j){ 30 | Arow(i,j) = 0.1/size*i+0.1/size*j; 31 | } 32 | Arow(i,i) += 1000.0; 33 | } 34 | matrix Acol = Arow; 35 | std::cout< 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | matrix_expression & C 10 | ){ 11 | double minTime = std::numeric_limits::max(); 12 | for(std::size_t i = 0; i != 10; ++i){ 13 | Timer time; 14 | kernels::syrk(A,C,2.0); 15 | minTime = std::min(minTime,time.stop()); 16 | } 17 | return (0.5*A().size1()*A().size2()*A().size1())/minTime/1024/1024; 18 | } 19 | 20 | int main(int argc, char **argv) { 21 | std::size_t size = 100; 22 | std::cout<<"Mega Flops"< Arow(size,size); 25 | for(std::size_t i = 0; i != size; ++i){ 26 | for(std::size_t k = 0; k != size; ++k){ 27 | Arow(i,k) = 0.1/size*i+0.1/size*k; 28 | } 29 | } 30 | matrix Acol = Arow; 31 | 32 | matrix Crow(size,size,0.0); 33 | matrix Ccol(size,size,0.0); 34 | std::cout<(Arow,Crow)<<"\t"<< benchmark(Acol,Crow)<(Arow,Crow)<<"\t"<< benchmark(Acol,Crow)<(Arow,Ccol)<<"\t"<< benchmark(Acol,Ccol)<(Arow,Ccol)<<"\t"<< benchmark(Acol,Ccol)< 2 | #include "Timer.hpp" 3 | #include 4 | using namespace remora; 5 | 6 | template 7 | double benchmark( 8 | matrix_expression const& A, 9 | matrix_expression const& B, 10 | matrix_expression & C 11 | ){ 12 | double minTime = std::numeric_limits::max(); 13 | for(std::size_t i = 0; i != 10; ++i){ 14 | Timer time; 15 | noalias(C) += triangular_prod(A,B); 16 | minTime = std::min(minTime,time.stop()); 17 | } 18 | return (0.5*A().size1()*A().size2()*B().size2())/minTime/1024/1024; 19 | } 20 | 21 | int main(int argc, char **argv) { 22 | std::size_t size = 100; 23 | std::cout<<"Mega Flops"< Arow(size,size); 26 | for(std::size_t i = 0; i != size; ++i){ 27 | for(std::size_t k = 0; k != size; ++k){ 28 | Arow(i,k) = 0.1/size*i+0.1/size*k; 29 | } 30 | } 31 | 32 | matrix Brow(size,size); 33 | for(std::size_t k = 0; k != size; ++k){ 34 | for(std::size_t j = 0; j != size; ++j){ 35 | Brow(k,j) = 0.1/size*j+0.1/size*k; 36 | } 37 | } 38 | matrix Acol = Arow; 39 | matrix Bcol = Brow; 40 | 41 | matrix Crow(size,size,0.0); 42 | matrix Ccol(size,size,0.0); 43 | std::cout<(Arow,Brow,Crow)<<"\t"<< benchmark(Acol,Brow,Crow) 44 | <<"\t"<< benchmark(Arow,Bcol,Crow) <<"\t" <(Acol,Bcol,Crow) <(Arow,Brow,Crow)<<"\t"<< benchmark(Acol,Brow,Crow) 46 | <<"\t"<< benchmark(Arow,Bcol,Crow) <<"\t" <(Acol,Bcol,Crow) <(Arow,Brow,Ccol)<<"\t"<< benchmark(Acol,Brow,Ccol) 48 | <<"\t"<< benchmark(Arow,Bcol,Ccol) <<"\t" <(Acol,Bcol,Ccol) <(Arow,Brow,Ccol)<<"\t"<< benchmark(Acol,Brow,Ccol) 50 | <<"\t"<< benchmark(Arow,Bcol,Ccol) <<"\t" <(Acol,Bcol,Ccol) < 2 | #include 3 | 4 | //###begin 5 | #include 6 | using namespace remora; 7 | //###end 8 | 9 | 10 | int main(){ 11 | //Step 0: Theory 12 | // The goal of linear regression is to find a linear function f(x) = w^Tx + b 13 | // that fits best a given set of point-label pairs (x1,y1),(x2,y2),...,(xN,yN). 
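	// Here each x_i is a vector in R^d and y_i is a real-valued label. Below, X
	// stacks the points x_i as rows, and (X|1) denotes X with a column of ones
	// appended; the extra column absorbs the offset b into the weight vector w.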
14 | // This is measured by the squared error: 15 | // E(w) = 1/(2N) sum_i (f(x_i)-y_i)^2 16 | // It turns out that the optimal solution can be written in simple matrix algebra, 17 | // when X is the data matrix where points are stored row-wise and y is the vector 18 | // of labels: 19 | // w=((X|1)^T (X|1))^{-1} (X|1)^Ty 20 | 21 | //Step 1: Generate some random data 22 | //###begin 23 | std::size_t num_data_points = 100; 24 | std::size_t num_dims = 50; 25 | matrix X(num_data_points, num_dims); 26 | vector y(num_data_points); 27 | //###end 28 | //###begin 29 | std::random_device rd; 30 | std::mt19937 gen(rd()); 31 | std::normal_distribution<> normal(0,2); 32 | for(std::size_t i = 0; i != num_data_points; ++i){ 33 | for(std::size_t j = 0; j != num_dims; ++j){ 34 | X(i,j) = normal(gen); //set element (i,j) of X to a rnadomly generated number 35 | } 36 | } 37 | //###end 38 | //###begin 39 | std::normal_distribution<> normal_noise(0,0.1); 40 | for(std::size_t i = 0; i != num_data_points; ++i){ 41 | //label is chosen to be just the sum of entries plus some noise 42 | y(i) = sum(row(X,i)) + normal_noise(gen) + 1; 43 | } 44 | //###end 45 | // Step 2: compute the linear regression 46 | // formula is w=((X|1)^T (X|1))^{-1} (X|1)^Ty 47 | // we need to tell the algebra how to solve the system of equations, 48 | // in this case we tell it that the matrix is symmetric positive definite. 49 | // but we have to be aware that our matrix is not always full rank, 50 | // e..g when we have more variables than data or when some variable 51 | // is constant 0. 52 | //###begin 53 | vector w = inv(trans(X|1) % (X|1), symm_semi_pos_def()) % trans(X|1) % y; 54 | //###end 55 | // Step 3: evaluate solution 56 | // we compute: E(w) = 1/(2N) sum_i (f(x_i)-y_i)^2 57 | //###begin 58 | double error = 0.5 * sum(sqr((X|1) % w - y)) / num_data_points; 59 | //###end 60 | // Step 4: For ensuring correctness, we will check that 61 | // the derivative of E(w) at the solution is small (on the order of 1.e-14) 62 | //###begin 63 | vector derE = trans(X|1) % ((X|1) % w - y) / num_data_points; 64 | double error_derivative = norm_inf(derE); 65 | //###end 66 | std::cout<<"final error of fit: "<< error< 5 | 6 | #ifndef NDEBUG 7 | #define REMORA_RANGE_CHECK(cond) assert(cond) 8 | #define REMORA_SIZE_CHECK(cond) assert(cond) 9 | #else 10 | #define REMORA_RANGE_CHECK(cond) do { (void)sizeof(cond); } while (false) 11 | #define REMORA_SIZE_CHECK(cond) do { (void)sizeof(cond); } while (false) 12 | #endif 13 | 14 | #endif -------------------------------------------------------------------------------- /include/remora/detail/evaluation_tags.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief Tags representing different type of expression evaluation categories 6 | * 7 | * \author O. Krause 8 | * \date 2016 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see . 29 | * 30 | */ 31 | //=========================================================================== 32 | 33 | #ifndef REMORA_DETAIL_EVALUATION_TAGS_HPP 34 | #define REMORA_DETAIL_EVALUATION_TAGS_HPP 35 | 36 | namespace remora{ 37 | 38 | // Evaluation type tags: 39 | // dense_tag -> dense storage scheme an dense interface supported 40 | // continuous_dense_tag -> dense storage scheme where stride between all elements is 1 41 | // sparse_tag -> sparse storage scheme and supports sparse interface. 42 | // packed_tag ->BLAS packed format and supports packed interface 43 | // unknown_tag -> no known storage scheme, only supports basic interface(probably blockwise evaluation) 44 | struct unknown_tag{}; 45 | struct sparse_tag:public unknown_tag{}; 46 | struct dense_tag: public unknown_tag{}; 47 | struct continuous_dense_tag: public dense_tag{}; 48 | struct packed_tag: public unknown_tag{}; 49 | 50 | struct elementwise_tag{}; 51 | struct blockwise_tag{}; 52 | 53 | //evaluation categories 54 | template 55 | struct elementwise: public elementwise_tag{ 56 | typedef Tag tag; 57 | }; 58 | template 59 | struct blockwise: public blockwise_tag{ 60 | typedef Tag tag; 61 | }; 62 | 63 | 64 | template 65 | struct evaluation_tag_restrict_traits{ 66 | typedef Tag1 type; 67 | }; 68 | 69 | template 70 | struct evaluation_tag_restrict_traits { 71 | typedef dense_tag type; 72 | }; 73 | 74 | template<> 75 | struct evaluation_tag_restrict_traits { 76 | typedef sparse_tag type; 77 | }; 78 | 79 | namespace detail{ 80 | template 81 | struct evaluation_restrict_traits{ 82 | typedef blockwise::type> type; 85 | }; 86 | template 87 | struct evaluation_restrict_traits, elementwise >{ 88 | typedef elementwise::type> type; 89 | }; 90 | } 91 | template 92 | struct evaluation_restrict_traits: public detail::evaluation_restrict_traits< 93 | typename E1::evaluation_category, 94 | typename E2::evaluation_category 95 | >{}; 96 | 97 | } 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /include/remora/detail/proxy_optimizers_fwd.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Proxy Optimizations 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see . 26 | * 27 | */ 28 | #ifndef REMORA_DETAIL_PROXY_OPTIMIZERS_FWD_HPP 29 | #define REMORA_DETAIL_PROXY_OPTIMIZERS_FWD_HPP 30 | 31 | namespace remora{namespace detail{ 32 | 33 | //forward declarations 34 | template 35 | struct vector_range_optimizer; 36 | 37 | template 38 | struct matrix_transpose_optimizer; 39 | template 40 | struct matrix_row_optimizer; 41 | template 42 | struct matrix_range_optimizer; 43 | 44 | template 45 | struct matrix_rows_optimizer; 46 | 47 | template 48 | struct linearized_matrix_optimizer; 49 | 50 | template 51 | struct vector_to_matrix_optimizer; 52 | 53 | template 54 | struct matrix_diagonal_optimizer; 55 | 56 | template 57 | struct triangular_proxy_optimizer; 58 | 59 | template 60 | struct vector_scalar_multiply_optimizer; 61 | 62 | template 63 | struct matrix_scalar_multiply_optimizer; 64 | 65 | }} 66 | #endif 67 | -------------------------------------------------------------------------------- /include/remora/detail/vector_set.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Classes used for matrix expressions. 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_VECTOR_PROXY_SET_CLASSES_HPP 29 | #define REMORA_VECTOR_PROXY_SET_CLASSES_HPP 30 | 31 | #include "traits.hpp" 32 | #include "../expression_types.hpp" 33 | 34 | namespace remora{ 35 | 36 | template<class O, class E> 37 | class vector_set:public vector_set_expression<vector_set<O, E>, typename E::device_type >{ 38 | public: 39 | typedef typename closure<E>::type expression_closure_type; 40 | typedef typename E::size_type size_type; 41 | typedef typename E::value_type value_type; 42 | typedef typename E::const_reference const_reference; 43 | typedef typename reference<E>::type reference; 44 | 45 | typedef vector_set const_closure_type; 46 | typedef vector_set closure_type; 47 | typedef O point_orientation; 48 | typedef typename E::evaluation_category evaluation_category; 49 | typedef typename E::device_type device_type; 50 | 51 | // Construction 52 | explicit vector_set(expression_closure_type const& e):m_expression(e){} 53 | 54 | // Accessors 55 | size_type size() const{ 56 | return point_orientation::index_M(m_expression.size1(), m_expression.size2()); 57 | } 58 | size_type point_size() const{ 59 | return point_orientation::index_m(m_expression.size1(), m_expression.size2()); 60 | } 61 | 62 | expression_closure_type const& expression() const{ 63 | return m_expression; 64 | } 65 | typename device_traits<device_type>::queue_type& queue()const{ 66 | return m_expression.queue(); 67 | } 68 | 69 | // Computation Kernels 70 | template<class MatX> 71 | void assign_to(matrix_expression<MatX, device_type>& X, typename MatX::value_type alpha)const{ 72 | assign(X, m_expression, alpha); 73 | } 74 | template<class MatX> 75 | void plus_assign_to(matrix_expression<MatX, device_type>& X, typename MatX::value_type alpha)const{ 76 | plus_assign(X, m_expression, alpha); 77 | } 78 | private: 79 | expression_closure_type m_expression; 80 | }; 81 | 82 | } 83 | #endif 84 | -------------------------------------------------------------------------------- /include/remora/device_copy.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief expression templates for copying from cpu to device and back 3 | * 4 | * \author O. Krause 5 | * \date 2013 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *
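To make the index_M/index_m accessors above concrete: for O = row_major, a vector_set wrapping an n x m matrix expression is a set of n points of dimension m (its rows); for O = column_major it is m points of dimension n. The helper below is not part of the library, just the same selection logic spelled out:

#include <cstddef>

// index_M picks the "major" extent (number of points), index_m the "minor" one
// (dimension of each point), depending on the point orientation O.
std::size_t size_of_set(bool row_points, std::size_t size1, std::size_t size2){
	return row_points ? size1 : size2; // rows as points -> size1 points
}
std::size_t size_of_point(bool row_points, std::size_t size1, std::size_t size2){
	return row_points ? size2 : size1; // each row-point has size2 entries
}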

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_DEVICE_COPY_HPP 29 | #define REMORA_DEVICE_COPY_HPP 30 | 31 | #include "expression_types.hpp" 32 | 33 | namespace remora{ 34 | 35 | template<class E> 36 | E const& copy_to_cpu(vector_expression<E, cpu_tag> const& e){ 37 | return e(); 38 | } 39 | 40 | 41 | template<class E> 42 | E const& copy_to_cpu(matrix_expression<E, cpu_tag> const& e){ 43 | return e(); 44 | } 45 | 46 | template<class E> 47 | E const& copy_to_device(vector_expression<E, cpu_tag> const& e, cpu_tag){ 48 | return e(); 49 | } 50 | 51 | 52 | template<class E> 53 | E const& copy_to_device(matrix_expression<E, cpu_tag> const& e, cpu_tag){ 54 | return e(); 55 | } 56 | 57 | } 58 | 59 | #ifdef REMORA_USE_OPENCL 60 | #include "opencl/copy.hpp" 61 | #endif 62 | 63 | #if defined(__HCC__) || defined(__NVCC__) 64 | #include "hip/copy.hpp" 65 | #endif 66 | 67 | #endif 68 | -------------------------------------------------------------------------------- /include/remora/hip/exception.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief Error handling for the HIP runtime 6 | * 7 | * \author O. Krause 8 | * \date 2018 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
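The overloads above are the no-op halves of the device-copy interface: on CPU targets both directions are identity functions, so generic code can call them unconditionally. A usage sketch, assuming remora's dense CPU vector type:

#include <remora/remora.hpp> // assumed umbrella header for this sketch

void copy_sketch(){
	remora::vector<double> v(10, 1.0);
	// on cpu targets both calls are the identity and perform no copy
	auto const& on_cpu = remora::copy_to_cpu(v);
	auto const& on_dev = remora::copy_to_device(v, remora::cpu_tag());
	(void)on_cpu; (void)on_dev;
}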

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | 33 | #ifndef REMORA_HIP_EXCEPTION_HPP 34 | #define REMORA_HIP_EXCEPTION_HPP 35 | 36 | #include <hip/hip_runtime.h> 37 | #include <system_error> 38 | #include <string> 39 | namespace remora{namespace hip{ 40 | 41 | class hip_error_category: public std::error_category{ 42 | public: 43 | const char* name() const noexcept{ 44 | return "HIP"; 45 | } 46 | std::string message( int error ) const{ 47 | return hipGetErrorString(static_cast<hipError_t>(error)); 48 | } 49 | static hip_error_category& category(){ 50 | static hip_error_category cat; 51 | return cat; 52 | } 53 | }; 54 | 55 | class hip_exception:public std::system_error{ 56 | public: 57 | hip_exception(hipError_t code): std::system_error(code, hip_error_category::category()){} 58 | }; 59 | 60 | inline void check_hip(hipError_t code){ 61 | if(code != hipSuccess) 62 | throw hip_exception(code); 63 | } 64 | 65 | }} 66 | #endif -------------------------------------------------------------------------------- /include/remora/kernels/atlas/potrf.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2011 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
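A sketch of the error path defined above: a failing runtime call wrapped in check_hip surfaces as a std::system_error whose category name is "HIP". hipErrorUnknown is used purely as an example error code here.

#include <iostream>

void error_sketch(){
	try{
		remora::hip::check_hip(hipErrorUnknown); // any code != hipSuccess throws
	}catch(remora::hip::hip_exception const& e){
		// prints the category name and the hipGetErrorString message
		std::cerr << e.code().category().name() << ": " << e.what() << '\n';
	}
}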

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | 33 | #ifndef REMORA_KERNELS_ATLAS_POTRF_H 34 | #define REMORA_KERNELS_ATLAS_POTRF_H 35 | 36 | #include "../cblas/cblas_inc.hpp" 37 | extern "C"{ 38 | #include <clapack.h> 39 | } 40 | 41 | namespace remora { 42 | namespace bindings { 43 | 44 | inline int potrf( 45 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 46 | int const N, float *A, int const lda 47 | ) { 48 | return clapack_spotrf(Order, Uplo, N, A, lda); 49 | } 50 | 51 | inline int potrf( 52 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 53 | int const N, double *A, int const lda 54 | ) { 55 | return clapack_dpotrf(Order, Uplo, N, A, lda); 56 | } 57 | 58 | inline int potrf( 59 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 60 | int const N, std::complex<float>* A, int const lda 61 | ) { 62 | return clapack_cpotrf(Order, Uplo, N, static_cast<void*>(A), lda); 63 | } 64 | 65 | inline int potrf( 66 | CBLAS_ORDER const Order, CBLAS_UPLO const Uplo, 67 | int const N, std::complex<double>* A, int const lda 68 | ) { 69 | return clapack_zpotrf(Order, Uplo, N, static_cast<void*>(A), lda); 70 | } 71 | 72 | template<class Triangular, class MatA> 73 | inline int potrf( 74 | matrix_container<MatA, cpu_tag>& A, 75 | std::true_type 76 | ) { 77 | CBLAS_UPLO const uplo = Triangular::is_upper ? CblasUpper : CblasLower; 78 | CBLAS_ORDER const stor_ord = 79 | (CBLAS_ORDER)storage_order<typename MatA::orientation>::value; 80 | 81 | std::size_t n = A().size1(); 82 | REMORA_SIZE_CHECK(n == A().size2()); 83 | 84 | auto storageA = A().raw_storage(); 85 | return potrf( 86 | stor_ord, uplo, (int)n, 87 | storageA.values, 88 | storageA.leading_dimension 89 | ); 90 | } 91 | 92 | template<class Tag, class T> 93 | struct optimized_potrf_detail { 94 | typedef std::false_type type; 95 | }; 96 | template<> 97 | struct optimized_potrf_detail < 98 | dense_tag, 99 | double 100 | > { 101 | typedef std::true_type type; 102 | }; 103 | template<> 104 | struct optimized_potrf_detail < 105 | dense_tag, 106 | float 107 | > { 108 | typedef std::true_type type; 109 | }; 110 | template<> 111 | struct optimized_potrf_detail < 112 | dense_tag, 113 | std::complex<float> 114 | > { 115 | typedef std::true_type type; 116 | }; 117 | 118 | template<> 119 | struct optimized_potrf_detail < 120 | dense_tag, 121 | std::complex<double> 122 | > { 123 | typedef std::true_type type; 124 | }; 125 | 126 | template<class M> 127 | struct has_optimized_potrf 128 | : public optimized_potrf_detail < 129 | typename M::storage_type::storage_tag, 130 | typename M::value_type 131 | > {}; 132 | }} 133 | #endif 134 | -------------------------------------------------------------------------------- /include/remora/kernels/cblas/cblas_inc.hpp: -------------------------------------------------------------------------------- 1 | /*! 
2 | * 3 | * 4 | * \brief - 5 | * 6 | * \author - 7 | * \date - 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_CBLAS_CBLAS_INC_HPP 32 | #define REMORA_KERNELS_CBLAS_CBLAS_INC_HPP 33 | 34 | #ifdef __APPLE__ 35 | 36 | #ifdef __ASSERTMACROS__ //is AssertMacros already included? 37 | //AssertMacros automatically defines __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES as 1 38 | //if not already included 39 | #if __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 40 | #warning "AssertMacros.h already included by some file. Disabling macros as otherwise compilation will fail" 41 | 42 | //incomplete list (probably the worst offenders that will fail compilation) 43 | #ifdef check 44 | #undef check 45 | #endif 46 | #ifdef require 47 | #undef require 48 | #endif 49 | #ifdef verify 50 | #undef verify 51 | #endif 52 | 53 | #endif 54 | #else 55 | //no one has included it yet, so we can just prevent these macros... 56 | #define __ASSERT_MACROS_DEFINE_VERSIONS_WITHOUT_UNDERSCORES 0 57 | #endif 58 | 59 | // included to make Accelerate work with boost on MacOS 60 | #include 61 | 62 | // Accelerate framework support added by TG 19.06.2015 63 | extern "C" { 64 | #include <Accelerate/Accelerate.h> 65 | } 66 | #undef nil 67 | 68 | #else 69 | 70 | extern "C" { 71 | #include <cblas.h> 72 | } 73 | 74 | #endif 75 | 76 | //all atlas using functions need this anyway... 77 | //so we prevent multiple includes in all atlas using functions 78 | //which should decrease compile time a small bit 79 | #include <complex> 80 | #include "../../detail/traits.hpp" 81 | 82 | namespace remora {namespace bindings { 83 | 84 | template<class Orientation> struct storage_order {}; 85 | template<> struct storage_order<row_major> { 86 | enum ename { value = CblasRowMajor }; 87 | }; 88 | template<> struct storage_order<column_major> { 89 | enum ename { value = CblasColMajor }; 90 | }; 91 | 92 | template<class T> 93 | struct allowed_cblas_type{ 94 | typedef std::false_type type; 95 | }; 96 | 97 | template<> 98 | struct allowed_cblas_type<float>{ 99 | typedef std::true_type type; 100 | }; 101 | template<> 102 | struct allowed_cblas_type<double>{ 103 | typedef std::true_type type; 104 | }; 105 | template<> 106 | struct allowed_cblas_type<std::complex<float> >{ 107 | typedef std::true_type type; 108 | }; 109 | template<> 110 | struct allowed_cblas_type<std::complex<double> >{ 111 | typedef std::true_type type; 112 | }; 113 | 114 | }} 115 | 116 | #ifndef OPENBLAS_CONST 117 | typedef void cblas_float_complex_type; 118 | typedef void cblas_double_complex_type; 119 | #else 120 | typedef float cblas_float_complex_type; 121 | typedef double cblas_double_complex_type; 122 | #endif 123 | 124 | 125 | #endif 126 | -------------------------------------------------------------------------------- /include/remora/kernels/cblas/syrk.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. 
Krause 8 | * \date 2010 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
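How the cblas_inc.hpp traits above are meant to be queried, as a compile-time sketch: allowed_cblas_type whitelists the four element types CBLAS can handle; everything else falls back to the generic kernels.

#include <complex>

static_assert(remora::bindings::allowed_cblas_type<double>::type::value,
	"double maps to the cblas_d* routines");
static_assert(remora::bindings::allowed_cblas_type<std::complex<float> >::type::value,
	"complex<float> maps to the cblas_c* routines");
static_assert(!remora::bindings::allowed_cblas_type<int>::type::value,
	"no cblas routine for int; generic fallback is used");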

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CBLAS_SYRK_HPP 33 | #define REMORA_KERNELS_CBLAS_SYRK_HPP 34 | 35 | #include "cblas_inc.hpp" 36 | #include <type_traits> 37 | 38 | namespace remora{ namespace bindings { 39 | 40 | inline void syrk( 41 | CBLAS_ORDER const order, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, 42 | int N, int K, 43 | float alpha, float const *A, int lda, 44 | float beta, float *C, int ldc 45 | ){ 46 | cblas_ssyrk( 47 | order, uplo, trans, 48 | N, K, 49 | alpha, A, lda, 50 | beta, C, ldc 51 | ); 52 | } 53 | 54 | inline void syrk( 55 | CBLAS_ORDER const order, CBLAS_UPLO uplo, CBLAS_TRANSPOSE trans, 56 | int N, int K, 57 | double alpha, double const *A, int lda, 58 | double beta, double *C, int ldc 59 | ){ 60 | cblas_dsyrk( 61 | order, uplo, trans, 62 | N, K, 63 | alpha, A, lda, 64 | beta, C, ldc 65 | ); 66 | } 67 | 68 | 69 | // C <- C + alpha * A * A^T 70 | template<bool Upper, class MatA, class MatC> 71 | void syrk( 72 | matrix_expression<MatA, cpu_tag> const& A, 73 | matrix_expression<MatC, cpu_tag>& C, 74 | typename MatC::value_type alpha, 75 | std::true_type 76 | ) { 77 | REMORA_SIZE_CHECK(A().size1() == C().size1()); 78 | REMORA_SIZE_CHECK(C().size1() == C().size2()); 79 | 80 | CBLAS_ORDER stor_ord = (CBLAS_ORDER) storage_order<typename MatC::orientation>::value; 81 | CBLAS_UPLO uplo = Upper?CblasUpper: CblasLower; 82 | CBLAS_TRANSPOSE trans = std::is_same<typename MatA::orientation, typename MatC::orientation>::value?CblasNoTrans:CblasTrans; 83 | std::size_t n = C().size1(); 84 | std::size_t k = A().size2(); 85 | 86 | 87 | auto storageA = A().raw_storage(); 88 | auto storageC = C().raw_storage(); 89 | syrk(stor_ord, uplo, trans, 90 | (int)n, (int)k, alpha, 91 | storageA.values, 92 | (int)storageA.leading_dimension, 93 | typename MatC::value_type(1), 94 | storageC.values, 95 | (int)storageC.leading_dimension 96 | ); 97 | } 98 | 99 | template<class MatA, class MatC> 100 | struct has_optimized_syrk: std::integral_constant<bool, 101 | allowed_cblas_type<typename MatA::value_type>::type::value 102 | && std::is_same<typename MatA::value_type, typename MatC::value_type>::value 103 | && std::is_base_of<dense_tag, typename MatA::storage_type::storage_tag>::value 104 | && std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value 105 | >{}; 106 | 107 | }} 108 | 109 | #endif 110 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/gemv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
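For reference, the semantics of the syrk binding above written as plain loops (lower-triangular case, Upper = false): only the lower triangle of C is touched, which is what makes the rank-k update roughly half the cost of a general matrix product.

#include <cstddef>

// Reference-only sketch: C <- C + alpha * A * A^T on the lower triangle.
// Matrix is any type with size1()/size2() and operator()(i,j).
template<class Matrix>
void syrk_reference(Matrix const& A, Matrix& C, double alpha){
	for(std::size_t i = 0; i != C.size1(); ++i)
		for(std::size_t j = 0; j <= i; ++j)           // lower triangle only
			for(std::size_t k = 0; k != A.size2(); ++k)
				C(i, j) += alpha * A(i, k) * A(j, k);
}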

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_GEMV_HPP 33 | #define REMORA_KERNELS_CLBLAST_GEMV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // v <- v + alpha * A * x 41 | template<class MatA, class VecX, class VecV> 42 | void gemv( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | vector_expression<VecX, opencl_tag> const& x, 45 | vector_expression<VecV, opencl_tag>& v, 46 | typename VecV::value_type const& alpha 47 | ) { 48 | REMORA_SIZE_CHECK(A().size1() == v().size()); 49 | REMORA_SIZE_CHECK(A().size2() == x().size()); 50 | 51 | static_assert(std::is_same<typename MatA::value_type, typename VecX::value_type>::value, "[gemv] Arguments do not have same element type"); 52 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[gemv] Arguments do not have same element type"); 53 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[gemv] A is not dense"); 54 | static_assert(std::is_same<typename VecX::evaluation_category::tag, dense_tag>::value, "[gemv] x is not dense"); 55 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[gemv] v does not have dense storage layout"); 56 | 57 | //pre-evaluate A and x into a temporary if necessary 58 | auto const& Aeval = eval_expression(A); 59 | auto const& xeval = eval_expression(x); 60 | 61 | 62 | using namespace clblast; 63 | 64 | //obtain geometry information 65 | auto layout = std::is_same<typename MatA::orientation, row_major>::value? Layout::kRowMajor: Layout::kColMajor; 66 | std::size_t m = A().size1(); 67 | std::size_t n = A().size2(); 68 | 69 | //obtain raw storage 70 | auto storageA = Aeval.raw_storage(); 71 | auto storagex = xeval.raw_storage(); 72 | auto storagev = v().raw_storage(); 73 | 74 | cl_event* event = nullptr;//todo: store events for out-of-order queues 75 | auto code = Gemv(layout, Transpose::kNo, 76 | m, n, alpha, 77 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 78 | storagex.buffer.get(), storagex.offset, storagex.stride, 79 | typename VecV::value_type(1), 80 | storagev.buffer.get(), storagev.offset, storagev.stride, 81 | &v().queue().get(), event 82 | ); 83 | assert(code == StatusCode::kSuccess); 84 | } 85 | 86 | }} 87 | 88 | #endif 89 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/syrk.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2016 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_SYRK_HPP 33 | #define REMORA_KERNELS_CLBLAST_SYRK_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // C <- C + alpha * A * A^T 41 | template<bool Upper, class MatA, class MatC> 42 | void syrk( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | matrix_expression<MatC, opencl_tag>& C, 45 | typename MatC::value_type const& alpha 46 | ) { 47 | REMORA_SIZE_CHECK(A().size1() == C().size1()); 48 | REMORA_SIZE_CHECK(C().size1() == C().size2()); 49 | 50 | static_assert(std::is_same<typename MatA::value_type, typename MatC::value_type>::value, "[syrk] Arguments do not have same element type"); 51 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[syrk] A is not dense"); 52 | static_assert(std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value, "[syrk] C does not have dense storage layout"); 53 | 54 | //pre-evaluate A into a temporary if necessary 55 | auto const& Aeval = eval_expression(A); 56 | 57 | using namespace clblast; 58 | 59 | //obtain geometry information 60 | auto transA = std::is_same<typename MatA::orientation, typename MatC::orientation>::value? Transpose::kNo : Transpose::kYes; 61 | auto layout = std::is_same<typename MatC::orientation, row_major>::value? Layout::kRowMajor: Layout::kColMajor; 62 | auto triangular = Upper? Triangle::kUpper : Triangle::kLower; 63 | std::size_t n = C().size1(); 64 | std::size_t k = A().size2(); 65 | 66 | //obtain matrix storage 67 | auto storageA = Aeval.raw_storage(); 68 | auto storageC = C().raw_storage(); 69 | 70 | //call 71 | cl_event* event = nullptr;//todo: store events for out-of-order queues 72 | auto code = Syrk(layout, triangular, transA, 73 | n, k, alpha, 74 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 75 | typename MatC::value_type(1), 76 | storageC.buffer.get(), storageC.offset, storageC.leading_dimension, 77 | &C().queue().get(), event 78 | ); 79 | 80 | assert(code == StatusCode::kSuccess); 81 | } 82 | 83 | }} 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/trmm.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_TRMM_HPP 33 | #define REMORA_KERNELS_CLBLAST_TRMM_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // C <- AC with A being triangular 41 | template<bool Upper, bool Unit, class MatA, class MatC> 42 | void trmm( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | matrix_expression<MatC, opencl_tag>& C 45 | ){ 46 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 47 | REMORA_SIZE_CHECK(A().size2() == C().size1()); 48 | 49 | static_assert(std::is_same<typename MatA::value_type, typename MatC::value_type>::value, "[trmm] Arguments do not have same element type"); 50 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmm] A is not dense"); 51 | static_assert(std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value, "[trmm] C does not have dense storage layout"); 52 | 53 | //pre-evaluate A into a temporary if necessary 54 | auto const& Aeval = eval_expression(A); 55 | 56 | using namespace clblast; 57 | 58 | //obtain geometry information 59 | auto transA = std::is_same<typename MatA::orientation, typename MatC::orientation>::value? Transpose::kNo : Transpose::kYes; 60 | auto layout = std::is_same<typename MatC::orientation, row_major>::value? Layout::kRowMajor : Layout::kColMajor; 61 | auto diagonal = Unit? Diagonal::kUnit : Diagonal::kNonUnit; 62 | auto triangular = Upper? Triangle::kUpper : Triangle::kLower; 63 | if(transA == Transpose::kYes){//when we transpose the matrix, we also have to change its Triangular type 64 | triangular = Upper? Triangle::kLower : Triangle::kUpper; 65 | } 66 | std::size_t m = C().size1(); 67 | std::size_t n = C().size2(); 68 | 69 | //obtain raw storage 70 | auto storageA = Aeval.raw_storage(); 71 | auto storageC = C().raw_storage(); 72 | 73 | cl_event* event = nullptr;//todo: store events for out-of-order queues 74 | auto code = Trmm(layout, Side::kLeft, triangular, transA, diagonal, 75 | m, n, typename MatC::value_type(1), 76 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 77 | storageC.buffer.get(), storageC.offset, storageC.leading_dimension, 78 | &C().queue().get(), event 79 | ); 80 | assert(code == StatusCode::kSuccess); 81 | } 82 | 83 | }} 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/trmv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *
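The triangle flip above deserves a second look: handing a matrix to the backend as transposed turns an upper-triangular argument into a lower-triangular one, and vice versa. The same decision in isolation:

// trans(A) of an upper-triangular A is lower-triangular, e.g.
//   A = {{1, 2},        trans(A) = {{1, 0},
//        {0, 3}}                    {2, 3}}
// so the Triangle parameter reported to the backend must be flipped:
inline bool report_upper(bool is_upper, bool reported_as_transposed){
	return reported_as_transposed ? !is_upper : is_upper;
}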

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_TRMV_HPP 33 | #define REMORA_KERNELS_CLBLAST_TRMV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // v <- Av with A being triangular 41 | template<bool Upper, bool Unit, class MatA, class VecV> 42 | void trmv( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | vector_expression<VecV, opencl_tag>& v 45 | ){ 46 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 47 | REMORA_SIZE_CHECK(A().size2() == v().size()); 48 | 49 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[trmv] Arguments do not have same element type"); 50 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmv] A is not dense"); 51 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[trmv] v does not have dense storage layout"); 52 | 53 | //pre-evaluate A into a temporary if necessary 54 | auto const& Aeval = eval_expression(A); 55 | 56 | using namespace clblast; 57 | 58 | //obtain geometry information 59 | auto layout = std::is_same<typename MatA::orientation, row_major>::value? Layout::kRowMajor : Layout::kColMajor; 60 | auto triangular = Upper? Triangle::kUpper : Triangle::kLower; 61 | auto diagonal = Unit? Diagonal::kUnit : Diagonal::kNonUnit; 62 | std::size_t n = A().size1(); 63 | 64 | //obtain raw storage 65 | auto storageA = Aeval.raw_storage(); 66 | auto storagev = v().raw_storage(); 67 | 68 | cl_event* event = nullptr;//todo: store events for out-of-order queues 69 | auto code = Trmv(layout, triangular, Transpose::kNo, diagonal, 70 | n, 71 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 72 | storagev.buffer.get(), storagev.offset, storagev.stride, 73 | &v().queue().get(), event 74 | ); 75 | assert(code == StatusCode::kSuccess); 76 | } 77 | 78 | }} 79 | 80 | #endif 81 | -------------------------------------------------------------------------------- /include/remora/kernels/clBlast/trsv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief - 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_CLBLAST_TRSV_HPP 33 | #define REMORA_KERNELS_CLBLAST_TRSV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | #include <clblast.h> 38 | namespace remora{ namespace kernels{ 39 | 40 | // solve Ax = b or xA = b with A being triangular 41 | template<class Triangular, class Side, class MatA, class VecB> 42 | void trsv( 43 | matrix_expression<MatA, opencl_tag> const& A, 44 | vector_expression<VecB, opencl_tag>& b 45 | ){ 46 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 47 | REMORA_SIZE_CHECK(A().size1() == b().size()); 48 | 49 | static_assert(std::is_same<typename MatA::value_type, typename VecB::value_type>::value, "[trsv] Arguments do not have same element type"); 50 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trsv] A is not dense"); 51 | static_assert(std::is_base_of<dense_tag, typename VecB::storage_type::storage_tag>::value, "[trsv] b does not have dense storage layout"); 52 | 53 | //pre-evaluate A into a temporary if necessary 54 | auto const& Aeval = eval_expression(A); 55 | 56 | using namespace clblast; 57 | 58 | //obtain geometry information 59 | auto layout = std::is_same<typename MatA::orientation, row_major>::value? Layout::kRowMajor : Layout::kColMajor; 60 | auto triangular = Triangular::is_upper? Triangle::kUpper : Triangle::kLower; 61 | auto diagonal = Triangular::is_unit? Diagonal::kUnit : Diagonal::kNonUnit; 62 | //transpose if side is right 63 | if(!Side::is_left){ 64 | layout = (layout == Layout::kRowMajor) ? Layout::kColMajor : Layout::kRowMajor; 65 | triangular = Triangular::is_upper? Triangle::kLower : Triangle::kUpper; 66 | } 67 | std::size_t n = A().size1(); 68 | 69 | //obtain raw storage 70 | auto storageA = Aeval.raw_storage(); 71 | auto storageb = b().raw_storage(); 72 | 73 | cl_event* event = nullptr;//todo: store events for out-of-order queues 74 | auto code = Trsv(layout, triangular, Transpose::kNo, diagonal, 75 | n, 76 | storageA.buffer.get(), storageA.offset, storageA.leading_dimension, 77 | storageb.buffer.get(), storageb.offset, storageb.stride, 78 | &b().queue().get(), event 79 | ); 80 | assert(code == StatusCode::kSuccess); 81 | } 82 | 83 | }} 84 | 85 | #endif 86 | -------------------------------------------------------------------------------- /include/remora/kernels/conv2d.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief 2d convolution kernel 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *
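For reference, what the lower-triangular, non-unit case of the solver above computes, written as plain forward substitution (in-place on b); the upper-triangular case is the mirror-image backward substitution.

#include <cstddef>

// Reference-only sketch of solving A x = b for lower-triangular A, x stored in b.
template<class Matrix, class Vector>
void trsv_reference(Matrix const& A, Vector& b){
	std::size_t n = b.size();
	for(std::size_t i = 0; i != n; ++i){
		for(std::size_t j = 0; j != i; ++j)
			b(i) -= A(i, j) * b(j);  // subtract already-solved components
		b(i) /= A(i, i);             // non-unit diagonal
	}
}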

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_CONV2D_HPP 32 | #define REMORA_KERNELS_CONV2D_HPP 33 | 34 | #include "default/conv2d.hpp" 35 | 36 | #ifdef REMORA_USE_CLBLAST 37 | #include "clBlast/conv2d.hpp" 38 | #endif 39 | 40 | namespace remora{namespace kernels{ 41 | 42 | 43 | ///\brief Computes the convolution of a set of multi-channel images with a set of filters. 44 | /// 45 | /// Computes the result of applying k filters to a set of images, where filters and images are allowed 46 | /// to have multiple channels (some would call this a 3d or even 4d convolution, but we refrain from that term as 47 | /// in two of the dimensions the filter and image sizes must agree, i.e. it does not behave like convolving a volume). 48 | /// The base for the convolution is the upper left corner and there is no boundary handling, i.e. only pixels within the image area 49 | /// are computed. 50 | /// 51 | /// The images are stored block-row-wise, i.e. an image of size n x m with k channels is stored as 52 | /// an (n*k) x m matrix where n consecutive rows form one channel. Each image is then stored as one row of the input matrix. 53 | /// Filters are stored similarly, only that in their case we have the format (n1*k*l) x m1 for a 54 | /// set of l filters of size n1 x m1 with k channels each. The n1 rows form a channel, k*n1 rows form 55 | /// a filter. 56 | /// The output is stored in the same way as the images, just with size (l*(n-n1+1)) x (m-m1+1). 57 | /// The caller must ensure that enough memory is available. 
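The block-row-wise layout described above can be summarized by one index formula. The helper below is not part of the kernel, just an illustration of where pixel (r, c) of channel ch ends up inside one image row of the input matrix; it is consistent with the size check images().size2() == num_channels * image_width * image_height further down.

#include <cstddef>

// Illustration only: flat column index of pixel (r, c) in channel ch for an
// image_height x image_width image stored block-row-wise with its channels.
inline std::size_t image_flat_index(
	std::size_t ch, std::size_t r, std::size_t c,
	std::size_t image_height, std::size_t image_width
){
	return (ch * image_height + r) * image_width + c;
}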
58 | template 59 | void conv2d( 60 | matrix_expression const& images, 61 | vector_expression const& filter, 62 | matrix_expression& outputs, 63 | std::size_t num_channels, 64 | std::size_t num_filters, 65 | std::size_t image_height, 66 | std::size_t image_width, 67 | std::size_t filter_height, 68 | std::size_t filter_width, 69 | std::size_t padding_height = 0, 70 | std::size_t padding_width = 0 71 | ){ 72 | std::size_t output_rows_per_filter = (image_height - filter_height +1 + padding_height) * (image_width - filter_width +1 + padding_width); 73 | std::size_t filter_size = filter_width * filter_height * num_channels; 74 | 75 | REMORA_SIZE_CHECK(outputs().size1() == images().size1()); 76 | REMORA_SIZE_CHECK(outputs().size2() == num_filters * output_rows_per_filter); 77 | REMORA_SIZE_CHECK(images().size2() == num_channels * image_width * image_height); 78 | REMORA_SIZE_CHECK(filter().size() == num_filters * filter_size); 79 | 80 | bindings::conv2d( 81 | images, filter, outputs, num_channels, num_filters, 82 | image_height, image_width, filter_height, filter_width, 83 | padding_height, padding_width 84 | ); 85 | } 86 | 87 | }} 88 | #endif 89 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/aligned_alloc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014-2015 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_ALIGNED_ALLOC_HPP 10 | #define BOOST_ALIGN_ALIGNED_ALLOC_HPP 11 | 12 | #include 13 | 14 | #if defined(BOOST_HAS_UNISTD_H) 15 | #include 16 | #endif 17 | 18 | #if defined(__APPLE__) || defined(__APPLE_CC__) || defined(macintosh) 19 | #include 20 | #endif 21 | 22 | #if defined(BOOST_ALIGN_USE_ALLOCATE) 23 | #include "detail/aligned_alloc.hpp" 24 | #elif defined(_MSC_VER) && !defined(UNDER_CE) 25 | #include "detail/aligned_alloc_msvc.hpp" 26 | #elif defined(__MINGW32__) && (__MSVCRT_VERSION__ >= 0x0700) 27 | #include "detail/aligned_alloc_msvc.hpp" 28 | #elif MAC_OS_X_VERSION_MIN_REQUIRED >= 1090 29 | #include "detail/aligned_alloc_posix.hpp" 30 | #elif MAC_OS_X_VERSION_MIN_REQUIRED >= 1060 31 | #include "detail/aligned_alloc_macos.hpp" 32 | #elif defined(__ANDROID__) 33 | #include "detail/aligned_alloc_android.hpp" 34 | #elif defined(__SunOS_5_11) || defined(__SunOS_5_12) 35 | #include "detail/aligned_alloc_posix.hpp" 36 | #elif defined(sun) || defined(__sun) 37 | #include "detail/aligned_alloc_sunos.hpp" 38 | #elif (_POSIX_C_SOURCE >= 200112L) || (_XOPEN_SOURCE >= 600) 39 | #include "detail/aligned_alloc_posix.hpp" 40 | #else 41 | #include "detail/aligned_alloc.hpp" 42 | #endif 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/assume_aligned.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 
10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_ASSUME_ALIGNED_HPP 13 | #define BOOST_ALIGN_ASSUME_ALIGNED_HPP 14 | 15 | #include <boost/config.hpp> 16 | 17 | #if defined(BOOST_MSVC) 18 | #include "detail/assume_aligned_msvc.hpp" 19 | #elif defined(BOOST_CLANG) && defined(__has_builtin) 20 | #include "detail/assume_aligned_clang.hpp" 21 | #elif BOOST_GCC_VERSION >= 40700 22 | #include "detail/assume_aligned_gcc.hpp" 23 | #elif defined(__INTEL_COMPILER) 24 | #include "detail/assume_aligned_intel.hpp" 25 | #else 26 | #include "detail/assume_aligned.hpp" 27 | #endif 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014-2015 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_HPP 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | namespace boost { 18 | namespace alignment { 19 | 20 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 21 | BOOST_NOEXCEPT 22 | { 23 | BOOST_ASSERT(detail::is_alignment(alignment)); 24 | enum { 25 | min_align = std::alignment_of<void*>::value 26 | }; 27 | if (alignment < min_align) { 28 | alignment = min_align; 29 | } 30 | std::size_t n = size + alignment - min_align; 31 | void* r = 0; 32 | void* p = std::malloc(sizeof(void*) + n); 33 | if (p) { 34 | r = static_cast<char*>(p) + sizeof p; 35 | (void)std::align(alignment, size, r, n); 36 | *(static_cast<void**>(r) - 1) = p; 37 | } 38 | return r; 39 | } 40 | 41 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 42 | { 43 | if (ptr) { 44 | std::free(*(static_cast<void**>(ptr) - 1)); 45 | } 46 | } 47 | 48 | } /* .alignment */ 49 | } /* .boost */ 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_android.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_ANDROID_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_ANDROID_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include <boost/assert.hpp> 14 | #include <malloc.h> 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | return ::memalign(alignment, size); 24 | } 25 | 26 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 27 | { 28 | ::free(ptr); 29 | } 30 | 31 | } /* .alignment */ 32 | } /* .boost */ 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_macos.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 
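The generic allocator above over-allocates, stores the original malloc pointer in the word directly in front of the aligned block, and aligned_free reads it back to release the memory. Usage sketch:

#include <cassert>
#include <cstdint>

void aligned_alloc_sketch(){
	// alignment must be a power of two; 64 bytes covers a typical cache line
	void* p = boost::alignment::aligned_alloc(64, 1024);
	assert(p == nullptr || (reinterpret_cast<std::uintptr_t>(p) & 63) == 0);
	boost::alignment::aligned_free(p); // frees via the stored back-pointer
}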
7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MACOS_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MACOS_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | if (size == 0) { 24 | return 0; 25 | } 26 | if (alignment < sizeof(void*)) { 27 | alignment = sizeof(void*); 28 | } 29 | void* p; 30 | if (::posix_memalign(&p, alignment, size) != 0) { 31 | p = 0; 32 | } 33 | return p; 34 | } 35 | 36 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 37 | { 38 | ::free(ptr); 39 | } 40 | 41 | } /* .alignment */ 42 | } /* .boost */ 43 | 44 | #endif 45 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_msvc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MSVC_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_MSVC_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | return ::_aligned_malloc(size, alignment); 24 | } 25 | 26 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 27 | { 28 | ::_aligned_free(ptr); 29 | } 30 | 31 | } /* .alignment */ 32 | } /* .boost */ 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_posix.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_POSIX_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_POSIX_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | if (alignment < sizeof(void*)) { 24 | alignment = sizeof(void*); 25 | } 26 | void* p; 27 | if (::posix_memalign(&p, alignment, size) != 0) { 28 | p = 0; 29 | } 30 | return p; 31 | } 32 | 33 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 34 | { 35 | ::free(ptr); 36 | } 37 | 38 | } /* .alignment */ 39 | } /* .boost */ 40 | 41 | #endif 42 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/aligned_alloc_sunos.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 
7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_SUNOS_HPP 10 | #define BOOST_ALIGN_DETAIL_ALIGNED_ALLOC_SUNOS_HPP 11 | 12 | #include "is_alignment.hpp" 13 | #include 14 | #include 15 | 16 | namespace boost { 17 | namespace alignment { 18 | 19 | inline void* aligned_alloc(std::size_t alignment, std::size_t size) 20 | BOOST_NOEXCEPT 21 | { 22 | BOOST_ASSERT(detail::is_alignment(alignment)); 23 | return ::memalign(alignment, size); 24 | } 25 | 26 | inline void aligned_free(void* ptr) BOOST_NOEXCEPT 27 | { 28 | ::free(ptr); 29 | } 30 | 31 | } /* .alignment */ 32 | } /* .boost */ 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_HPP 14 | 15 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_clang.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 Glen Joseph Fernandes 3 | 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_CLANG_HPP 10 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_CLANG_HPP 11 | 12 | #if __has_builtin(__builtin_assume_aligned) 13 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 14 | (p) = (__typeof__(p))(__builtin_assume_aligned((p), (n))) 15 | #else 16 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) 17 | #endif 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_gcc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_GCC_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_GCC_HPP 14 | 15 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 16 | (p) = (__typeof__(p))(__builtin_assume_aligned((p), (n))) 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_intel.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 
10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_INTEL_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_INTEL_HPP 14 | 15 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 16 | __assume_aligned((p), (n)) 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/assume_aligned_msvc.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2015 NumScale SAS 3 | (c) 2015 LRI UMR 8623 CNRS/University Paris Sud XI 4 | 5 | (c) 2015 Glen Joseph Fernandes 6 | <glen.fernandes@gmail.com> 7 | 8 | Distributed under the Boost Software 9 | License, Version 1.0. 10 | http://boost.org/LICENSE_1_0.txt 11 | */ 12 | #ifndef BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_MSVC_HPP 13 | #define BOOST_ALIGN_DETAIL_ASSUME_ALIGNED_MSVC_HPP 14 | 15 | #include <cstddef> 16 | 17 | #define BOOST_ALIGN_ASSUME_ALIGNED(p, n) \ 18 | __assume(((std::size_t)(p) & ((n) - 1)) == 0) 19 | 20 | #endif 21 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/is_alignment.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_IS_ALIGNMENT_HPP 10 | #define BOOST_ALIGN_DETAIL_IS_ALIGNMENT_HPP 11 | 12 | #include <boost/config.hpp> 13 | #include <cstddef> 14 | 15 | namespace boost { 16 | namespace alignment { 17 | namespace detail { 18 | 19 | BOOST_CONSTEXPR inline bool is_alignment(std::size_t value) 20 | BOOST_NOEXCEPT 21 | { 22 | return (value > 0) && ((value & (value - 1)) == 0); 23 | } 24 | 25 | } /* .detail */ 26 | } /* .alignment */ 27 | } /* .boost */ 28 | 29 | #endif 30 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/is_alignment_constant.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_IS_ALIGNMENT_CONSTANT_HPP 10 | #define BOOST_ALIGN_DETAIL_IS_ALIGNMENT_CONSTANT_HPP 11 | 12 | #include <cstddef> 13 | #include <type_traits> 14 | namespace boost { 15 | namespace alignment { 16 | namespace detail { 17 | 18 | template<std::size_t N> 19 | struct is_alignment_constant 20 | : std::integral_constant<bool, (N > 0) && ((N & (N - 1)) == 0)> { }; 21 | 22 | } /* .detail */ 23 | } /* .alignment */ 24 | } /* .boost */ 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/max_objects.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 
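is_alignment above is the classic power-of-two test: a power of two has exactly one set bit, so value & (value - 1) clears it to zero. Two sample evaluations, checked at compile time through the is_alignment_constant trait:

// 64 = 0b1000000, 63 = 0b0111111, 64 & 63 == 0       -> alignment is valid
// 24 = 0b0011000, 23 = 0b0010111, 24 & 23 == 16 != 0 -> not a power of two
static_assert(boost::alignment::detail::is_alignment_constant<64>::value,
	"64 is a power of two");
static_assert(!boost::alignment::detail::is_alignment_constant<24>::value,
	"24 is not a power of two");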
7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_MAX_OBJECTS_HPP 10 | #define BOOST_ALIGN_DETAIL_MAX_OBJECTS_HPP 11 | 12 | #include <cstddef> 13 | #include <type_traits> 14 | 15 | namespace boost { 16 | namespace alignment { 17 | namespace detail { 18 | 19 | template<class T> 20 | struct max_objects 21 | : std::integral_constant<std::size_t, 22 | ~static_cast<std::size_t>(0) / sizeof(T)> { }; 23 | 24 | } /* .detail */ 25 | } /* .alignment */ 26 | } /* .boost */ 27 | 28 | #endif 29 | -------------------------------------------------------------------------------- /include/remora/kernels/default/boost_align/detail/max_size.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | (c) 2014-2015 Glen Joseph Fernandes 3 | <glen.fernandes@gmail.com> 4 | 5 | Distributed under the Boost Software 6 | License, Version 1.0. 7 | http://boost.org/LICENSE_1_0.txt 8 | */ 9 | #ifndef BOOST_ALIGN_DETAIL_MAX_SIZE_HPP 10 | #define BOOST_ALIGN_DETAIL_MAX_SIZE_HPP 11 | 12 | #include <cstddef> 13 | #include <type_traits> 14 | 15 | namespace boost { 16 | namespace alignment { 17 | namespace detail { 18 | 19 | template<std::size_t A, std::size_t B> 20 | struct max_size 21 | : std::integral_constant<std::size_t, (A > B) ? A : B> { }; 22 | 23 | } /* .detail */ 24 | } /* .alignment */ 25 | } /* .boost */ 26 | 27 | #endif 28 | -------------------------------------------------------------------------------- /include/remora/kernels/default/dot.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief - 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_DOT_HPP 31 | #define REMORA_KERNELS_DEFAULT_DOT_HPP 32 | 33 | #include "../../expression_types.hpp"//vector_expression 34 | #include "../../detail/traits.hpp"//storage tags 35 | 36 | namespace remora{namespace bindings{ 37 | 38 | // Dense case 39 | template<class E1, class E2, class result_type> 40 | void dot( 41 | vector_expression<E1, cpu_tag> const& v1, 42 | vector_expression<E2, cpu_tag> const& v2, 43 | result_type& result, 44 | dense_tag, 45 | dense_tag 46 | ) { 47 | result = result_type(); 48 | auto v1_end = v1().end(); 49 | auto v2_pos = v2().begin(); 50 | for(auto v1_pos = v1().begin(); v1_pos != v1_end; ++v1_pos, ++v2_pos){ 51 | result += (*v1_pos) * (*v2_pos); 52 | } 53 | } 54 | // Sparse case 55 | template<class E1, class E2, class result_type> 56 | void dot( 57 | vector_expression<E1, cpu_tag> const& v1, 58 | vector_expression<E2, cpu_tag> const& v2, 59 | result_type& result, 60 | sparse_tag, 61 | sparse_tag 62 | ) { 63 | typename E1::const_iterator iter1=v1().begin(); 64 | typename E1::const_iterator end1=v1().end(); 65 | typename E2::const_iterator iter2=v2().begin(); 66 | typename E2::const_iterator end2=v2().end(); 67 | result = result_type(); 68 | //be aware of empty vectors! 69 | while(iter1 != end1 && iter2 != end2) 70 | { 71 | std::size_t index1=iter1.index(); 72 | std::size_t index2=iter2.index(); 73 | if(index1==index2){ 74 | result += *iter1 * *iter2; 75 | ++iter1; 76 | ++iter2; 77 | } 78 | else if(index1 > index2){ 79 | ++iter2; 80 | } 81 | else { 82 | ++iter1; 83 | } 84 | } 85 | } 86 | 87 | // Dense-Sparse case 88 | template<class E1, class E2, class result_type> 89 | void dot( 90 | vector_expression<E1, cpu_tag> const& v1, 91 | vector_expression<E2, cpu_tag> const& v2, 92 | result_type& result, 93 | dense_tag, 94 | sparse_tag 95 | ) { 96 | typename E2::const_iterator iter2=v2().begin(); 97 | typename E2::const_iterator end2=v2().end(); 98 | result = result_type(); 99 | auto v1_elem = v1().elements(); 100 | for(;iter2 != end2;++iter2){ 101 | result += v1_elem(iter2.index()) * (*iter2); 102 | } 103 | } 104 | //Sparse-Dense case is reduced to Dense-Sparse using symmetry. 105 | template<class E1, class E2, class result_type> 106 | void dot( 107 | vector_expression<E1, cpu_tag> const& v1, 108 | vector_expression<E2, cpu_tag> const& v2, 109 | result_type& result, 110 | sparse_tag t1, 111 | dense_tag t2 112 | ) { 113 | //use commutativity! 114 | dot(v2,v1,result,t2,t1); 115 | } 116 | 117 | }} 118 | #endif -------------------------------------------------------------------------------- /include/remora/kernels/default/fold_rows.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Folds the rows of a row-major or column major matrix. 5 | * 6 | * \author O. Krause 7 | * \date 2018 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *
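The sparse-sparse case above is a classic two-pointer merge over the two sorted index sequences: only indices present in both vectors contribute. A standalone sketch of the same idea on (index, value) pairs:

#include <cstddef>
#include <utility>
#include <vector>

double sparse_dot(
	std::vector<std::pair<std::size_t, double> > const& a,
	std::vector<std::pair<std::size_t, double> > const& b
){
	double result = 0;
	std::size_t i = 0, j = 0;
	while(i != a.size() && j != b.size()){        // be aware of empty vectors!
		if(a[i].first == b[j].first)
			result += a[i++].second * b[j++].second; // matching index: multiply
		else if(a[i].first > b[j].first)
			++j;                                  // advance the smaller index
		else
			++i;
	}
	return result;
}
// sparse_dot({{1,2.0},{4,1.0}}, {{4,3.0},{7,5.0}}) == 3.0: only index 4 matches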

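 * \par
 * Both kernels below compute, per row i, v(i) += g(f(...f(f(A(i,0), A(i,1)), A(i,2))..., A(i,n-1))).
 * A rough scalar sketch of that contract (illustrative only; f and g are
 * hypothetical callables):
 * \code
 * for(std::size_t i = 0; i != rows; ++i){
 *     double s = A(i, 0);
 *     for(std::size_t j = 1; j != cols; ++j)
 *         s = f(s, A(i, j));
 *     v(i) += g(s);
 * }
 * \endcode
 * The column-major version processes the rows in blocks of 16 so that each
 * pass over a column updates a small, cache-resident accumulator array.
 *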
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_DEFAULT_FOLD_ROWS_HPP 32 | #define REMORA_KERNELS_DEFAULT_FOLD_ROWS_HPP 33 | 34 | #include "../../expression_types.hpp"//for vector/matrix_expression 35 | #include "../../detail/traits.hpp" 36 | 37 | namespace remora{namespace bindings{ 38 | 39 | template<class M, class V, class F, class G> 40 | void fold_rows( 41 | matrix_expression<M, cpu_tag> const& A, 42 | vector_expression<V, cpu_tag>& v, 43 | F f, 44 | G g, 45 | row_major 46 | ){ 47 | for(std::size_t i = 0; i != v().size(); ++i){ 48 | auto end = A().major_end(i); 49 | auto pos = A().major_begin(i); 50 | typename V::value_type s = *pos; 51 | ++pos; 52 | for(; pos != end; ++pos){ 53 | s = f(s,*pos); 54 | } 55 | v()(i) += g(s); 56 | } 57 | } 58 | 59 | template<class M, class V, class F, class G> 60 | void fold_rows( 61 | matrix_expression<M, cpu_tag> const& A, 62 | vector_expression<V, cpu_tag>& v, 63 | F f, 64 | G g, 65 | column_major 66 | ){ 67 | std::size_t n = v().size(); 68 | const std::size_t BLOCK_SIZE = 16; 69 | typename V::value_type storage[BLOCK_SIZE]; 70 | std::size_t numBlocks = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; 71 | auto A_elem = A().elements(); 72 | for(std::size_t b = 0; b != numBlocks; ++b){ 73 | std::size_t start = b * BLOCK_SIZE; 74 | std::size_t cur_size = std::min(BLOCK_SIZE, n - start); 75 | for(std::size_t i = 0; i != cur_size; ++i){ 76 | storage[i] = A_elem(start + i, 0); 77 | } 78 | for(std::size_t j = 1; j != A().size2(); ++j){ 79 | for(std::size_t i = 0; i != cur_size; ++i){ 80 | storage[i] = f(storage[i], A_elem(start + i, j)); 81 | } 82 | } 83 | for(std::size_t i = 0; i != cur_size; ++i){ 84 | v()(start + i) += g(storage[i]); 85 | } 86 | } 87 | } 88 | 89 | //dispatcher for triangular matrix 90 | template<class M, class V, class F, class G, class Orientation, class Triangular> 91 | void fold_rows( 92 | matrix_expression<M, cpu_tag> const& A, 93 | vector_expression<V, cpu_tag>& v, 94 | F f, 95 | G g, 96 | triangular<Orientation, Triangular> 97 | ){ 98 | fold_rows(A,v, f, g, Orientation()); 99 | } 100 | 101 | }} 102 | 103 | #endif 104 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/gemv.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Default matrix-vector multiplication kernels 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

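 * \par
 * The row-major path computes one inner product per row, the column-major
 * path accumulates scaled columns. A minimal usage sketch, assuming
 * remora::matrix/remora::vector are the dense cpu containers with
 * (size, init) style constructors:
 * \code
 * remora::matrix<double> A(2, 2, 1.0);
 * remora::vector<double> x(2, 1.0), r(2, 0.0);
 * bindings::gemv_impl(A, x, r, 2.0, row_major()); // r += 2*A*x == (4,4)
 * \endcode
 *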
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_GEMV_HPP 31 | #define REMORA_KERNELS_DEFAULT_GEMV_HPP 32 | 33 | #include "../../expression_types.hpp" //matrix/vector_expression 34 | #include "../../proxy_expressions.hpp" //matrix row, transpose 35 | #include "../../detail/traits.hpp" //matrix orientations 36 | #include "../default/dot.hpp" //inner product 37 | #include "../vector_assign.hpp" //assignment of vectors 38 | #include <type_traits> //std::false_type marker for unoptimized 39 | 40 | namespace remora{namespace bindings { 41 | 42 | //row major can be further reduced to inner_prod() 43 | template<class ResultV, class MatA, class V> 44 | void gemv_impl( 45 | matrix_expression<MatA, cpu_tag> const& A, 46 | vector_expression<V, cpu_tag> const& x, 47 | vector_expression<ResultV, cpu_tag>& result, 48 | typename ResultV::value_type alpha, 49 | row_major 50 | ) { 51 | typedef typename ResultV::value_type value_type; 52 | value_type value; 53 | for(std::size_t i = 0; i != A().size1();++i){ 54 | bindings::dot(row(A,i), x, value, typename MatA::evaluation_category::tag(), typename V::evaluation_category::tag()); 55 | if(value != value_type())//handling of sparse results. 56 | result()(i) += alpha * value; 57 | } 58 | } 59 | 60 | //column major is implemented by computing a linear combination of matrix-rows 61 | template<class ResultV, class MatA, class V> 62 | void gemv_impl( 63 | matrix_expression<MatA, cpu_tag> const& A, 64 | vector_expression<V, cpu_tag> const& x, 65 | vector_expression<ResultV, cpu_tag>& result, 66 | typename ResultV::value_type alpha, 67 | column_major 68 | ) { 69 | typedef typename V::const_iterator iterator; 70 | typedef typename ResultV::value_type value_type; 71 | typedef typename device_traits<cpu_tag>::template multiply_and_add<value_type> MultAdd; 72 | iterator end = x().end(); 73 | for(iterator it = x().begin(); it != end; ++it) { 74 | //FIXME: for sparse result vectors, this might hurt. 75 | kernels::assign(result, column(A,it.index()), MultAdd(alpha * (*it))); 76 | } 77 | } 78 | 79 | //unknown orientation is dispatched to row_major 80 | template<class ResultV, class MatA, class V> 81 | void gemv_impl( 82 | matrix_expression<MatA, cpu_tag> const& A, 83 | vector_expression<V, cpu_tag> const& x, 84 | vector_expression<ResultV, cpu_tag>& result, 85 | typename ResultV::value_type alpha, 86 | unknown_orientation 87 | ) { 88 | gemv_impl(A,x,result,alpha,row_major()); 89 | } 90 | 91 | // result += alpha * A * x 92 | template<class ResultV, class MatA, class V> 93 | void gemv( 94 | matrix_expression<MatA, cpu_tag> const& A, 95 | vector_expression<V, cpu_tag> const& x, 96 | vector_expression<ResultV, cpu_tag>& result, 97 | typename ResultV::value_type alpha, 98 | std::false_type 99 | ) { 100 | typedef typename MatA::orientation orientation; 101 | 102 | gemv_impl(A, x, result, alpha, orientation()); 103 | } 104 | 105 | }} 106 | #endif 107 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/random.hpp: -------------------------------------------------------------------------------- 1 | /*!
2 | * 3 | * 4 | * \brief Generation of random variates on cpu 5 | * 6 | * \author O. Krause 7 | * \date 2017 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

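 * \par
 * Usage sketch (assuming the dense cpu vector type):
 * \code
 * std::mt19937 rng(42);
 * remora::vector<double> v(10);
 * bindings::generate_uniform(v, rng, 0.0, 1.0); // iid draws from U[0,1)
 * \endcode
 *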
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_RANDOM_HPP 31 | #define REMORA_KERNELS_DEFAULT_RANDOM_HPP 32 | 33 | #include <random> 34 | #include <cmath> 35 | 36 | namespace remora{ namespace bindings{ 37 | template<class V, class Rng> 38 | void generate_normal( 39 | vector_expression<V, cpu_tag>& v, 40 | Rng& rng, 41 | typename V::value_type mean, 42 | typename V::value_type variance 43 | ) { 44 | std::normal_distribution<typename V::value_type> dist(mean,std::sqrt(variance)); 45 | for(auto& val: v()) 46 | val = dist(rng); 47 | } 48 | 49 | template<class M, class Rng> 50 | void generate_normal( 51 | matrix_expression<M, cpu_tag>& m, 52 | Rng& rng, 53 | typename M::value_type mean, 54 | typename M::value_type variance 55 | ) { 56 | std::normal_distribution<typename M::value_type> dist(mean,std::sqrt(variance)); 57 | std::size_t size = M::orientation::index_M(m().size1(),m().size2()); 58 | for(std::size_t i = 0; i != size; ++i){ 59 | auto end = m().major_end(i); 60 | for(auto pos = m().major_begin(i);pos != end; ++pos){ 61 | *pos = dist(rng); 62 | } 63 | } 64 | } 65 | 66 | template<class V, class Rng> 67 | void generate_uniform( 68 | vector_expression<V, cpu_tag>& v, 69 | Rng& rng, 70 | typename V::value_type low, 71 | typename V::value_type high 72 | ) { 73 | std::uniform_real_distribution<typename V::value_type> dist(low,high); 74 | for(auto& val: v()) 75 | val = dist(rng); 76 | } 77 | 78 | template<class M, class Rng> 79 | void generate_uniform( 80 | matrix_expression<M, cpu_tag>& m, 81 | Rng& rng, 82 | typename M::value_type low, 83 | typename M::value_type high 84 | ) { 85 | std::uniform_real_distribution<typename M::value_type> dist(low,high); 86 | std::size_t size = M::orientation::index_M(m().size1(),m().size2()); 87 | for(std::size_t i = 0; i != size; ++i){ 88 | auto end = m().major_end(i); 89 | for(auto pos = m().major_begin(i);pos != end; ++pos){ 90 | *pos = dist(rng); 91 | } 92 | } 93 | } 94 | 95 | template<class V, class Rng> 96 | void generate_discrete( 97 | vector_expression<V, cpu_tag>& v, 98 | Rng& rng, 99 | int low, 100 | int high 101 | ) { 102 | std::uniform_int_distribution<int> dist(low,high); 103 | for(auto& val: v()) 104 | val = dist(rng); 105 | } 106 | 107 | template<class M, class Rng> 108 | void generate_discrete( 109 | matrix_expression<M, cpu_tag>& m, 110 | Rng& rng, 111 | int low, 112 | int high 113 | ) { 114 | std::uniform_int_distribution<int> dist(low,high); 115 | std::size_t size = M::orientation::index_M(m().size1(),m().size2()); 116 | for(std::size_t i = 0; i != size; ++i){ 117 | auto end = m().major_end(i); 118 | for(auto pos = m().major_begin(i);pos != end; ++pos){ 119 | *pos = dist(rng); 120 | } 121 | } 122 | } 123 | 124 | }} 125 | #endif
-------------------------------------------------------------------------------- /include/remora/kernels/default/simd.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Some macros and basic definitions for the use of SIMD block storage 5 | * 6 | * \author O.
Krause 7 | * \date 2016 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

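 * \par
 * block<T> describes the SIMD pack used for element type T; a sketch of what
 * it exposes (only members defined below are used):
 * \code
 * typedef remora::bindings::detail::block<float> b;
 * // b::vector_elements: floats per pack (1 without REMORA_USE_SIMD)
 * // b::type:            the pack type itself (T or a compiler vector of T)
 * // b::align:           required storage alignment in bytes (64)
 * static_assert(b::max_vector_elements * sizeof(float) == REMORA_VECTOR_LENGTH, "");
 * \endcode
 *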
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_DEFAULT_SIMD_HPP 32 | #define REMORA_KERNELS_DEFAULT_SIMD_HPP 33 | 34 | #include <boost/version.hpp> 35 | #include <boost/predef.h> 36 | 37 | //older boost versions have some issues 38 | #if (BOOST_VERSION >= 106300) 39 | #include <boost/align/assume_aligned.hpp> 40 | #include <boost/align/aligned_allocator.hpp> 41 | #else//subset of boost/align 1.63 42 | #include "boost_align/assume_aligned.hpp" 43 | #include "boost_align/aligned_allocator.hpp" 44 | #endif 45 | 46 | 47 | 48 | 49 | #ifdef __AVX__ 50 | #define REMORA_VECTOR_LENGTH 32 51 | #else 52 | #define REMORA_VECTOR_LENGTH 16 53 | #endif 54 | 55 | namespace remora{namespace bindings{namespace detail{ 56 | template<class T> 57 | struct block{ 58 | static const std::size_t max_vector_elements = REMORA_VECTOR_LENGTH/sizeof(T); 59 | #ifdef REMORA_USE_SIMD 60 | static const std::size_t vector_elements = REMORA_VECTOR_LENGTH/sizeof(T); 61 | #ifdef BOOST_COMP_CLANG_DETECTION 62 | typedef T type __attribute__((ext_vector_type (vector_elements))); 63 | #else 64 | typedef T type __attribute__((vector_size (REMORA_VECTOR_LENGTH))); 65 | #endif 66 | #else 67 | static const std::size_t vector_elements = 1; 68 | typedef T type; 69 | #endif 70 | static const std::size_t align = 64; 71 | }; 72 | }}} 73 | #endif 74 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/vector_fold.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for folding vector expressions 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

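 * \par
 * Usage sketch: F is a functor class with a result_type typedef, e.g. the
 * library's add functor (functor name assumed from device_traits):
 * \code
 * typedef remora::device_traits<remora::cpu_tag>::add<double> F;
 * remora::vector<double> v(5, 2.0);
 * double sum = 0.0;
 * remora::bindings::vector_fold<F>(v, sum, remora::dense_tag()); // sum == 10
 * \endcode
 *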
11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_DEFAULT_VECTOR_FOLD_HPP 29 | #define REMORA_KERNELS_DEFAULT_VECTOR_FOLD_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | 33 | namespace remora{namespace bindings{ 34 | template<class F, class V> 35 | void vector_fold(vector_expression<V, cpu_tag> const& v, typename F::result_type& value, dense_tag) { 36 | F f; 37 | auto end = v().end(); 38 | for(auto pos = v().begin(); pos != end; ++pos){ 39 | value = f(value,*pos); 40 | } 41 | } 42 | 43 | template<class F, class V> 44 | void vector_fold(vector_expression<V, cpu_tag> const& v, typename F::result_type& value, sparse_tag) { 45 | F f; 46 | std::size_t nnz = 0; 47 | auto iter = v().begin(); 48 | auto end = v().end(); 49 | for(;iter != end;++iter,++nnz){ 50 | value = f(value,*iter); 51 | } 52 | //apply final operator f(0,v) 53 | if(nnz != v().size()) 54 | value = f(value, 0); 55 | } 56 | 57 | }} 58 | #endif 59 |
-------------------------------------------------------------------------------- /include/remora/kernels/default/vector_max.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Default kernel finding the index of the maximal element of a vector 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_DEFAULT_VECTOR_MAX_HPP 31 | #define REMORA_KERNELS_DEFAULT_VECTOR_MAX_HPP 32 | 33 | #include "../../detail/traits.hpp" 34 | #include <algorithm> 35 | namespace remora{namespace bindings{ 36 | 37 | template<class E, class Tag> 38 | std::size_t vector_max(vector_expression<E, cpu_tag> const& v,Tag) { 39 | return std::max_element(v().begin(),v().end()).index(); 40 | } 41 | 42 | 43 | }} 44 | #endif
-------------------------------------------------------------------------------- /include/remora/kernels/fold_rows.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Folds the rows of a row-major or column major matrix. 5 | * 6 | * \author O. Krause 7 | * \date 2018 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

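 * \par
 * Usage sketch for the front-end kernel below: with f = add and g = identity
 * (functor names assumed from device_traits) it accumulates row sums:
 * \code
 * remora::matrix<double> A(4, 3, 1.0);
 * remora::vector<double> b(4, 0.0);
 * remora::kernels::fold_rows(A, b,
 *     remora::device_traits<remora::cpu_tag>::add<double>(),
 *     remora::device_traits<remora::cpu_tag>::identity<double>()
 * ); // every b(i) is now 3
 * \endcode
 *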
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_FOLD_ROWS_HPP 32 | #define REMORA_KERNELS_FOLD_ROWS_HPP 33 | 34 | #include "default/fold_rows.hpp" 35 | #ifdef REMORA_USE_OPENCL 36 | #include "opencl/fold_rows.hpp" 37 | #endif 38 | #if defined(__HCC__) || defined(__NVCC__) 39 | #include "hip/fold_rows.hpp" 40 | #endif 41 | 42 | 43 | namespace remora {namespace bindings{ 44 | template<class M, class V, class F, class G, class Device> 45 | void fold_rows( 46 | matrix_expression<M, Device> const & A, 47 | vector_expression<V, Device>& b, 48 | F f, 49 | G g, 50 | unknown_orientation 51 | ){ 52 | fold_rows(A, b, f, g, row_major()); 53 | } 54 | } 55 | 56 | namespace kernels{ 57 | ///\brief Folds each row of a matrix with a function f and transforms the result with another function g 58 | /// 59 | /// output v_i is computed as v_i += g( f(A_i0, f(A_i1, ... f(A_i,n-2, A_i,n-1)...))). That is, the result is the same 60 | /// as folding each row separately as if it were a collection of numbers. 61 | template<class M, class V, class F, class G, class Device> 62 | void fold_rows( 63 | matrix_expression<M, Device> const & A, 64 | vector_expression<V, Device>& b, 65 | F f, 66 | G g 67 | ){ 68 | REMORA_SIZE_CHECK(A().size1() == b().size()); 69 | if(A().size1() == 0) return; //undefined 70 | bindings::fold_rows( 71 | A, b, f, g, typename M::orientation() 72 | ); 73 | } 74 | 75 | }} 76 | 77 | #endif 78 |
-------------------------------------------------------------------------------- /include/remora/kernels/gemv.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief matrix-vector multiplication kernel 5 | * 6 | * \author O. Krause 7 | * \date 2012 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

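 * \par
 * Usage sketch (dense cpu containers with (size, init) constructors assumed):
 * \code
 * remora::matrix<double> A(3, 4, 1.0);
 * remora::vector<double> x(4, 1.0), r(3, 0.0);
 * remora::kernels::gemv(A, x, r, 2.0); // r += 2*A*x, so every r(i) == 8
 * \endcode
 *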
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | #ifndef REMORA_KERNELS_GEMV_HPP 31 | #define REMORA_KERNELS_GEMV_HPP 32 | 33 | #include "default/gemv.hpp" 34 | 35 | #ifdef REMORA_USE_CBLAS 36 | #include "cblas/gemv.hpp" 37 | #else 38 | // if no bindings are included, we have to provide the default has_optimized_gemv 39 | // otherwise the binding will take care of this 40 | namespace remora{ namespace bindings{ 41 | template<class M, class E1, class E2> 42 | struct has_optimized_gemv 43 | : public std::false_type{}; 44 | }} 45 | #endif 46 | 47 | #include <type_traits> 48 | 49 | namespace remora{namespace kernels{ 50 | 51 | ///\brief Well known GEneral Matrix-Vector product kernel m += alpha*e1*e2. 52 | /// 53 | /// If bindings are included and the matrix/vector combination allows for a specific binding 54 | /// to be applied, the binding is called automatically from {binding}/gemv.hpp, 55 | /// otherwise default/gemv.hpp is used, which is fully implemented for all dense/sparse combinations. 56 | /// If a combination is optimized, bindings::has_optimized_gemv<M, E1, E2>::type evaluates to std::true_type. 57 | /// The kernels themselves are implemented in bindings::gemv. 58 | template<class M, class E1, class E2> 59 | void gemv( 60 | matrix_expression<E1, cpu_tag> const& e1, 61 | vector_expression<E2, cpu_tag> const& e2, 62 | vector_expression<M, cpu_tag>& m, 63 | typename M::value_type alpha 64 | ) { 65 | assert(m().size() == e1().size1()); 66 | assert(e1().size2() == e2().size()); 67 | 68 | bindings::gemv( 69 | e1, e2, m,alpha, 70 | typename bindings::has_optimized_gemv<M, E1, E2>::type() 71 | ); 72 | } 73 | 74 | }} 75 | 76 | #ifdef REMORA_USE_CLBLAST 77 | #include "clBlast/gemv.hpp" 78 | #elif defined REMORA_USE_OPENCL 79 | #include "opencl/gemv.hpp" 80 | #endif 81 | #if defined(__HCC__) || defined(__NVCC__) 82 | #include "hip/gemv.hpp" 83 | #endif 84 | #endif
-------------------------------------------------------------------------------- /include/remora/kernels/getrf.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Dispatches the GETRF algorithm 5 | * 6 | * \author O. Krause 7 | * \date 2016 8 | * 9 | * 10 | * \par Copyright 1995-2014 Shark Development Team 11 | * 12 | *

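 * \par
 * Usage sketch (the pivot vector element type is assumed to be int):
 * \code
 * std::size_t n = 4;
 * remora::matrix<double> A(n, n);   // fill with an invertible matrix first
 * remora::vector<int> P(n);
 * remora::kernels::getrf(A, P);     // in place: A now holds L (unit diagonal) and U
 * \endcode
 *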
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_GETRF_HPP 32 | #define REMORA_KERNELS_GETRF_HPP 33 | 34 | 35 | #include "default/getrf.hpp" 36 | 37 | namespace remora{namespace kernels { 38 | 39 | ///\brief Implements the GEneral TRiangular matrix Factorisation GETRF. 40 | /// 41 | /// It is better known as the LU decomposition with partial row-pivoting for dense matrices. 42 | /// The algorithm works in place and does not require additional memory. 43 | /// 44 | /// The algorithm computes 45 | /// A = P * L * U 46 | /// 47 | /// where L is lower unit-triangular and U upper triangular. 48 | /// 49 | /// The unit diagonal part of L is not stored explicitly. P is a permutation matrix 50 | /// where P(i) stores the index of the row that row i is swapped with. 51 | template<class MatA, class VecP> 52 | void getrf( 53 | matrix_expression<MatA, cpu_tag>& A, 54 | vector_expression<VecP, cpu_tag>& P 55 | ) { 56 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 57 | REMORA_SIZE_CHECK(P().size() == A().size1()); 58 | return bindings::getrf(A,P); 59 | } 60 | 61 | }} 62 | #endif 63 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/fold_rows.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief Folds the rows of a row-major or column major matrix. 5 | * 6 | * \author O. Krause 7 | * \date 2018 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_HIP_FOLD_ROWS_HPP 32 | #define REMORA_KERNELS_HIP_FOLD_ROWS_HPP 33 | 34 | #include "../../expression_types.hpp" 35 | #include "../../detail/traits.hpp" 36 | 37 | namespace remora{ 38 | 39 | namespace hip{ 40 | template<class MatA, class VecV, class F, class G> 41 | __global__ void fold_rows_kernel(hipLaunchParm lp, MatA A, size_t size1, size_t size2, VecV v, F f, G g){ 42 | typedef typename std::remove_reference<decltype(A(0,0))>::type value_type; 43 | __shared__ value_type folds[64]; 44 | size_t rowid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; 45 | size_t colid = hipThreadIdx_y; 46 | value_type& entry = folds[hipThreadIdx_y]; 47 | if(colid < size2){ 48 | entry = A(rowid,colid); 49 | colid += hipBlockDim_y; 50 | for(;colid < size2; colid += hipBlockDim_y){ 51 | entry = f(entry, A(rowid,colid)); 52 | } 53 | } 54 | __threadfence_block(); 55 | if(hipThreadIdx_y == 0){ 56 | value_type acc = folds[0]; 57 | for(size_t i = 1 ; i < min(size_t(hipBlockDim_y), size2); ++i){ 58 | acc = f(acc, folds[i]); 59 | } 60 | v(rowid) += g(acc); 61 | } 62 | } 63 | } 64 | 65 | namespace bindings{ 66 | 67 | template<class MatA, class VecV, class F, class G, class Orientation> 68 | void fold_rows( 69 | matrix_expression<MatA, hip_tag> const& A, 70 | vector_expression<VecV, hip_tag>& v, 71 | F f, 72 | G g, 73 | Orientation 74 | ){ 75 | std::size_t blockSize1 = 1; 76 | std::size_t blockSize2 = std::min<std::size_t>(64, A().queue().warp_size()); 77 | std::size_t numBlocks1 = A().size1(); 78 | std::size_t numBlocks2 = 1; 79 | auto stream = hip::get_stream(A().queue()).handle(); 80 | hipLaunchKernel( 81 | hip::fold_rows_kernel<decltype(A().elements()), decltype(v().elements()), F, G>, 82 | dim3(numBlocks1, numBlocks2), dim3(blockSize1, blockSize2), 0, stream, 83 | A().elements(), A().size1(), A().size2(), 84 | v().elements(), f, g 85 | ); 86 | } 87 | 88 | 89 | }} 90 | 91 | #endif 92 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/gemv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP GEMV kernel frontend using cuBLAS or rocBLAS backends 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_GEMV_HPP 33 | #define REMORA_KERNELS_HIP_GEMV_HPP 34 | 35 | #include "../../proxy_expressions.hpp" 36 | #include "../../hip/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | 42 | namespace remora{ 43 | namespace kernels{ 44 | 45 | // v <- v + alpha * A * x 46 | template<class MatA, class VecX, class VecV> 47 | void gemv( 48 | matrix_expression<MatA, hip_tag> const& A, 49 | vector_expression<VecX, hip_tag> const& x, 50 | vector_expression<VecV, hip_tag>& v, 51 | typename VecV::value_type const& alpha 52 | ) { 53 | REMORA_SIZE_CHECK(A().size1() == v().size()); 54 | REMORA_SIZE_CHECK(A().size2() == x().size()); 55 | 56 | static_assert(std::is_same<typename MatA::value_type, typename VecX::value_type>::value, "[gemv] Arguments do not have same element type"); 57 | static_assert(std::is_same<typename VecX::value_type, typename VecV::value_type>::value, "[gemv] Arguments do not have same element type"); 58 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[gemv] A is not dense"); 59 | static_assert(std::is_same<typename VecX::evaluation_category::tag, dense_tag>::value, "[gemv] x is not dense"); 60 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[gemv] v does not have dense storage layout"); 61 | 62 | //pre-evaluate A and x into a temporary if necessary 63 | auto const& Aeval = eval_expression(A); 64 | auto const& xeval = eval_expression(x); 65 | 66 | //obtain geometry information 67 | bool transA = std::is_same<typename MatA::orientation, row_major>::value; 68 | std::size_t m = A().size1(); 69 | std::size_t n = A().size2(); 70 | if(transA) 71 | std::swap(m,n); 72 | 73 | //obtain matrix storage 74 | auto storageA = Aeval.raw_storage(); 75 | auto storagex = xeval.raw_storage(); 76 | auto storagev = v().raw_storage(); 77 | 78 | hip::get_blas(A().queue()).gemv( 79 | transA, 80 | m, n, 81 | alpha, 82 | storageA.values, storageA.leading_dimension, 83 | storagex.values, storagex.stride, 84 | typename VecV::value_type(1), 85 | storagev.values, storagev.stride, 86 | hip::get_stream(A().queue()) 87 | ); 88 | } 89 | 90 | }} 91 | 92 | #endif 93 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/syrk.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP SYRK kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2016 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

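 * \par
 * Layout note: cuBLAS assumes column-major storage, so a row-major C buffer is
 * treated as the column-major view of C^T. Since C^T = (A*A^T)^T = A*A^T, the
 * same update can be written into that view by flipping which triangle is
 * written and whether A enters the call transposed, which is what the flag
 * flipping in the function below implements.
 *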
14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_SYRK_HPP 33 | #define REMORA_KERNELS_HIP_SYRK_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | namespace remora{ namespace kernels{ 42 | 43 | // C <- C + alpha * A * A^T 44 | template<bool Upper, class MatA, class MatC> 45 | void syrk( 46 | matrix_expression<MatA, hip_tag> const& A, 47 | matrix_expression<MatC, hip_tag>& C, 48 | typename MatC::value_type const& alpha 49 | ) { 50 | REMORA_SIZE_CHECK(A().size1() == C().size1()); 51 | REMORA_SIZE_CHECK(C().size1()== C().size2()); 52 | 53 | static_assert(std::is_same<typename MatA::value_type, typename MatC::value_type>::value, "[syrk] Arguments do not have same element type"); 54 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[syrk] A is not dense"); 55 | static_assert(std::is_base_of<dense_tag, typename MatC::storage_type::storage_tag>::value, "[syrk] C does not have dense storage layout"); 56 | 57 | //pre-evaluate A into a temporary if necessary 58 | auto const& Aeval = eval_expression(A); 59 | 60 | //obtain geometry information 61 | bool transA = !std::is_same<typename MatA::orientation, typename MatC::orientation>::value; 62 | bool is_column_majorC = std::is_same<typename MatC::orientation, column_major>::value; 63 | bool upperA = Upper; 64 | if(!is_column_majorC){ 65 | transA = !transA; 66 | upperA = !upperA; 67 | } 68 | 69 | 70 | std::size_t n = C().size1(); 71 | std::size_t k = A().size2(); 72 | 73 | //obtain matrix storage 74 | auto storageA = Aeval.raw_storage(); 75 | auto storageC = C().raw_storage(); 76 | 77 | hip::get_blas(C().queue()).syrk( 78 | upperA, transA, 79 | n, k, alpha, 80 | storageA.values, storageA.leading_dimension, 81 | typename MatC::value_type(1), 82 | storageC.values, storageC.leading_dimension, 83 | hip::get_stream(C().queue()) 84 | ); 85 | } 86 | 87 | }} 88 | 89 | #endif 90 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/trmm.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP TRMM kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_TRMM_HPP 33 | #define REMORA_KERNELS_HIP_TRMM_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | 42 | namespace remora{ namespace kernels{ 43 | 44 | // B <- A B with A being triangular 45 | template<bool Upper, bool Unit, class MatA, class MatB> 46 | void trmm_impl( 47 | matrix_expression<MatA, hip_tag> const& A, 48 | matrix_expression<MatB, hip_tag>& B, 49 | column_major 50 | ){ 51 | 52 | //obtain geometry information 53 | auto transA = !std::is_same<typename MatB::orientation, column_major>::value; 54 | 55 | //obtain raw storage 56 | auto storageA = A().raw_storage(); 57 | auto storageB = B().raw_storage(); 58 | 59 | if(!transA){ 60 | hip::get_blas(B().queue()).trmm( 61 | Left, Upper, transA, Unit, 62 | B().size1(), B().size2(), 63 | typename MatB::value_type(1), 64 | storageA.values, storageA.leading_dimension, 65 | storageB.values, storageB.leading_dimension, 66 | hip::get_stream(B().queue()) 67 | ); 68 | }else{ 69 | hip::get_blas(B().queue()).trmm( 70 | !Left, Upper, transA, Unit, 71 | B().size2(), B().size1(), 72 | typename MatB::value_type(1), 73 | storageA.values, storageA.leading_dimension, 74 | storageB.values, storageB.leading_dimension, 75 | hip::get_stream(B().queue()) 76 | ); 77 | } 78 | } 79 | 80 | template<bool Upper, bool Unit, class MatA, class MatB> 81 | void trmm_impl( 82 | matrix_expression<MatA, hip_tag> const& A, 83 | matrix_expression<MatB, hip_tag>& B, 84 | row_major 85 | ) { 86 | auto transB = trans(B); 87 | trmm_impl<!Upper, Unit>(trans(A), transB, column_major()); 88 | } 89 | 90 | template<bool Upper, bool Unit, class MatA, class MatB> 91 | void trmm( 92 | matrix_expression<MatA, hip_tag> const& A, 93 | matrix_expression<MatB, hip_tag>& B 94 | ){ 95 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 96 | REMORA_SIZE_CHECK(A().size2() == B().size1()); 97 | 98 | static_assert(std::is_same<typename MatA::value_type, typename MatB::value_type>::value, "[trmm] Arguments do not have same element type"); 99 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmm] A is not dense"); 100 | static_assert(std::is_base_of<dense_tag, typename MatB::storage_type::storage_tag>::value, "[trmm] B does not have dense storage layout"); 101 | 102 | //pre-evaluate A into a temporary if necessary 103 | auto const& Aeval = eval_expression(A); 104 | 105 | trmm_impl<Upper, Unit>(Aeval, B, typename MatA::orientation()); 106 | } 107 | 108 | }} 109 | 110 | #endif 111 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/trmv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP TRMV kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_TRMV_HPP 33 | #define REMORA_KERNELS_HIP_TRMV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | namespace remora{ namespace kernels{ 42 | 43 | // v <- Av with A being triangular 44 | template<bool Upper, bool Unit, class MatA, class VecV> 45 | void trmv( 46 | matrix_expression<MatA, hip_tag> const& A, 47 | vector_expression<VecV, hip_tag>& v 48 | ){ 49 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 50 | REMORA_SIZE_CHECK(A().size2() == v().size()); 51 | 52 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[trmv] Arguments do not have same element type"); 53 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trmv] A is not dense"); 54 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[trmv] v does not have dense storage layout"); 55 | 56 | //pre-evaluate A into a temporary if necessary 57 | auto const& Aeval = eval_expression(A); 58 | 59 | //obtain geometry information 60 | auto transA = std::is_same<typename MatA::orientation, row_major>::value; 61 | bool triangular = transA? !Upper : Upper; 62 | std::size_t n = A().size1(); 63 | 64 | //obtain raw storage 65 | auto storageA = Aeval.raw_storage(); 66 | auto storagev = v().raw_storage(); 67 | 68 | hip::get_blas(v().queue()).trmv( 69 | triangular, transA, Unit, 70 | n, 71 | storageA.values, storageA.leading_dimension, 72 | storagev.values, storagev.stride, 73 | hip::get_stream(v().queue()) 74 | ); 75 | } 76 | }} 77 | 78 | #endif 79 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/trsv.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief HIP TRSV kernel frontend 6 | * 7 | * \author O. Krause 8 | * \date 2017 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

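 * \par
 * Usage sketch (tag names assumed from the library's triangular and side
 * markers; A and b are device containers):
 * \code
 * // solve A x = b in place for a lower, non-unit triangular A;
 * // b is overwritten with the solution x:
 * remora::kernels::trsv<remora::lower, remora::left>(A, b);
 * \endcode
 *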
14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_HIP_TRSV_HPP 33 | #define REMORA_KERNELS_HIP_TRSV_HPP 34 | 35 | #include "../../expression_types.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #ifdef __NVCC__ 39 | #include "../../hip/cublas_backend.hpp" 40 | #endif 41 | namespace remora{ namespace kernels{ 42 | 43 | // solve Ax = b or xA=b with A being triangular 44 | template<class Triangular, class Side, class MatA, class VecV> 45 | void trsv( 46 | matrix_expression<MatA, hip_tag> const& A, 47 | vector_expression<VecV, hip_tag>& v 48 | ){ 49 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 50 | REMORA_SIZE_CHECK(A().size2() == v().size()); 51 | 52 | static_assert(std::is_same<typename MatA::value_type, typename VecV::value_type>::value, "[trsv] Arguments do not have same element type"); 53 | static_assert(std::is_same<typename MatA::evaluation_category::tag, dense_tag>::value, "[trsv] A is not dense"); 54 | static_assert(std::is_base_of<dense_tag, typename VecV::storage_type::storage_tag>::value, "[trsv] v does not have dense storage layout"); 55 | 56 | //pre-evaluate A into a temporary if necessary 57 | auto const& Aeval = eval_expression(A); 58 | 59 | //obtain geometry information 60 | auto transA = !std::is_same<typename MatA::orientation, column_major>::value; 61 | bool upperA = transA? !Triangular::is_upper : Triangular::is_upper; 62 | //transpose if side is right 63 | if(!Side::is_left){ 64 | transA = !transA; 65 | } 66 | 67 | std::size_t n = A().size1(); 68 | 69 | 70 | //obtain raw storage 71 | auto storageA = Aeval.raw_storage(); 72 | auto storagev = v().raw_storage(); 73 | 74 | hip::get_blas(v().queue()).trsv( 75 | upperA, transA, Triangular::is_unit, 76 | n, 77 | storageA.values, storageA.leading_dimension, 78 | storagev.values, storagev.stride, 79 | hip::get_stream(v().queue()) 80 | ); 81 | } 82 | }} 83 | 84 | #endif 85 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/vector_fold.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for folding vectors with HIP 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_HIP_VECTOR_FOLD_HPP 29 | #define REMORA_KERNELS_HIP_VECTOR_FOLD_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | #include "../../hip/buffer.hpp" 34 | namespace remora{ 35 | namespace hip{ 36 | template<class VecV, class R, class F> 37 | __global__ void vector_fold_kernel(hipLaunchParm lp, VecV v, size_t size, R* resultp, F f){ 38 | __shared__ R folds[64]; 39 | R& entry = folds[hipThreadIdx_x]; 40 | size_t i = hipThreadIdx_x; 41 | if(i < size){ 42 | entry = v(i); 43 | i += hipBlockDim_x; 44 | for(;i < size; i += hipBlockDim_x){ 45 | entry = f(entry, v(i)); 46 | } 47 | } 48 | __threadfence(); 49 | 50 | if(hipThreadIdx_x == 0){ 51 | for(size_t i = 0 ; i < min(size_t(hipBlockDim_x), size); ++i){ 52 | *resultp = f(*resultp, folds[i]); 53 | } 54 | } 55 | } 56 | } 57 | namespace bindings{ 58 | template<class F, class V> 59 | void vector_fold(vector_expression<V, hip_tag> const& v, typename F::result_type& value, dense_tag){ 60 | if(v().size() == 0) return; 61 | typedef typename F::result_type value_type; 62 | hip::buffer<value_type> result(1, v().queue()); 63 | 64 | hipMemcpy(result.get(), &value, sizeof(value), hipMemcpyHostToDevice); 65 | 66 | std::size_t blockSize = std::min<std::size_t>(64, v().queue().warp_size()); 67 | std::size_t numBlocks = 1; 68 | auto stream = hip::get_stream(v().queue()).handle(); 69 | hipLaunchKernel( 70 | hip::vector_fold_kernel<decltype(v().elements()), value_type, F>, 71 | dim3(numBlocks), dim3(blockSize), 0, stream, 72 | v().elements(), v().size(), result.get(), F() 73 | ); 74 | hipMemcpy(&value, result.get(), sizeof(value), hipMemcpyDeviceToHost); 75 | } 76 | 77 | 78 | }} 79 | #endif 80 |
-------------------------------------------------------------------------------- /include/remora/kernels/hip/vector_max.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for getting the maximum element of a vector with HIP 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_HIP_VECTOR_MAX_HPP 29 | #define REMORA_KERNELS_HIP_VECTOR_MAX_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | #include "../../hip/buffer.hpp" 34 | namespace remora{ 35 | namespace hip{ 36 | template<class VecV> 37 | __global__ void vector_max_kernel(hipLaunchParm lp, VecV v, size_t size, size_t* max){ 38 | typedef typename std::remove_const< 39 | typename std::remove_reference<decltype(v(0))>::type 40 | > ::type value_type; 41 | __shared__ value_type max_value[64]; 42 | __shared__ std::size_t max_index[64]; 43 | value_type& thread_max = max_value[hipThreadIdx_x]; 44 | std::size_t& thread_index = max_index[hipThreadIdx_x]; 45 | thread_max = 1.e-30; //initialization assumes the maximal element is larger than this 46 | thread_index = 0; 47 | for(size_t i = hipThreadIdx_x; i < size; i += hipBlockDim_x){ 48 | if(thread_max < v(i)){ 49 | thread_max = v(i); 50 | thread_index = i; 51 | } 52 | } 53 | __threadfence(); 54 | 55 | if(hipThreadIdx_x == 0){ 56 | for(size_t i = 1 ; i < min(size_t(hipBlockDim_x), size); ++i){ 57 | if(thread_max < max_value[i]){ 58 | thread_max = max_value[i]; 59 | thread_index = max_index[i]; 60 | } 61 | } 62 | *max = thread_index; 63 | } 64 | } 65 | } 66 | namespace bindings{ 67 | template<class V> 68 | std::size_t vector_max(vector_expression<V, hip_tag> const& v, dense_tag){ 69 | if(v().size() == 0) return 0; 70 | hip::buffer<std::size_t> result(1, v().queue()); 71 | 72 | std::size_t blockSize = std::min<std::size_t>(64, v().queue().warp_size()); 73 | std::size_t numBlocks = (v().size() + blockSize - 1) / blockSize; 74 | auto stream = hip::get_stream(v().queue()).handle(); 75 | hipLaunchKernel( 76 | hip::vector_max_kernel<decltype(v().elements())>, 77 | dim3(numBlocks), dim3(blockSize), 0, stream, 78 | v().elements(), v().size(), result.get() 79 | ); 80 | std::size_t index; 81 | hipMemcpy(&index, result.get(), sizeof(index), hipMemcpyDeviceToHost); 82 | return index; 83 | } 84 | 85 | 86 | }} 87 | #endif 88 |
-------------------------------------------------------------------------------- /include/remora/kernels/lapack/fortran.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Defines Fortran naming conventions when binding to lapack routines 3 | * 4 | * \author O. Krause 5 | * \date 2012 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | * This is based on boost::numeric::bindings, written by Toon Knapen and Kresimir Fresl 11 | * 12 | *

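 * \par
 * FORTRAN_ID is used when declaring C prototypes for Fortran routines; a
 * sketch with LAPACK's dpotrf:
 * \code
 * extern "C" void FORTRAN_ID(dpotrf)(
 *     const char* uplo, const int* n, double* a, const int* lda, int* info
 * ); // expands to dpotrf_ with gcc/clang, dpotrf with MSVC
 * \endcode
 *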
13 | * This file is part of Shark. 14 | * 15 | * 16 | * Shark is free software: you can redistribute it and/or modify 17 | * it under the terms of the GNU Lesser General Public License as published 18 | * by the Free Software Foundation, either version 3 of the License, or 19 | * (at your option) any later version. 20 | * 21 | * Shark is distributed in the hope that it will be useful, 22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 24 | * GNU Lesser General Public License for more details. 25 | * 26 | * You should have received a copy of the GNU Lesser General Public License 27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 28 | * 29 | */ 30 | 31 | #ifndef REMORA_KERNELS_LAPACK_FORTRAN_H 32 | #define REMORA_KERNELS_LAPACK_FORTRAN_H 33 | 34 | #if defined(BIND_FORTRAN_LOWERCASE_UNDERSCORE) || defined(BIND_FORTRAN_LOWERCASE) 35 | // Allow manual override of the defaults, e.g. if you want to use a fortran 36 | // lib compiled with gcc from MSVC 37 | #else 38 | 39 | // First we need to know what the conventions for linking 40 | // C with Fortran is on this platform/toolset 41 | #if defined(__GNUC__) || defined(__ICC) || defined(__sgi) || defined(__COMO__) || defined(__KCC) 42 | #define BIND_FORTRAN_LOWERCASE_UNDERSCORE 43 | #elif defined(__IBMCPP__) || defined(_MSC_VER) 44 | #define BIND_FORTRAN_LOWERCASE 45 | #else 46 | #error do not know how to link with fortran for the given platform 47 | #endif 48 | 49 | #endif 50 | 51 | // Next we define macros to convert our symbols to 52 | // the current convention 53 | #if defined(BIND_FORTRAN_LOWERCASE_UNDERSCORE) 54 | #define FORTRAN_ID( id ) id##_ 55 | #elif defined(BIND_FORTRAN_LOWERCASE) 56 | #define FORTRAN_ID( id ) id 57 | #else 58 | #error do not know how to bind to fortran calling convention 59 | #endif 60 | 61 | #endif 62 |
-------------------------------------------------------------------------------- /include/remora/kernels/lapack/syev.hpp: -------------------------------------------------------------------------------- 1 | //=========================================================================== 2 | /*! 3 | * 4 | * 5 | * \brief Contains the lapack bindings for the symmetric eigenvalue problem syev. 6 | * 7 | * \author O. Krause 8 | * \date 2010 9 | * 10 | * 11 | * \par Copyright 1995-2015 Shark Development Team 12 | * 13 | *

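 * \par
 * Usage sketch: A is overwritten with the eigenvectors and the eigenvalues
 * are returned largest-first after the reversal at the end of this file:
 * \code
 * std::size_t n = 4;
 * remora::matrix<double> A(n, n);   // fill with a symmetric matrix first
 * remora::vector<double> lambda(n);
 * remora::bindings::syev(A, lambda);
 * \endcode
 *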
14 | * This file is part of Shark. 15 | * 16 | * 17 | * Shark is free software: you can redistribute it and/or modify 18 | * it under the terms of the GNU Lesser General Public License as published 19 | * by the Free Software Foundation, either version 3 of the License, or 20 | * (at your option) any later version. 21 | * 22 | * Shark is distributed in the hope that it will be useful, 23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 25 | * GNU Lesser General Public License for more details. 26 | * 27 | * You should have received a copy of the GNU Lesser General Public License 28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 29 | * 30 | */ 31 | //=========================================================================== 32 | #ifndef REMORA_KERNELS_LAPACK_SYEV_HPP 33 | #define REMORA_KERNELS_LAPACK_SYEV_HPP 34 | 35 | #include "fortran.hpp" 36 | #include "../../detail/traits.hpp" 37 | 38 | #define REMORA_LAPACK_DSYEV FORTRAN_ID(dsyev) 39 | 40 | extern "C"{ 41 | void REMORA_LAPACK_DSYEV( 42 | const char* jobz, const char* uplo, const int *n, 43 | double* a, const int * lda, double* w, 44 | double* work, const int * lwork, int* info 45 | ); 46 | } 47 | 48 | 49 | 50 | namespace remora {namespace bindings { 51 | 52 | inline void syev( 53 | int n, bool upper, 54 | double* A, int lda, 55 | double* eigenvalues 56 | ){ 57 | if(n == 0) return; 58 | int lwork = std::min(130,4*n)*n; 59 | double* work = new double[lwork]; 60 | int info; 61 | char job = 'V'; 62 | char uplo = upper?'U':'L'; 63 | REMORA_LAPACK_DSYEV(&job, &uplo, &n, A, &lda,eigenvalues,work,&lwork,&info); 64 | delete[] work; 65 | 66 | } 67 | 68 | 69 | template <class MatA, class VectorB> 70 | void syev( 71 | matrix_expression<MatA, cpu_tag>& A, 72 | vector_expression<VectorB, cpu_tag>& eigenValues 73 | ) { 74 | REMORA_SIZE_CHECK(A().size1() == A().size2()); 75 | REMORA_SIZE_CHECK(A().size1() == eigenValues().size()); 76 | 77 | std::size_t n = A().size1(); 78 | bool upper = false; 79 | //lapack is column major storage. 80 | if(std::is_same<typename MatA::orientation, row_major>::value){ 81 | upper = !upper; 82 | } 83 | auto storageA = A().raw_storage(); 84 | auto storageEig = eigenValues().raw_storage(); 85 | syev( 86 | n, upper, 87 | storageA.values, 88 | storageA.leading_dimension, 89 | storageEig.values 90 | ); 91 | 92 | A() = trans(A); 93 | 94 | //reverse eigenvectors and eigenvalues 95 | for (int i = 0; i < (int)n-i-1; i++) 96 | { 97 | int l = n-i-1; 98 | std::swap(eigenValues()( l ),eigenValues()( i )); 99 | } 100 | for (int j = 0; j < (int)n; j++) { 101 | for (int i = 0; i < (int)n-i-1; i++) 102 | { 103 | int l = n-i-1; 104 | std::swap(A()( j , l ), A()( j , i )); 105 | } 106 | } 107 | } 108 | 109 | }} 110 | 111 | #undef REMORA_LAPACK_DSYEV 112 | 113 | #endif 114 |
-------------------------------------------------------------------------------- /include/remora/kernels/opencl/vector_assign.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Assignment kernels for vector expressions 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_CLBLAS_VECTOR_ASSIGN_HPP 29 | #define REMORA_KERNELS_CLBLAS_VECTOR_ASSIGN_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | 34 | namespace remora{namespace bindings{ 35 | 36 | template<class F, class V> 37 | void apply(vector_expression<V, opencl_tag>& v_unreg, F const& f_unreg) { 38 | if(v_unreg().size() == 0) return; 39 | opencl::detail::meta_kernel k("blas_vector_apply_dense"); 40 | 41 | auto v = k.register_args(v_unreg().elements()); 42 | auto f = k.register_args(f_unreg); 43 | 44 | //create source 45 | k<< v(k.expr<cl_uint>("get_global_id(0)")) <<" = "<< f(v(k.expr<cl_uint>("get_global_id(0)")))<<";\n"; 46 | //enqueue kernel 47 | boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context()); 48 | std::size_t global_work_size[1] = {v_unreg().size()}; 49 | v_unreg().queue().enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, nullptr); 50 | } 51 | 52 | template<class F, class V> 53 | void assign(vector_expression<V, opencl_tag>& v, typename V::value_type t) { 54 | static_assert(std::is_base_of<dense_tag, typename V::storage_type::storage_tag>::value, "target must have dense storage for assignment"); 55 | auto f = device_traits<opencl_tag>::make_bind_second(F(), t); 56 | apply(v,f); 57 | } 58 | 59 | //////////////////////////////////////////// 60 | //assignment with functor 61 | //////////////////////////////////////////// 62 | 63 | // Dense-Dense case 64 | template<class V, class E, class F> 65 | void vector_assign_functor( 66 | vector_expression<V, opencl_tag>& v_unreg, 67 | vector_expression<E, opencl_tag> const& e_unreg, 68 | F f_unreg, 69 | dense_tag, dense_tag 70 | ) { 71 | if(v_unreg().size() == 0) return; 72 | 73 | opencl::detail::meta_kernel k("blas_vector_assign_functor_dense"); 74 | 75 | auto v = k.register_args(v_unreg().elements()); 76 | auto e = k.register_args(e_unreg().elements()); 77 | auto f = k.register_args(f_unreg); 78 | 79 | //create source 80 | k<< v(k.expr<cl_uint>("get_global_id(0)")) <<" = "<< f(v(k.expr<cl_uint>("get_global_id(0)")), e(k.expr<cl_uint>("get_global_id(0)")))<<";\n"; 81 | //enqueue kernel 82 | boost::compute::kernel kernel = k.compile(v_unreg().queue().get_context()); 83 | std::size_t global_work_size[1] = {v_unreg().size()}; 84 | v_unreg().queue().enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, nullptr); 85 | } 86 | 87 | //////////////////////////////////////////// 88 | //direct assignment of two vectors 89 | //////////////////////////////////////////// 90 | 91 | 92 | template<class V, class E> 93 | void vector_assign( 94 | vector_expression<V, opencl_tag>& v, vector_expression<E, opencl_tag> const& e, 95 | dense_tag t, dense_tag 96 | ) { 97 | vector_assign_functor(v, e, typename device_traits<opencl_tag>::template right_arg<typename V::value_type>(), t, t); 98 | } 99 | 100 | 101 | 102 | 103 | }} 104 | #endif 105 |
-------------------------------------------------------------------------------- /include/remora/kernels/opencl/vector_fold.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * \brief Kernels for folding vectors with OpenCL 3 | * 4 | * \author O. Krause 5 | * \date 2016 6 | * 7 | * 8 | * \par Copyright 1995-2015 Shark Development Team 9 | * 10 | *

11 | * This file is part of Shark. 12 | * 13 | * 14 | * Shark is free software: you can redistribute it and/or modify 15 | * it under the terms of the GNU Lesser General Public License as published 16 | * by the Free Software Foundation, either version 3 of the License, or 17 | * (at your option) any later version. 18 | * 19 | * Shark is distributed in the hope that it will be useful, 20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of 21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 22 | * GNU Lesser General Public License for more details. 23 | * 24 | * You should have received a copy of the GNU Lesser General Public License 25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>. 26 | * 27 | */ 28 | #ifndef REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP 29 | #define REMORA_KERNELS_CLBLAS_VECTOR_FOLD_HPP 30 | 31 | #include "../../expression_types.hpp" 32 | #include "../../detail/traits.hpp" 33 | #include <boost/compute/container/array.hpp> 34 | #include <boost/compute/algorithm/copy_n.hpp> 35 | namespace remora{namespace bindings{ 36 | 37 | template<class F, class V> 38 | void vector_fold(vector_expression<V, opencl_tag> const& v_unreg, typename F::result_type& value, dense_tag) { 39 | if(v_unreg().size() == 0) return; 40 | auto& queue = v_unreg().queue(); 41 | typedef typename F::result_type value_type; 42 | opencl::detail::meta_kernel k("blas_vector_fold"); 43 | std::size_t size_index = k.add_arg<std::size_t>("size"); 44 | auto v = k.register_args(v_unreg().elements()); 45 | auto f = k.register_args(F()); 46 | 47 | boost::compute::array<value_type, 1> device_result; 48 | boost::compute::copy_n(&value, 1, device_result.begin(), queue); 49 | auto exprSubFold = k.expr<value_type>("subfold[get_local_id(0)]"); 50 | k << "__local " << k.decl<value_type>("subfold") << "[TILE_DIM];\n"; 51 | k << exprSubFold<<" = "<< v(k.expr<cl_uint>("min(size-1,get_local_id(0))"))<<";\n"; 52 | k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n "; 53 | k << exprSubFold << '=' << f(exprSubFold,v(k.expr<cl_uint>("i")))<<";\n"; 54 | k << "}\n"; 55 | k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing 56 | //sum up the rows 57 | k << "if(get_local_id(0) == 0){\n"; 58 | k << " for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n"; 59 | k << " subfold[0] =" << f(k.expr<value_type>("subfold[0]"),k.expr<value_type>("subfold[i]"))<<";\n"; 60 | k << " }\n "; 61 | k << device_result.begin()[0]<< "= subfold[0];\n"; 62 | k << "}\n"; 63 | 64 | std::size_t TILE_DIM = 32; 65 | boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32"); 66 | kernel.set_arg(size_index, v_unreg().size()); 67 | 68 | std::size_t global_work_size[1] = {TILE_DIM}; 69 | std::size_t local_work_size[1] = {TILE_DIM}; 70 | queue.enqueue_nd_range_kernel(kernel, 1,nullptr, global_work_size, local_work_size); 71 | boost::compute::copy_n(device_result.begin(), 1, &value, queue); 72 | } 73 | 74 | 75 | }} 76 | #endif 77 |
-------------------------------------------------------------------------------- /include/remora/kernels/opencl/vector_max.hpp: -------------------------------------------------------------------------------- 1 | /*! 2 | * 3 | * 4 | * \brief OpenCL kernel computing the index of the maximal element of a vector 5 | * 6 | * \author O. Krause 7 | * \date 2016 8 | * 9 | * 10 | * \par Copyright 1995-2015 Shark Development Team 11 | * 12 | *

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_OPENCL_VECTOR_MAX_HPP
31 | #define REMORA_KERNELS_OPENCL_VECTOR_MAX_HPP
32 | 
33 | #include "../../detail/traits.hpp"
34 | #include "../../expression_types.hpp"
35 | namespace remora {namespace bindings{
36 | 
37 | template<class E>
38 | std::size_t vector_max(vector_expression<E, opencl_tag> const& v_unreg, dense_tag) {
39 | 	if(v_unreg().size() == 0) return 0;
40 | 	auto& queue = v_unreg().queue();
41 | 	typedef typename E::value_type value_type;
42 | 	opencl::detail::meta_kernel k("blas_vector_max");
43 | 	std::size_t size_index = k.add_arg<std::size_t>("size");
44 | 	auto v = k.register_args(v_unreg().elements());
45 | 
46 | 	boost::compute::array<cl_uint, 1> device_result;
47 | 	auto exprMax = k.expr<value_type>("maximum[get_local_id(0)]");
48 | 	k << "__local " << k.decl<value_type>("maximum") << "[TILE_DIM];\n";
49 | 	k << "__local uint maximum_index[TILE_DIM];\n";
50 | 	k << exprMax << " = " << v(k.expr<cl_uint>("min(size-1,get_local_id(0))")) << ";\n";
51 | 	k << "maximum_index[get_local_id(0)] = get_local_id(0);\n";
52 | 	k << "for(uint i = TILE_DIM + get_local_id(0); i < size; i += TILE_DIM){\n";
53 | 	k << "    if( " << exprMax << '<' << v(k.expr<cl_uint>("i")) << "){\n ";
54 | 	k << exprMax << '=' << v(k.expr<cl_uint>("i")) << ";\n";
55 | 	k << "        maximum_index[get_local_id(0)] = i;\n";
56 | 	k << "    }\n";
57 | 	k << "}\n";
58 | 	k << "barrier(CLK_LOCAL_MEM_FENCE);\n";//wait until all threads are done with computing
59 | 	//find the maximum of the per-thread subresults
60 | 	k << "if(get_local_id(0) == 0){\n";
61 | 	k << "    for(uint i = 1 ; i < min((uint)size,(uint)TILE_DIM); ++i){\n";
62 | 	k << "        if( " << exprMax << '<' << k.expr<value_type>("maximum[i]") << "){\n";
63 | 	k << "            maximum_index[0] = maximum_index[i];\n";
64 | 	k << "            maximum[0] = maximum[i];\n";
65 | 	k << "        }\n";
66 | 	k << "    }\n";
67 | 	k << device_result.begin()[0] << "= maximum_index[0];\n";
68 | 	k << "}\n";
69 | 
70 | 	std::size_t TILE_DIM = 32;
71 | 	boost::compute::kernel kernel = k.compile(queue.get_context(), "-DTILE_DIM=32");
72 | 	kernel.set_arg(size_index, v_unreg().size());
73 | 
74 | 	std::size_t global_work_size[1] = {TILE_DIM};
75 | 	std::size_t local_work_size[1] = {TILE_DIM};
76 | 	queue.enqueue_nd_range_kernel(kernel, 1, nullptr, global_work_size, local_work_size);
77 | 	cl_uint result = 0;
78 | 	boost::compute::copy_n(device_result.begin(), 1, &result, queue);
79 | 	return result;
80 | }
81 | 
82 | 
83 | }}
84 | #endif
--------------------------------------------------------------------------------
/include/remora/kernels/potrf.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Dispatches the POTRF algorithm
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2014 Shark Development Team
11 | *
12 | * <BR>
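*
* \par Example
* A minimal usage sketch, for illustration only. It assumes a dense
* remora::matrix<double> and that remora::lower is the library's
* lower-triangular tag; the matrix built here is made diagonally
* dominant so that it is symmetric positive definite.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/potrf.hpp>
*
* remora::matrix<double> A(3, 3, 0.1);                // symmetric: all off-diagonals 0.1
* for(std::size_t i = 0; i != 3; ++i) A(i, i) = 2.0;  // diagonally dominant => s.p.d.
* std::size_t info = remora::kernels::potrf<remora::lower>(A);
* // info == 0 on success; the lower triangle of A now holds the factor L
* \endcode
*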

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_POTRF_HPP
32 | #define REMORA_KERNELS_POTRF_HPP
33 | 
34 | #include <type_traits>
35 | #ifdef REMORA_USE_ATLAS_LAPACK
36 | #include "atlas/potrf.hpp"
37 | #else
38 | 
39 | // if no bindings are included, we have to provide the default has_optimized_potrf
40 | // otherwise the binding will take care of this
41 | namespace remora {namespace bindings {
42 | template<class M>
43 | struct has_optimized_potrf
44 | 	: public std::false_type {};
45 | }}
46 | #endif
47 | 
48 | #include "default/potrf.hpp"
49 | 
50 | namespace remora {namespace kernels {
51 | 
52 | ///\brief Implements the POsitive definite TRiangular matrix Factorisation (POTRF).
53 | ///
54 | /// It is better known as the Cholesky decomposition for dense matrices.
55 | /// The algorithm works in place and does not require additional memory.
56 | template<class Triangular, class MatA>
57 | std::size_t potrf(
58 | 	matrix_container<MatA, cpu_tag>& A
59 | ) {
60 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
61 | 	return bindings::potrf<Triangular>(A, typename bindings::has_optimized_potrf<MatA>::type());
62 | }
63 | 
64 | }}
65 | 
66 | #ifdef REMORA_USE_OPENCL
67 | #include "opencl/potrf.hpp"
68 | #endif
69 | 
70 | #if defined(__HCC__) || defined(__NVCC__)
71 | #include "hip/potrf.hpp"
72 | #endif
73 | 
74 | 
75 | #endif
76 | 
--------------------------------------------------------------------------------
/include/remora/kernels/pstrf.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Dispatches the PSTRF algorithm
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2014 Shark Development Team
11 | *
12 | * <BR>
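*
* \par Example
* A usage sketch, for illustration only; it assumes remora::lower is the
* triangular tag and that the pivot vector is a remora::permutation_matrix
* as declared in permutation.hpp (its constructor argument is an assumption).
* \code
* #include <remora/dense.hpp>
* #include <remora/permutation.hpp>
* #include <remora/kernels/pstrf.hpp>
*
* remora::matrix<double> A(4, 4, 1.0);  // all-ones matrix: symmetric, p.s.d., rank 1
* remora::permutation_matrix P(4);
* std::size_t rank = remora::kernels::pstrf<remora::lower>(A, P); // rank == 1
* // the first column of A holds the nonzero part of L, P the pivots
* \endcode
*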

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_PSTRF_HPP
32 | #define REMORA_KERNELS_PSTRF_HPP
33 | 
34 | #include "default/pstrf.hpp"
35 | 
36 | namespace remora {
37 | namespace kernels {
38 | 
39 | /*!
40 | * \brief Cholesky decomposition with full pivoting performed in place.
41 | *
42 | * Given an \f$ m \times m \f$ symmetric positive semi-definite matrix
43 | * \f$A\f$, computes the matrix \f$L\f$ and permutation matrix \f$P\f$ such that
44 | * \f$P^TAP = LL^T \f$. If the matrix A has rank(A) = k, the first k columns of A hold the full
45 | * decomposition, while the rest of the matrix is zero.
46 | * This method is slower than the Cholesky decomposition without pivoting but numerically more
47 | * stable. The diagonal elements are ordered such that i > j => L(i,i) >= L(j,j).
48 | *
49 | * The implementation used here is described in the working paper
50 | * "LAPACK-Style Codes for Level 2 and 3 Pivoted Cholesky Factorizations"
51 | * http://www.netlib.org/lapack/lawnspdf/lawn161.pdf
52 | *
53 | * The computation is carried out in place; this means A is destroyed and replaced by L.
54 | *
55 | *
56 | * \param A \f$ m \times m \f$ matrix, which must be symmetric and positive semi-definite. It is replaced by L in the end.
57 | * \param P The pivoting vector of dimension \f$ m \f$
58 | * \return The rank of the matrix A
59 | */
60 | template<class Triangular, class MatA, class VecP>
61 | std::size_t pstrf(
62 | 	matrix_expression<MatA, cpu_tag>& A,
63 | 	vector_expression<VecP, cpu_tag>& P
64 | ){
65 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
66 | 	REMORA_SIZE_CHECK(P().size() == A().size1());
67 | 	return bindings::pstrf(A, P, Triangular());
68 | }
69 | 
70 | 
71 | }}
72 | 
73 | #endif
74 | 
--------------------------------------------------------------------------------
/include/remora/kernels/random.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Generation of random variates
5 | *
6 | * \author O. Krause
7 | * \date 2017
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
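*
* \par Example
* A usage sketch, for illustration only; any standard random engine such
* as std::mt19937 is assumed to satisfy the Rng parameter.
* \code
* #include <random>
* #include <remora/dense.hpp>
* #include <remora/kernels/random.hpp>
*
* std::mt19937 rng(42);
* remora::vector<double> v(100);
* remora::kernels::generate_normal(v, rng, 0.0, 1.0);   // mean 0, variance 1
* remora::kernels::generate_uniform(v, rng, -1.0, 1.0); // draws in [-1,1]
* \endcode
*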

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_RANDOM_HPP
31 | #define REMORA_KERNELS_RANDOM_HPP
32 | 
33 | #include "default/random.hpp"
34 | #ifdef REMORA_USE_OPENCL
35 | #include "opencl/random.hpp"
36 | #endif
37 | #if defined(__HCC__) || defined(__NVCC__)
38 | #include "hip/random.hpp"
39 | #endif
40 | 
41 | 
42 | namespace remora{namespace kernels{
43 | 
44 | template<class V, class Rng, class Device>
45 | void generate_normal(
46 | 	vector_expression<V, Device>& v,
47 | 	Rng& rng,
48 | 	typename V::value_type mean,
49 | 	typename V::value_type variance
50 | ) {
51 | 	bindings::generate_normal(v, rng, mean, variance);
52 | }
53 | 
54 | template<class M, class Rng, class Device>
55 | void generate_normal(
56 | 	matrix_expression<M, Device>& m,
57 | 	Rng& rng,
58 | 	typename M::value_type mean,
59 | 	typename M::value_type variance
60 | ) {
61 | 	bindings::generate_normal(m, rng, mean, variance);
62 | }
63 | 
64 | template<class V, class Rng, class Device>
65 | void generate_uniform(
66 | 	vector_expression<V, Device>& v,
67 | 	Rng& rng,
68 | 	typename V::value_type low,
69 | 	typename V::value_type high
70 | ) {
71 | 	bindings::generate_uniform(v, rng, low, high);
72 | }
73 | 
74 | template<class M, class Rng, class Device>
75 | void generate_uniform(
76 | 	matrix_expression<M, Device>& m,
77 | 	Rng& rng,
78 | 	typename M::value_type low,
79 | 	typename M::value_type high
80 | ) {
81 | 	bindings::generate_uniform(m, rng, low, high);
82 | }
83 | 
84 | template<class V, class Rng, class Device>
85 | void generate_discrete(
86 | 	vector_expression<V, Device>& v,
87 | 	Rng& rng,
88 | 	int low,
89 | 	int high
90 | ) {
91 | 	bindings::generate_discrete(v, rng, low, high);
92 | }
93 | 
94 | template<class M, class Rng, class Device>
95 | void generate_discrete(
96 | 	matrix_expression<M, Device>& m,
97 | 	Rng& rng,
98 | 	int low,
99 | 	int high
100 | ) {
101 | 	bindings::generate_discrete(m, rng, low, high);
102 | }
103 | 
104 | }}
105 | #endif
--------------------------------------------------------------------------------
/include/remora/kernels/syev.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Symmetric eigenvalue decomposition
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
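*
* \par Example
* A usage sketch, for illustration only, on a small symmetric matrix built
* to be diagonally dominant (a dense remora::matrix<double> is assumed).
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/syev.hpp>
*
* remora::matrix<double> A(3, 3, 0.5);               // symmetric off-diagonals
* for(std::size_t i = 0; i != 3; ++i) A(i, i) = 2.0;
* remora::vector<double> lambda(3);
* remora::kernels::syev(A, lambda); // A now holds the eigenvectors Q, lambda the eigenvalues
* \endcode
*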

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_SYEV_HPP
31 | #define REMORA_KERNELS_SYEV_HPP
32 | 
33 | 
34 | #ifdef REMORA_USE_LAPACK
35 | #include "lapack/syev.hpp"
36 | #else
37 | #include "default/syev.hpp"
38 | #endif
39 | 
40 | namespace remora{ namespace kernels{
41 | 
42 | ///\brief Well known SYmmetric EigenValue function (SYEV).
43 | ///
44 | /// A given matrix A is decomposed as
45 | /// A=QDQ^T
46 | /// where Q is an orthogonal (or unitary) matrix with QQ^T=Q^TQ=I and D is a diagonal matrix
47 | /// holding the eigenvalues of A. As A is symmetric, only its lower part is accessed for reading.
48 | /// In the end, the whole matrix contains the eigenvectors of A and thus
49 | /// A is replaced by Q.
50 | /// Additionally the eigenvalues are stored in the second argument.
51 | template<class MatA, class VecEig>
52 | void syev(
53 | 	matrix_expression<MatA, cpu_tag>& matA,
54 | 	vector_expression<VecEig, cpu_tag>& eigenValues
55 | ) {
56 | 	bindings::syev(matA, eigenValues);
57 | }
58 | 
59 | 
60 | }}
61 | #endif
--------------------------------------------------------------------------------
/include/remora/kernels/syrk.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief matrix-matrix multiplication kernel for symmetric Rank-K updates
5 | *
6 | * \author O. Krause
7 | * \date 2016
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
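*
* \par Example
* A usage sketch, for illustration only; Upper=false is assumed to select
* the lower triangular part of M, matching the bool flag of this header.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/syrk.hpp>
*
* remora::matrix<double> A(3, 2, 1.0); // 3x2, all ones
* remora::matrix<double> M(3, 3, 0.0);
* remora::kernels::syrk<false>(A, M, 2.0); // lower triangle of M += 2 * A A^T
* \endcode
*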

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_SYRK_HPP
32 | #define REMORA_KERNELS_SYRK_HPP
33 | 
34 | #include "default/syrk.hpp"
35 | 
36 | #ifdef REMORA_USE_CBLAS
37 | #include "cblas/syrk.hpp"
38 | #else
39 | //if no bindings are included, we have to provide the default has_optimized_syrk otherwise the binding will take care of this
40 | namespace remora{ namespace bindings{
41 | template<class M, class E>
42 | struct has_optimized_syrk
43 | 	: public std::false_type{};
44 | }}
45 | #endif
46 | 
47 | namespace remora{namespace kernels{
48 | 
49 | ///\brief Well known SYmmetric Rank-K update kernel M+=alpha*A*A^T.
50 | ///
51 | /// Note that it assumes M to be symmetric and it will only touch the upper or lower triangular area.
52 | /// If bindings are included and the matrix combination allows for a specific binding
53 | /// to be applied, the binding is called automatically from {binding}/syrk.h;
54 | /// otherwise default/syrk.h is used.
55 | template<bool Upper, class M, class E>
56 | void syrk(
57 | 	matrix_expression<E, cpu_tag> const& e,
58 | 	matrix_expression<M, cpu_tag>& m,
59 | 	typename M::value_type alpha
60 | ) {
61 | 	REMORA_SIZE_CHECK(m().size1() == m().size2());
62 | 	REMORA_SIZE_CHECK(m().size1() == e().size1());
63 | 
64 | 	bindings::syrk<Upper>(e, m, alpha,
65 | 		typename bindings::has_optimized_syrk<M, E>::type()
66 | 	);
67 | }
68 | 
69 | }}
70 | 
71 | #ifdef REMORA_USE_CLBLAST
72 | #include "clBlast/syrk.hpp"
73 | #elif defined REMORA_USE_OPENCL
74 | #include "opencl/syrk.hpp"
75 | #endif
76 | #if defined(__HCC__) || defined(__NVCC__)
77 | #include "hip/syrk.hpp"
78 | #endif
79 | 
80 | #endif
81 | 
--------------------------------------------------------------------------------
/include/remora/kernels/tpmv.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular packed matrix-vector multiplication
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
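*
* \par Example
* A usage sketch, for illustration only; the template arguments of
* remora::triangular_matrix shown here are an assumption about the packed
* triangular type from triangular_matrix.hpp.
* \code
* #include <remora/dense.hpp>
* #include <remora/triangular_matrix.hpp>
* #include <remora/kernels/tpmv.hpp>
*
* remora::triangular_matrix<double, remora::row_major, remora::lower> A(3);
* remora::vector<double> b(3, 1.0);
* remora::kernels::tpmv(A, b); // b = A*b using the packed storage of A
* \endcode
*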

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TPMV_HPP
32 | #define REMORA_KERNELS_TPMV_HPP
33 | 
34 | #ifdef REMORA_USE_CBLAS
35 | #include "cblas/tpmv.hpp"
36 | #else
37 | // if no bindings are included, we have to provide the default has_optimized_tpmv
38 | // otherwise the binding will take care of this
39 | namespace remora{ namespace bindings{
40 | template<class M, class V>
41 | struct has_optimized_tpmv
42 | 	: public std::false_type{};
43 | }}
44 | #endif
45 | 
46 | #include "default/tpmv.hpp"
47 | 
48 | namespace remora{namespace kernels{
49 | 
50 | ///\brief Implements the TRiangular Packed Matrix-Vector multiplication (TPMV).
51 | ///
52 | /// It computes b=A*b where A is a lower or upper packed triangular matrix.
53 | template<class MatA, class VecB>
54 | void tpmv(
55 | 	matrix_expression<MatA, cpu_tag> const& A,
56 | 	vector_expression<VecB, cpu_tag>& b
57 | ){
58 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
59 | 	REMORA_SIZE_CHECK(A().size1() == b().size());
60 | 
61 | 	bindings::tpmv(A, b, typename bindings::has_optimized_tpmv<MatA, VecB>::type());
62 | }
63 | 
64 | }}
65 | 
66 | #endif
67 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trmm.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular Matrix-Matrix multiplication kernel
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2014 Shark Development Team
11 | *
12 | * <BR>
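*
* \par Example
* A usage sketch, for illustration only; the two bool flags are
* <Upper, Unit> as in this header, so <false, false> means lower
* triangular with a non-unit diagonal.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trmm.hpp>
*
* remora::matrix<double> A(3, 3, 0.0);
* for(std::size_t i = 0; i != 3; ++i)
* 	for(std::size_t j = 0; j <= i; ++j) A(i, j) = 1.0; // fill the lower triangle
* remora::matrix<double> B(3, 4, 1.0);
* remora::kernels::trmm<false, false>(A, B); // B = A*B in place
* \endcode
*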

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRMM_HPP
32 | #define REMORA_KERNELS_TRMM_HPP
33 | 
34 | #ifdef REMORA_USE_CBLAS
35 | #include "cblas/trmm.hpp"
36 | #else
37 | // if no bindings are included, we have to provide the default has_optimized_trmm
38 | // otherwise the binding will take care of this
39 | namespace remora{ namespace bindings{
40 | template<class M1, class M2>
41 | struct has_optimized_trmm
42 | 	: public std::false_type{};
43 | }}
44 | #endif
45 | 
46 | #include "default/trmm.hpp"
47 | 
48 | namespace remora{namespace kernels{
49 | 
50 | ///\brief Implements the TRiangular Matrix Matrix multiply.
51 | ///
52 | /// It computes B=A*B in place, where A is a triangular matrix and B a dense matrix
53 | template<bool Upper, bool Unit, class MatA, class MatB>
54 | void trmm(
55 | 	matrix_expression<MatA, cpu_tag> const& A,
56 | 	matrix_expression<MatB, cpu_tag>& B
57 | ){
58 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
59 | 	REMORA_SIZE_CHECK(A().size1() == B().size1());
60 | 
61 | 	bindings::trmm<Upper, Unit>(A, B, typename bindings::has_optimized_trmm<MatA, MatB>::type());
62 | }
63 | 
64 | }}
65 | 
66 | #ifdef REMORA_USE_CLBLAST
67 | #include "clBlast/trmm.hpp"
68 | #elif defined REMORA_USE_OPENCL
69 | #include "opencl/trmm.hpp"
70 | #endif
71 | #if defined(__HCC__) || defined(__NVCC__)
72 | #include "hip/trmm.hpp"
73 | #endif
74 | 
75 | #endif
76 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trmv.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular matrix-vector multiplication kernel
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
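*
* \par Example
* A usage sketch, for illustration only; only the lower triangle of A is
* read when Upper=false, and Unit=false means the diagonal is used as stored.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trmv.hpp>
*
* remora::matrix<double> A(3, 3, 1.0); // only its lower part is accessed
* remora::vector<double> b(3, 1.0);
* remora::kernels::trmv<false, false>(A, b); // b = A*b in place
* \endcode
*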

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRMV_HPP
32 | #define REMORA_KERNELS_TRMV_HPP
33 | 
34 | #ifdef REMORA_USE_CBLAS
35 | #include "cblas/trmv.hpp"
36 | #else
37 | // if no bindings are included, we have to provide the default has_optimized_trmv
38 | // otherwise the binding will take care of this
39 | namespace remora{ namespace bindings{
40 | template<class M, class V>
41 | struct has_optimized_trmv
42 | 	: public std::false_type{};
43 | }}
44 | #endif
45 | 
46 | #include "default/trmv.hpp"
47 | 
48 | namespace remora{namespace kernels{
49 | 
50 | ///\brief Implements the TRiangular Matrix-Vector multiplication (TRMV).
51 | ///
52 | /// It computes b=A*b in place, where A is a square lower or upper triangular matrix.
53 | /// It can optionally assume that the diagonal is 1 and won't access the diagonal elements.
54 | template<bool Upper, bool Unit, class MatA, class VecB>
55 | void trmv(
56 | 	matrix_expression<MatA, cpu_tag> const& A,
57 | 	vector_expression<VecB, cpu_tag>& b
58 | ){
59 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
60 | 	REMORA_SIZE_CHECK(A().size1() == b().size());
61 | 
62 | 	bindings::trmv<Upper, Unit>(A, b, typename bindings::has_optimized_trmv<MatA, VecB>::type());
63 | }
64 | 
65 | }}
66 | 
67 | #ifdef REMORA_USE_CLBLAST
68 | #include "clBlast/trmv.hpp"
69 | #elif defined REMORA_USE_OPENCL
70 | #include "opencl/trmv.hpp"
71 | #endif
72 | #if defined(__HCC__) || defined(__NVCC__)
73 | #include "hip/trmv.hpp"
74 | #endif
75 | #endif
76 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trsm.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular solve kernel for matrix expressions.
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
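*
* \par Example
* A usage sketch, for illustration only; remora::lower and remora::left
* are assumed to be the triangular and side tags used by this kernel.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trsm.hpp>
*
* remora::matrix<double> A(3, 3, 0.0);
* for(std::size_t i = 0; i != 3; ++i){
* 	for(std::size_t j = 0; j != i; ++j) A(i, j) = 0.5; // lower triangle
* 	A(i, i) = 2.0;                                     // nonzero diagonal
* }
* remora::matrix<double> B(3, 2, 1.0);
* remora::kernels::trsm<remora::lower, remora::left>(A, B); // B := A^-1 B in place
* \endcode
*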

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRSM_HPP
32 | #define REMORA_KERNELS_TRSM_HPP
33 | 
34 | #include <type_traits> //std::false_type marker for unoptimized
35 | #ifdef REMORA_USE_CBLAS
36 | #include "cblas/trsm.hpp"
37 | #else
38 | // if no bindings are included, we have to provide the default has_optimized_trsm
39 | // otherwise the binding will take care of this
40 | namespace remora{ namespace bindings{
41 | template<class M1, class M2>
42 | struct has_optimized_trsm
43 | 	: public std::false_type{};
44 | }}
45 | #endif
46 | 
47 | #include "default/trsm.hpp"
48 | 
49 | namespace remora{namespace kernels{
50 | 
51 | ///\brief Implements the TRiangular Solver for Matrices (TRSM).
52 | ///
53 | /// It solves systems of the form AX = B (Side = left) or XA = B (Side = right), where A is a
54 | /// square lower or upper triangular matrix; it can optionally assume a unit diagonal, in which case the diagonal elements are not accessed.
55 | template<class Triangular, class Side, class MatA, class MatB>
56 | void trsm(
57 | 	matrix_expression<MatA, cpu_tag> const& A,
58 | 	matrix_expression<MatB, cpu_tag>& B
59 | ){
60 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
61 | 	REMORA_SIZE_CHECK(!Side::is_left || A().size2() == B().size1());
62 | 	REMORA_SIZE_CHECK(Side::is_left || A().size2() == B().size2());
63 | 
64 | 	bindings::trsm<Triangular, Side>(A, B, typename bindings::has_optimized_trsm<MatA, MatB>::type());
65 | }
66 | 
67 | }}
68 | 
69 | #ifdef REMORA_USE_CLBLAST
70 | #include "clBlast/trsm.hpp"
71 | #elif defined REMORA_USE_OPENCL
72 | #include "opencl/trsm.hpp"
73 | #endif
74 | #if defined(__HCC__) || defined(__NVCC__)
75 | #include "hip/trsm.hpp"
76 | #endif
77 | 
78 | #endif
79 | 
--------------------------------------------------------------------------------
/include/remora/kernels/trsv.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Triangular solve kernel for vector expressions.
5 | *
6 | * \author O. Krause
7 | * \date 2012
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
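*
* \par Example
* A usage sketch, for illustration only; remora::lower and remora::left
* are assumed to be the triangular and side tags used by this kernel.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/trsv.hpp>
*
* remora::matrix<double> A(2, 2, 0.0);
* A(0, 0) = 2.0; A(1, 0) = 1.0; A(1, 1) = 4.0; // lower triangular
* remora::vector<double> b(2, 1.0);
* remora::kernels::trsv<remora::lower, remora::left>(A, b); // b := A^-1 b in place
* \endcode
*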

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | 
31 | #ifndef REMORA_KERNELS_TRSV_HPP
32 | #define REMORA_KERNELS_TRSV_HPP
33 | 
34 | #include <type_traits>
35 | #ifdef REMORA_USE_CBLAS
36 | #include "cblas/trsv.hpp"
37 | #else
38 | // if no bindings are included, we have to provide the default has_optimized_trsv
39 | // otherwise the binding will take care of this
40 | namespace remora {namespace bindings{
41 | template<class M, class V>
42 | struct has_optimized_trsv
43 | 	: public std::false_type{};
44 | }}
45 | #endif
46 | 
47 | #include "default/trsv.hpp"
48 | 
49 | namespace remora{namespace kernels{
50 | 
51 | ///\brief Implements the TRiangular Solver for Vectors (TRSV).
52 | ///
53 | /// It solves systems of the form Ax = b where A is a square lower or upper triangular matrix.
54 | /// It can optionally assume that the diagonal is 1 and won't access the diagonal elements.
55 | template<class Triangular, class Side, class MatA, class VecB>
56 | void trsv(
57 | 	matrix_expression<MatA, cpu_tag> const& A,
58 | 	vector_expression<VecB, cpu_tag>& b
59 | ){
60 | 	REMORA_SIZE_CHECK(A().size1() == A().size2());
61 | 	REMORA_SIZE_CHECK(A().size1() == b().size());
62 | 
63 | 	bindings::trsv<Triangular, Side>(A, b, typename bindings::has_optimized_trsv<MatA, VecB>::type());
64 | }
65 | 
66 | }}
67 | 
68 | #ifdef REMORA_USE_CLBLAST
69 | #include "clBlast/trsv.hpp"
70 | #elif defined REMORA_USE_OPENCL
71 | #include "opencl/trsv.hpp"
72 | #endif
73 | #if defined(__HCC__) || defined(__NVCC__)
74 | #include "hip/trsv.hpp"
75 | #endif
76 | 
77 | #endif
78 | 
--------------------------------------------------------------------------------
/include/remora/kernels/vector_assign.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | * \brief Assignment kernels for vector expressions
3 | *
4 | * \author O. Krause
5 | * \date 2015
6 | *
7 | *
8 | * \par Copyright 1995-2015 Shark Development Team
9 | *
10 | * <BR>
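*
* \par Example
* A usage sketch of the scalar-assignment dispatcher, for illustration
* only; the functor name device_traits<cpu_tag>::add<double> is an
* assumption about remora's device functor naming.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/vector_assign.hpp>
*
* remora::vector<double> v(10, 1.0);
* // elementwise v_i = v_i + 2.0, dispatched by storage tag and device:
* remora::kernels::assign<remora::device_traits<remora::cpu_tag>::add<double> >(v, 2.0);
* \endcode
*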

11 | * This file is part of Shark.
12 | *
13 | *
14 | * Shark is free software: you can redistribute it and/or modify
15 | * it under the terms of the GNU Lesser General Public License as published
16 | * by the Free Software Foundation, either version 3 of the License, or
17 | * (at your option) any later version.
18 | *
19 | * Shark is distributed in the hope that it will be useful,
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU Lesser General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU Lesser General Public License
25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26 | *
27 | */
28 | #ifndef REMORA_KERNELS_VECTOR_ASSIGN_HPP
29 | #define REMORA_KERNELS_VECTOR_ASSIGN_HPP
30 | 
31 | #include "../detail/traits.hpp"
32 | #include "default/vector_assign.hpp"
33 | #ifdef REMORA_USE_OPENCL
34 | #include "opencl/vector_assign.hpp"
35 | #endif
36 | #if defined(__HCC__) || defined(__NVCC__)
37 | #include "hip/vector_assign.hpp"
38 | #endif
39 | 
40 | 
41 | namespace remora{namespace kernels {
42 | 
43 | 
44 | template<class F, class V, class Device>
45 | void apply(vector_expression<V, Device>& v, F const& f) {
46 | 	bindings::apply(v, f);
47 | }
48 | template<class F, class V, class Device>
49 | void assign(vector_expression<V, Device>& v, typename V::value_type t) {
50 | 	bindings::assign<F>(v, t);
51 | }
52 | 
53 | /////////////////////////////////////////////////////////
54 | //direct assignment of two vectors
55 | ////////////////////////////////////////////////////////
56 | 
57 | //dispatcher
58 | template< class V, class E, class Device>
59 | void assign(vector_expression<V, Device>& v, vector_expression<E, Device> const& e) {
60 | 	REMORA_SIZE_CHECK(v().size() == e().size());
61 | 	typedef typename V::evaluation_category::tag TagV;
62 | 	typedef typename E::evaluation_category::tag TagE;
63 | 	bindings::vector_assign(v, e, TagV(), TagE());
64 | }
65 | 
66 | ////////////////////////////////////////////
67 | //assignment with functor
68 | ////////////////////////////////////////////
69 | 
70 | 
71 | // Dispatcher
72 | template<class V, class E, class F, class Device>
73 | void assign(vector_expression<V, Device>& v, vector_expression<E, Device> const& e, F f) {
74 | 	REMORA_SIZE_CHECK(v().size() == e().size());
75 | 	typedef typename V::evaluation_category::tag TagV;
76 | 	typedef typename E::evaluation_category::tag TagE;
77 | 	bindings::vector_assign_functor(v(), e(), f, TagV(), TagE());
78 | }
79 | 
80 | }}
81 | #endif
82 | 
--------------------------------------------------------------------------------
/include/remora/kernels/vector_fold.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | * \brief Algorithm to reduce a vector to a scalar value
3 | *
4 | * \author O. Krause
5 | * \date 2016
6 | *
7 | *
8 | * \par Copyright 1995-2015 Shark Development Team
9 | *
10 | * <BR>
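*
* \par Example
* A usage sketch, for illustration only; device_traits<cpu_tag>::add<double>
* is an assumption about the addition functor, and F::result_type must
* match the type of the accumulator passed in.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/vector_fold.hpp>
*
* remora::vector<double> v(5, 2.0);
* double sum = 0.0; // initial value of the fold
* remora::kernels::vector_fold<remora::device_traits<remora::cpu_tag>::add<double> >(v, sum);
* // sum == 10.0
* \endcode
*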

11 | * This file is part of Shark.
12 | *
13 | *
14 | * Shark is free software: you can redistribute it and/or modify
15 | * it under the terms of the GNU Lesser General Public License as published
16 | * by the Free Software Foundation, either version 3 of the License, or
17 | * (at your option) any later version.
18 | *
19 | * Shark is distributed in the hope that it will be useful,
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 | * GNU Lesser General Public License for more details.
23 | *
24 | * You should have received a copy of the GNU Lesser General Public License
25 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
26 | *
27 | */
28 | #ifndef REMORA_KERNELS_VECTOR_FOLD_HPP
29 | #define REMORA_KERNELS_VECTOR_FOLD_HPP
30 | 
31 | #include "../detail/traits.hpp"
32 | #include "default/vector_fold.hpp"
33 | #ifdef REMORA_USE_OPENCL
34 | #include "opencl/vector_fold.hpp"
35 | #endif
36 | #if defined(__HCC__) || defined(__NVCC__)
37 | #include "hip/vector_fold.hpp"
38 | #endif
39 | 
40 | namespace remora{namespace kernels {
41 | 
42 | 
43 | ///\brief Applies F in any order to the elements of v and a given initial value.
44 | ///
45 | /// The result is the same as value = f(v_1, f(v_2, ... f(v_n, value))) assuming f is commutative
46 | /// and associative.
47 | template<class F, class V, class Device>
48 | void vector_fold(vector_expression<V, Device> const& v, typename F::result_type& value) {
49 | 	typedef typename V::evaluation_category::tag TagV;
50 | 	bindings::vector_fold<F>(v(), value, TagV());
51 | }
52 | 
53 | }}
54 | #endif
55 | 
--------------------------------------------------------------------------------
/include/remora/kernels/vector_max.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief Kernel for calculating the index of the maximum element of a vector
5 | *
6 | * \author O. Krause
7 | * \date 2016
8 | *
9 | *
10 | * \par Copyright 1995-2015 Shark Development Team
11 | *
12 | * <BR>
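*
* \par Example
* A usage sketch, for illustration only, on a dense cpu vector.
* \code
* #include <remora/dense.hpp>
* #include <remora/kernels/vector_max.hpp>
*
* remora::vector<double> v(4, 0.0);
* v(2) = 3.0;
* std::size_t imax = remora::kernels::vector_max(v); // imax == 2
* \endcode
*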

13 | * This file is part of Shark.
14 | *
15 | *
16 | * Shark is free software: you can redistribute it and/or modify
17 | * it under the terms of the GNU Lesser General Public License as published
18 | * by the Free Software Foundation, either version 3 of the License, or
19 | * (at your option) any later version.
20 | *
21 | * Shark is distributed in the hope that it will be useful,
22 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
23 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 | * GNU Lesser General Public License for more details.
25 | *
26 | * You should have received a copy of the GNU Lesser General Public License
27 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
28 | *
29 | */
30 | #ifndef REMORA_KERNELS_VECTOR_MAX_HPP
31 | #define REMORA_KERNELS_VECTOR_MAX_HPP
32 | 
33 | #include "default/vector_max.hpp"
34 | #ifdef REMORA_USE_OPENCL
35 | #include "opencl/vector_max.hpp"
36 | #endif
37 | #if defined(__HCC__) || defined(__NVCC__)
38 | #include "hip/vector_max.hpp"
39 | #endif
40 | 
41 | namespace remora { namespace kernels{
42 | 
43 | ///\brief Computes the index of the maximum element of a vector
44 | template<class E, class Device>
45 | std::size_t vector_max(
46 | 	vector_expression<E, Device> const& e
47 | ) {
48 | 	REMORA_SIZE_CHECK(e().size() != 0);
49 | 	return bindings::vector_max(e, typename E::evaluation_category::tag());
50 | }
51 | 
52 | }}
53 | #endif
--------------------------------------------------------------------------------
/include/remora/remora.hpp:
--------------------------------------------------------------------------------
1 | /*!
2 | *
3 | *
4 | * \brief includes all Remora files needed by Shark linear algebra
5 | *
6 | *
7 | * \author O. Krause
8 | * \date 2012
9 | *
10 | *
11 | * \par Copyright 1995-2015 Shark Development Team
12 | *
13 | * <BR>
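*
* \par Example
* A high-level usage sketch once this aggregate header is included, for
* illustration only; the tag types symm_pos_def and left passed to solve()
* are assumptions about the solver interface in solve.hpp.
* \code
* #include <remora/remora.hpp>
*
* remora::matrix<double> A(3, 3, 0.1);
* for(std::size_t i = 0; i != 3; ++i) A(i, i) = 2.0; // s.p.d. system matrix
* remora::vector<double> b(3, 1.0);
* remora::vector<double> x = remora::solve(A, b, remora::symm_pos_def(), remora::left());
* \endcode
*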

14 | * This file is part of Shark.
15 | *
16 | *
17 | * Shark is free software: you can redistribute it and/or modify
18 | * it under the terms of the GNU Lesser General Public License as published
19 | * by the Free Software Foundation, either version 3 of the License, or
20 | * (at your option) any later version.
21 | *
22 | * Shark is distributed in the hope that it will be useful,
23 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
24 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 | * GNU Lesser General Public License for more details.
26 | *
27 | * You should have received a copy of the GNU Lesser General Public License
28 | * along with Shark. If not, see <http://www.gnu.org/licenses/>.
29 | *
30 | */
31 | 
32 | #ifndef SHARK_LINALG_BLAS_REMORA_HPP
33 | #define SHARK_LINALG_BLAS_REMORA_HPP
34 | 
35 | //expressions
36 | #include "vector_expression.hpp"
37 | #include "matrix_expression.hpp"
38 | #include "solve.hpp"
39 | //containers
40 | #include "dense.hpp"
41 | #include "sparse.hpp"
42 | 
43 | //misc
44 | #include "permutation.hpp"
45 | #include "io.hpp"
46 | #include "random.hpp"
47 | #include "device_copy.hpp"
48 | 
49 | #endif
50 | 
--------------------------------------------------------------------------------