├── .gitignore ├── LICENSE ├── README.md ├── benchmark ├── Makefile ├── analyze_load_balance_spmv.py ├── bench_bfs.cpp ├── bench_pagerank.cpp ├── bench_spmspv.cpp ├── bench_spmv.cpp ├── bench_sssp.cpp ├── run_bfs.sh ├── run_pagerank.sh ├── run_spmv.sh └── run_sssp.sh ├── generate_bitstream ├── Makefile └── synthesize.cpp ├── graphlily ├── app │ ├── bfs.h │ ├── module_collection.h │ ├── pagerank.h │ └── sssp.h ├── global.h ├── hw │ ├── float_pe.h │ ├── kernel_add_scalar_vector_dense_impl.h │ ├── kernel_assign_vector_dense_impl.h │ ├── kernel_assign_vector_sparse_new_frontier_impl.h │ ├── kernel_assign_vector_sparse_no_new_frontier_impl.h │ ├── kernel_spmspv_impl.h │ ├── kernel_spmv_impl.h │ ├── math_constants.h │ ├── overlay.cpp │ ├── overlay.h │ ├── shuffle.h │ ├── ufixed_pe_fwd.h │ └── util.h ├── io │ ├── data_formatter.h │ └── data_loader.h ├── module │ ├── add_scalar_vector_dense_module.h │ ├── assign_vector_dense_module.h │ ├── assign_vector_sparse_module.h │ ├── base_module.h │ ├── spmspv_module.h │ └── spmv_module.h └── synthesizer │ ├── base_synthesizer.h │ └── overlay_synthesizer.h ├── tests ├── Makefile ├── test_app.cpp ├── test_data │ ├── create_csr.py │ ├── eye_10_csr_float32.npz │ └── line_8_csr_float32.npz ├── test_io.cpp ├── test_module_apply.cpp ├── test_module_spmv_spmspv.cpp ├── test_pe_cluster.cpp ├── test_shuffle.cpp └── testbench │ ├── pe_tb.cpp │ ├── pe_tb.h │ ├── shuffle_tb.cpp │ └── shuffle_tb.h └── xrt └── includes ├── cmdparser ├── cmdlineparser.cpp ├── cmdlineparser.h └── cmdparser.mk ├── logger ├── logger.cpp ├── logger.h └── logger.mk ├── oclHelper ├── oclErrorCodes.cpp ├── oclHelper.cpp ├── oclHelper.h └── oclHelper.mk ├── opencl └── opencl.mk └── xcl2 ├── xcl2.cpp ├── xcl2.hpp └── xcl2.mk /.gitignore: -------------------------------------------------------------------------------- 1 | *.jou 2 | */host 3 | profile_summary.html 4 | profile_summary.csv 5 | xclbin.run_summary 6 | _v++_*/ 7 | .Xil/ 8 | *.ll 9 | *.exe 10 | emconfig.json 11 | 
xmltmp 12 | dltmp* 13 | runtime/ 14 | *.log 15 | xclbin/ 16 | lib/ 17 | *.orig 18 | *_BACKUP_* 19 | *_BASE_* 20 | *_LOCAL_* 21 | *_REMOTE_* 22 | *.swp 23 | *_x.* 24 | *build/ 25 | *build_dir.* 26 | *.run/ 27 | .vscode 28 | data 29 | build* 30 | spmspv 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2021, Cornell Zhang Research Group 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | GraphLily: A Graph Linear Algebra Overlay on HBM-Equipped FPGAs 2 | =============================================================== 3 | 4 | GraphLily is the first FPGA overlay for graph processing. 5 | GraphLily supports a rich set of graph algorithms by adopting the GraphBLAS programming interface, which formulates graph algorithms as sparse linear algebra kernels. 6 | GraphLily effectively utilizes the high bandwidth of HBM to accelerate SpMV and SpMSpV, the two widely-used kernels in GraphBLAS, by co-designing the data layout and the accelerator architecture. 7 | GraphLily further builds a middleware to provide runtime support, enabling users to easily port existing GraphBLAS programs from CPUs/GPUs to FPGAs. 8 | 9 | For more information, refer to our [ICCAD'21 paper](https://www.csl.cornell.edu/~zhiruz/pdfs/graphlily-iccad2021.pdf). 
10 | ``` 11 | @article{hu2021graphlily, 12 | title={GraphLily: Accelerating Graph Linear Algebra on HBM-Equipped FPGAs}, 13 | author={Hu, Yuwei and Du, Yixiao and Ustun, Ecenur and Zhang, Zhiru}, 14 | journal={International Conference On Computer Aided Design}, 15 | year={2021} 16 | } 17 | ``` 18 | 19 | ## Prerequisites 20 | - Platform: Xilinx Alveo U280 21 | - Tool: Xilinx Vitis 2019.2 22 | 23 | ## Run Benchmarking 24 | 25 | ### Clone the repo 26 | ``` 27 | git clone git@github.com:cornell-zhang/GraphLily.git 28 | export GRAPHLILY_ROOT_PATH=/path/to/GraphLily 29 | ``` 30 | 31 | ### Get the bitstream 32 | - A pre-compiled bitstream (166 MHz) is provided [here](https://drive.google.com/file/d/1OGry0OtbvmGiSirhJy3tCPz51VMeV1HM/view?usp=sharing). 33 | - To generate a new bitstream: 34 | ``` 35 | cd GraphLily/generate_bitstream 36 | make synthesize 37 | ``` 38 | 39 | ### Prepare datasets 40 | The input is an adjacency matrix in csr format stored as a scipy npz file. Please install [cnpy](https://github.com/rogersce/cnpy), which is required for data loading. 
41 | 42 | Our ICCAD'21 paper evaluated the following six graph datasets: 43 | 44 | - [googleplus](https://drive.google.com/file/d/1Wv9C7s0lK0KdrRPUsTqjlENvbMMKfykg/view?usp=sharing) 45 | - [ogbl-ppa](https://drive.google.com/file/d/189Qp9h4BxXR8dAiQdmJWkW89y08eU5qR/view?usp=sharing) 46 | - [hollywood](https://drive.google.com/file/d/1irBTVuYdJaMXQTUGQh7AerBjs784ykeO/view?usp=sharing) 47 | - [pokec](https://drive.google.com/file/d/1UEwsIYgNWmm3ucBfatjg_lmG25oXWWI-/view?usp=sharing) 48 | - [ogbn-products](https://drive.google.com/file/d/1yBJjW5aRpJt2if32gOWSmaYcI10KDQj0/view?usp=sharing) 49 | - [orkut](https://drive.google.com/file/d/1Am0hPLhGNAwjYWt5nd_-XsIaKBiWcwqt/view?usp=sharing) 50 | 51 | ### Run 52 | Go to the GraphLily/benchmark folder, modify the cnpy path in Makefile, modify the bitstream path and the datasets path in run_bfs.sh, then: 53 | ``` 54 | bash run_bfs.sh 55 | ``` 56 | -------------------------------------------------------------------------------- /benchmark/Makefile: -------------------------------------------------------------------------------- 1 | HOST_ARCH = x86 2 | 3 | CXXFLAGS += -Wall -O3 -g -std=c++11 4 | CXXFLAGS += -I$(GRAPHLILY_ROOT_PATH) 5 | 6 | LDFLAGS += -lrt -lstdc++ 7 | 8 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/xcl2/xcl2.mk 9 | CXXFLAGS += $(xcl2_CXXFLAGS) 10 | LDFLAGS += $(xcl2_LDFLAGS) 11 | 12 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/opencl/opencl.mk 13 | CXXFLAGS += $(opencl_CXXFLAGS) 14 | LDFLAGS += $(opencl_LDFLAGS) 15 | 16 | CXXFLAGS += -I/work/shared/common/project_build/graphblas/software/cnpy 17 | LDFLAGS += -L/work/shared/common/project_build/graphblas/software/cnpy/build -lcnpy 18 | 19 | BUILD_DIR = ./build 20 | 21 | bench_spmv: bench_spmv.cpp $(xcl2_SRCS) 22 | g++ $(CXXFLAGS) bench_spmv.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 23 | mkdir -p $(BUILD_DIR) 24 | mv bench_spmv $(BUILD_DIR)/ 25 | 26 | bench_spmspv: bench_spmspv.cpp $(xcl2_SRCS) 27 | g++ $(CXXFLAGS) bench_spmspv.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 28 | 
mkdir -p $(BUILD_DIR) 29 | mv bench_spmspv $(BUILD_DIR)/ 30 | 31 | bench_bfs: bench_bfs.cpp $(xcl2_SRCS) 32 | g++ $(CXXFLAGS) bench_bfs.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 33 | mkdir -p $(BUILD_DIR) 34 | mv bench_bfs $(BUILD_DIR)/ 35 | 36 | bench_sssp: bench_sssp.cpp $(xcl2_SRCS) 37 | g++ $(CXXFLAGS) bench_sssp.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 38 | mkdir -p $(BUILD_DIR) 39 | mv bench_sssp $(BUILD_DIR)/ 40 | 41 | bench_pagerank: bench_pagerank.cpp $(xcl2_SRCS) 42 | g++ $(CXXFLAGS) bench_pagerank.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 43 | mkdir -p $(BUILD_DIR) 44 | mv bench_pagerank $(BUILD_DIR)/ 45 | 46 | clean: 47 | rm -rf $(BUILD_DIR) 48 | -------------------------------------------------------------------------------- /benchmark/analyze_load_balance_spmv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse 3 | 4 | num_channels = 16 5 | pack_size = 8 6 | 7 | def calculate_degree_standard_deviation(nnz_each_row): 8 | return nnz_each_row.std() 9 | 10 | def calculate_imbalance_factor(nnz_each_row): 11 | num_PEs = num_channels * pack_size 12 | nnz_each_PE = np.zeros(num_PEs) 13 | step = num_PEs 14 | for i in range(num_PEs): 15 | nnz_each_PE[i] = nnz_each_row[i::step].sum() 16 | return nnz_each_PE.max() / nnz_each_PE.mean() 17 | 18 | path = "/work/shared/common/research/graphblas/data/sparse_matrix_graph/" 19 | datasets = ["gplus_108K_13M_csr_float32.npz", 20 | "ogbl_ppa_576K_42M_csr_float32.npz", 21 | "hollywood_1M_113M_csr_float32.npz", 22 | "pokec_1633K_31M_csr_float32.npz", 23 | "ogbn_products_2M_124M_csr_float32.npz", 24 | "orkut_3M_213M_csr_float32.npz"] 25 | 26 | if __name__ == "__main__": 27 | for dataset in datasets: 28 | csr_matrix = scipy.sparse.load_npz(path + dataset) 29 | nnz_each_row = csr_matrix.indptr[1::] - csr_matrix.indptr[:-1:] 30 | standard_deviation = calculate_degree_standard_deviation(nnz_each_row) 31 | average_degree = csr_matrix.nnz / csr_matrix.shape[0] 32 | 
normalized_standard_deviation = standard_deviation / average_degree 33 | print(dataset) 34 | print("standard_deviation: ", standard_deviation) 35 | print("normalized_standard_deviation: ", normalized_standard_deviation) 36 | print("imbalance_factor: ", calculate_imbalance_factor(nnz_each_row)) 37 | -------------------------------------------------------------------------------- /benchmark/bench_bfs.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context" 3 | #pragma GCC diagnostic ignored "-Wuninitialized" 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 5 | 6 | #include 7 | #include 8 | 9 | #include "graphlily/app/bfs.h" 10 | 11 | 12 | template 13 | void verify(std::vector> &reference_results, 14 | std::vector> &kernel_results) { 15 | if (!(reference_results.size() == kernel_results.size())) { 16 | std::cout << "Size mismatch!" << std::endl; 17 | exit(EXIT_FAILURE); 18 | } 19 | float epsilon = 0.0001; 20 | for (size_t i = 0; i < reference_results.size(); i++) { 21 | if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) { 22 | std::cout << "Error: Result mismatch" 23 | << std::endl; 24 | std::cout << "i = " << i 25 | << " Reference result = " << reference_results[i] 26 | << " Kernel result = " << kernel_results[i] 27 | << std::endl; 28 | exit(EXIT_FAILURE); 29 | } 30 | } 31 | } 32 | 33 | 34 | void bench_bfs(uint32_t num_channels, uint32_t spmv_out_buf_len, 35 | uint32_t spmspv_out_buf_len, uint32_t vec_buf_len, 36 | std::string bitstream, std::string dataset, uint32_t num_iterations) { 37 | graphlily::app::BFS bfs(num_channels, spmv_out_buf_len, spmspv_out_buf_len, vec_buf_len); 38 | bfs.set_target("hw"); 39 | bfs.set_up_runtime(bitstream); 40 | 41 | bool skip_empty_rows = true; 42 | bfs.load_and_format_matrix(dataset, skip_empty_rows); 43 | std::cout << "finished load_and_format_matrix" << std::endl; 44 | 
bfs.send_matrix_host_to_device(); 45 | 46 | uint32_t source = 0; 47 | auto reference_results = bfs.compute_reference_results(source, num_iterations); 48 | 49 | // // Make sure the results make sense, e.g., the starting vertex connects to at least one vertex 50 | // for (int i = 0; i < 10; i++) { 51 | // std::cout << reference_results[i] <(reference_results, kernel_results); 57 | // std::cout << "BFS pull passed" << std::endl; 58 | 59 | uint32_t num_runs = 1; 60 | auto t1 = std::chrono::high_resolution_clock::now(); 61 | for (size_t i = 0; i < num_runs; i++) { 62 | kernel_results = bfs.pull(source, num_iterations); 63 | } 64 | auto t2 = std::chrono::high_resolution_clock::now(); 65 | float average_time_in_sec = float(std::chrono::duration_cast(t2 - t1).count()) 66 | / 1000000 / num_runs; 67 | std::cout << "Pull average_time: " << average_time_in_sec * 1000 << " ms" << std::endl; 68 | uint32_t nnz = bfs.get_nnz(); 69 | double op_count = static_cast<double>(nnz) * num_iterations; // convert before multiplying: a uint32_t product wraps past 2^32 70 | double throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec; 71 | std::cout << "Pull Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl; 72 | 73 | // Pull-Push 74 | float threshold = 0.001; 75 | kernel_results = bfs.pull_push(source, num_iterations, threshold); 76 | // verify(reference_results, kernel_results); 77 | // std::cout << "BFS pull-push passed" << std::endl; 78 | 79 | num_runs = 1; 80 | t1 = std::chrono::high_resolution_clock::now(); 81 | for (size_t i = 0; i < num_runs; i++) { 82 | kernel_results = bfs.pull_push(source, num_iterations, threshold); 83 | } 84 | t2 = std::chrono::high_resolution_clock::now(); 85 | average_time_in_sec = float(std::chrono::duration_cast(t2 - t1).count()) 86 | / 1000000 / num_runs; 87 | std::cout << "Pull-Push average_time: " << average_time_in_sec * 1000 << " ms" << std::endl; 88 | throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec; 89 | std::cout << "Pull-Push Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl; 90 | } 91 
| 92 | 93 | int main(int argc, char *argv[]) { 94 | bench_bfs(strtol(argv[1], NULL, 10), 95 | strtol(argv[2], NULL, 10), 96 | strtol(argv[3], NULL, 10), 97 | strtol(argv[4], NULL, 10), 98 | argv[5], 99 | argv[6], 100 | strtol(argv[7], NULL, 10)); 101 | } 102 | 103 | #pragma GCC diagnostic pop 104 | -------------------------------------------------------------------------------- /benchmark/bench_pagerank.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context" 3 | #pragma GCC diagnostic ignored "-Wuninitialized" 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 5 | 6 | #include 7 | #include 8 | 9 | #include "graphlily/app/pagerank.h" 10 | 11 | 12 | template 13 | void verify(std::vector> &reference_results, 14 | std::vector> &kernel_results) { 15 | if (!(reference_results.size() == kernel_results.size())) { 16 | std::cout << "Size mismatch!" << std::endl; 17 | exit(EXIT_FAILURE); 18 | } 19 | float epsilon = 0.0001; 20 | for (size_t i = 0; i < reference_results.size(); i++) { 21 | if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) { 22 | std::cout << "Error: Result mismatch" 23 | << std::endl; 24 | std::cout << "i = " << i 25 | << " Reference result = " << reference_results[i] 26 | << " Kernel result = " << kernel_results[i] 27 | << std::endl; 28 | exit(EXIT_FAILURE); 29 | } 30 | } 31 | } 32 | 33 | 34 | void bench_pagerank(uint32_t num_channels, uint32_t spmv_out_buf_len, 35 | uint32_t vec_buf_len, std::string bitstream, std::string dataset) { 36 | graphlily::app::PageRank pagerank(graphlily::num_hbm_channels, spmv_out_buf_len, vec_buf_len); 37 | pagerank.set_target("hw"); 38 | pagerank.set_up_runtime(bitstream); 39 | 40 | float damping = 0.9; 41 | bool skip_empty_rows = true; 42 | pagerank.load_and_format_matrix(dataset, damping, skip_empty_rows); 43 | std::cout << "finished load_and_format_matrix" << std::endl; 44 | 
pagerank.send_matrix_host_to_device(); 45 | 46 | uint32_t num_iterations = 10; 47 | auto reference_results = pagerank.compute_reference_results(damping, num_iterations); 48 | 49 | auto kernel_results = pagerank.pull(damping, num_iterations); 50 | // verify(reference_results, kernel_results); 51 | // std::cout << "PageRank passed" << std::endl; 52 | 53 | uint32_t num_runs = 1; 54 | auto t1 = std::chrono::high_resolution_clock::now(); 55 | for (size_t i = 0; i < num_runs; i++) { 56 | kernel_results = pagerank.pull(damping, num_iterations); 57 | } 58 | auto t2 = std::chrono::high_resolution_clock::now(); 59 | float average_time_in_sec = float(std::chrono::duration_cast(t2 - t1).count()) 60 | / 1000000 / num_runs / num_iterations; 61 | std::cout << "PageRank time for one iteration: " << average_time_in_sec * 1000 << " ms" << std::endl; 62 | uint32_t nnz = pagerank.get_nnz(); 63 | double op_count = nnz; 64 | double throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec; 65 | std::cout << "PageRank Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl; 66 | } 67 | 68 | 69 | int main(int argc, char *argv[]) { 70 | bench_pagerank(strtol(argv[1], NULL, 10), 71 | strtol(argv[2], NULL, 10), 72 | strtol(argv[3], NULL, 10), 73 | argv[4], 74 | argv[5]); 75 | } 76 | 77 | #pragma GCC diagnostic pop 78 | -------------------------------------------------------------------------------- /benchmark/bench_spmv.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context" 3 | #pragma GCC diagnostic ignored "-Wuninitialized" 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 5 | 6 | #include 7 | #include 8 | 9 | #include "xcl2.hpp" 10 | 11 | #include "graphlily/io/data_loader.h" 12 | #include "graphlily/module/spmv_module.h" 13 | 14 | 15 | template 16 | void verify(std::vector> &reference_results, 17 | std::vector> &kernel_results) { 18 | if 
(!(reference_results.size() == kernel_results.size())) { 19 | std::cout << "Size mismatch!" << std::endl; 20 | exit(EXIT_FAILURE); 21 | } 22 | float epsilon = 0.0001; 23 | for (size_t i = 0; i < reference_results.size(); i++) { 24 | if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) { 25 | std::cout << "Error: Result mismatch" 26 | << std::endl; 27 | std::cout << "i = " << i 28 | << " Reference result = " << reference_results[i] 29 | << " Kernel result = " << kernel_results[i] 30 | << std::endl; 31 | exit(EXIT_FAILURE); 32 | } 33 | } 34 | } 35 | 36 | 37 | void bench_spmv(uint32_t num_channels, uint32_t out_buf_len, uint32_t vec_buf_len, 38 | std::string bitstream, std::string dataset) { 39 | graphlily::module::SpMVModule spmv(num_channels, 40 | out_buf_len, 41 | vec_buf_len); 42 | spmv.set_target("hw"); 43 | graphlily::MaskType mask_type = graphlily::kNoMask; 44 | spmv.set_mask_type(mask_type); 45 | spmv.set_semiring(graphlily::ArithmeticSemiring); 46 | spmv.set_up_runtime(bitstream); 47 | 48 | std::string csr_float_npz_path = dataset; 49 | CSRMatrix csr_matrix = graphlily::io::load_csr_matrix_from_float_npz(csr_float_npz_path); 50 | for (auto &x : csr_matrix.adj_data) x = 1.0 / csr_matrix.num_rows; 51 | 52 | graphlily::io::util_round_csr_matrix_dim( 53 | csr_matrix, 54 | num_channels * graphlily::pack_size, 55 | graphlily::pack_size); 56 | 57 | std::vector> vector_float(csr_matrix.num_cols); 58 | std::generate(vector_float.begin(), vector_float.end(), [&]{return float(rand() % 2);}); 59 | std::vector> vector(vector_float.begin(), 60 | vector_float.end()); 61 | 62 | std::vector> mask_float(csr_matrix.num_cols); 63 | std::generate(mask_float.begin(), mask_float.end(), [&](){return float(rand() % 2);}); 64 | std::vector> mask(mask_float.begin(), 65 | mask_float.end()); 66 | 67 | bool skip_empty_rows = true; 68 | spmv.load_and_format_matrix(csr_matrix, skip_empty_rows); 69 | 70 | std::cout << "finished load_and_format_matrix" << std::endl; 71 | 72 | 
spmv.send_matrix_host_to_device(); 73 | spmv.send_vector_host_to_device(vector); 74 | // send the mask to device even if the kernel does not use it 75 | spmv.send_mask_host_to_device(mask); 76 | 77 | std::cout << "start run" << std::endl; 78 | 79 | spmv.run(); 80 | 81 | auto kernel_results = spmv.send_results_device_to_host(); 82 | std::vector> reference_results; 83 | if (mask_type == graphlily::kNoMask) { 84 | reference_results = spmv.compute_reference_results(vector_float); 85 | } else { 86 | reference_results = spmv.compute_reference_results(vector_float, mask_float); 87 | } 88 | 89 | // for (int i = 0; i < 10; i++) { 90 | // std::cout << reference_results[i] << " " << kernel_results[i] <(reference_results, kernel_results); 94 | // std::cout << "SpMV passed" << std::endl; 95 | 96 | uint32_t num_runs = 100; 97 | auto t1 = std::chrono::high_resolution_clock::now(); 98 | for (size_t i = 0; i < num_runs; i++) { 99 | spmv.run(); 100 | } 101 | auto t2 = std::chrono::high_resolution_clock::now(); 102 | float average_time_in_sec = float(std::chrono::duration_cast(t2 - t1).count()) 103 | / 1000000 / num_runs; 104 | std::cout << "average_time: " << average_time_in_sec * 1000 << " ms" << std::endl; 105 | 106 | uint32_t nnz = spmv.get_nnz(); 107 | double throughput = nnz; 108 | throughput /= 1000; 109 | throughput /= 1000; 110 | throughput /= 1000; 111 | throughput /= average_time_in_sec; 112 | std::cout << "Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl; 113 | } 114 | 115 | 116 | int main(int argc, char *argv[]) { 117 | bench_spmv(strtol(argv[1], NULL, 10), 118 | strtol(argv[2], NULL, 10), 119 | strtol(argv[3], NULL, 10), 120 | argv[4], 121 | argv[5]); 122 | } 123 | 124 | #pragma GCC diagnostic pop 125 | -------------------------------------------------------------------------------- /benchmark/bench_sssp.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored 
"-Wint-in-bool-context" 3 | #pragma GCC diagnostic ignored "-Wuninitialized" 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 5 | 6 | #include 7 | #include 8 | 9 | #include "graphlily/app/sssp.h" 10 | 11 | 12 | template 13 | void verify(std::vector> &reference_results, 14 | std::vector> &kernel_results) { 15 | if (!(reference_results.size() == kernel_results.size())) { 16 | std::cout << "Size mismatch!" << std::endl; 17 | exit(EXIT_FAILURE); 18 | } 19 | float epsilon = 0.0001; 20 | for (size_t i = 0; i < reference_results.size(); i++) { 21 | if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) { 22 | std::cout << "Error: Result mismatch" 23 | << std::endl; 24 | std::cout << "i = " << i 25 | << " Reference result = " << reference_results[i] 26 | << " Kernel result = " << kernel_results[i] 27 | << std::endl; 28 | exit(EXIT_FAILURE); 29 | } 30 | } 31 | } 32 | 33 | 34 | void bench_sssp(uint32_t num_channels, uint32_t spmv_out_buf_len, 35 | uint32_t spmspv_out_buf_len, uint32_t vec_buf_len, 36 | std::string bitstream, std::string dataset, uint32_t num_iterations) { 37 | graphlily::app::SSSP sssp(num_channels, spmv_out_buf_len, spmspv_out_buf_len, vec_buf_len); 38 | sssp.set_target("hw"); 39 | sssp.set_up_runtime(bitstream); 40 | 41 | bool skip_empty_rows = true; 42 | sssp.load_and_format_matrix(dataset, skip_empty_rows); 43 | std::cout << "finished load_and_format_matrix" << std::endl; 44 | sssp.send_matrix_host_to_device(); 45 | 46 | uint32_t source = 0; 47 | auto reference_results = sssp.compute_reference_results(source, num_iterations); 48 | 49 | // Pull 50 | auto kernel_results = sssp.pull(source, num_iterations); 51 | // verify(reference_results, kernel_results); 52 | // std::cout << "SSSP pull passed" << std::endl; 53 | 54 | uint32_t num_runs = 1; 55 | auto t1 = std::chrono::high_resolution_clock::now(); 56 | for (size_t i = 0; i < num_runs; i++) { 57 | kernel_results = sssp.pull(source, num_iterations); 58 | } 59 | auto t2 = 
std::chrono::high_resolution_clock::now(); 60 | float average_time_in_sec = float(std::chrono::duration_cast(t2 - t1).count()) 61 | / 1000000 / num_runs; 62 | std::cout << "Pull average_time: " << average_time_in_sec * 1000 << " ms" << std::endl; 63 | uint32_t nnz = sssp.get_nnz(); 64 | double op_count = static_cast<double>(nnz) * num_iterations; // convert before multiplying: a uint32_t product wraps past 2^32 65 | double throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec; 66 | std::cout << "Pull Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl; 67 | 68 | // Pull-Push 69 | float threshold = 0.001; 70 | kernel_results = sssp.pull_push(source, num_iterations, threshold); 71 | // verify(reference_results, kernel_results); 72 | // std::cout << "SSSP pull-push passed" << std::endl; 73 | 74 | num_runs = 1; 75 | t1 = std::chrono::high_resolution_clock::now(); 76 | for (size_t i = 0; i < num_runs; i++) { 77 | kernel_results = sssp.pull_push(source, num_iterations, threshold); 78 | } 79 | t2 = std::chrono::high_resolution_clock::now(); 80 | average_time_in_sec = float(std::chrono::duration_cast(t2 - t1).count()) 81 | / 1000000 / num_runs; 82 | std::cout << "Pull-Push average_time: " << average_time_in_sec * 1000 << " ms" << std::endl; 83 | throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec; 84 | std::cout << "Pull-Push Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl; 85 | } 86 | 87 | 88 | int main(int argc, char *argv[]) { 89 | bench_sssp(strtol(argv[1], NULL, 10), 90 | strtol(argv[2], NULL, 10), 91 | strtol(argv[3], NULL, 10), 92 | strtol(argv[4], NULL, 10), 93 | argv[5], 94 | argv[6], 95 | strtol(argv[7], NULL, 10)); 96 | } 97 | 98 | #pragma GCC diagnostic pop 99 | -------------------------------------------------------------------------------- /benchmark/run_bfs.sh: -------------------------------------------------------------------------------- 1 | make bench_bfs 2 | 3 | num_channels=16 4 | spmv_out_buf_len=1024000 5 | spmspv_out_buf_len=256000 6 | vec_buf_len=30720 7 | 8 | 
bitstream=/work/shared/common/project_build/graphblas/bitstreams/ 9 | bitstream+=open_source_166MHz/overlay.xclbin 10 | 11 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph 12 | 13 | DATSETS=(gplus_108K_13M_csr_float32.npz 14 | ogbl_ppa_576K_42M_csr_float32.npz 15 | hollywood_1M_113M_csr_float32.npz 16 | pokec_1633K_31M_csr_float32.npz 17 | ogbn_products_2M_124M_csr_float32.npz 18 | orkut_3M_213M_csr_float32.npz) 19 | 20 | NUM_ITER=(7 11 10 11 23 6) 21 | 22 | BUILD_DIR=./build 23 | 24 | for ((i = 0; i < ${#DATSETS[@]}; i++)) do 25 | echo ${BUILD_DIR}/bench_bfs ${DATSETS[i]} 26 | ${BUILD_DIR}/bench_bfs $num_channels $spmv_out_buf_len $spmspv_out_buf_len $vec_buf_len \ 27 | $bitstream $DATASET_PATH/${DATSETS[i]} ${NUM_ITER[i]} 28 | done 29 | -------------------------------------------------------------------------------- /benchmark/run_pagerank.sh: -------------------------------------------------------------------------------- 1 | make bench_pagerank 2 | 3 | num_channels=16 4 | spmv_out_buf_len=1024000 5 | vec_buf_len=30720 6 | 7 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/ 8 | bitstream+=open_source_166MHz/overlay.xclbin 9 | 10 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph 11 | 12 | DATSETS=(gplus_108K_13M_csr_float32.npz 13 | ogbl_ppa_576K_42M_csr_float32.npz 14 | hollywood_1M_113M_csr_float32.npz 15 | pokec_1633K_31M_csr_float32.npz 16 | ogbn_products_2M_124M_csr_float32.npz 17 | orkut_3M_213M_csr_float32.npz) 18 | 19 | BUILD_DIR=./build 20 | 21 | for ((i = 0; i < ${#DATSETS[@]}; i++)) do 22 | echo ${BUILD_DIR}/bench_pagerank ${DATSETS[i]} 23 | ${BUILD_DIR}/bench_pagerank $num_channels $spmv_out_buf_len $vec_buf_len $bitstream $DATASET_PATH/${DATSETS[i]} 24 | done 25 | -------------------------------------------------------------------------------- /benchmark/run_spmv.sh: -------------------------------------------------------------------------------- 1 | make 
bench_spmv 2 | 3 | num_channels=16 4 | spmv_out_buf_len=1024000 5 | vec_buf_len=30720 6 | 7 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/ 8 | bitstream+=open_source_166MHz/overlay.xclbin 9 | 10 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph 11 | 12 | DATSETS=(gplus_108K_13M_csr_float32.npz 13 | ogbl_ppa_576K_42M_csr_float32.npz 14 | hollywood_1M_113M_csr_float32.npz 15 | pokec_1633K_31M_csr_float32.npz 16 | ogbn_products_2M_124M_csr_float32.npz 17 | orkut_3M_213M_csr_float32.npz) 18 | 19 | BUILD_DIR=./build 20 | 21 | for ((i = 0; i < ${#DATSETS[@]}; i++)) do 22 | echo ${BUILD_DIR}/bench_spmv ${DATSETS[i]} 23 | ${BUILD_DIR}/bench_spmv $num_channels $spmv_out_buf_len $vec_buf_len $bitstream $DATASET_PATH/${DATSETS[i]} 24 | done 25 | -------------------------------------------------------------------------------- /benchmark/run_sssp.sh: -------------------------------------------------------------------------------- 1 | make bench_sssp 2 | 3 | num_channels=16 4 | spmv_out_buf_len=1024000 5 | spmspv_out_buf_len=256000 6 | vec_buf_len=30720 7 | 8 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/ 9 | bitstream+=open_source_166MHz/overlay.xclbin 10 | 11 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph 12 | 13 | DATSETS=(gplus_108K_13M_csr_float32.npz 14 | ogbl_ppa_576K_42M_csr_float32.npz 15 | hollywood_1M_113M_csr_float32.npz 16 | pokec_1633K_31M_csr_float32.npz 17 | ogbn_products_2M_124M_csr_float32.npz 18 | orkut_3M_213M_csr_float32.npz) 19 | 20 | NUM_ITER=(7 11 10 11 23 6) 21 | 22 | BUILD_DIR=./build 23 | 24 | for ((i = 0; i < ${#DATSETS[@]}; i++)) do 25 | echo ${BUILD_DIR}/bench_sssp ${DATSETS[i]} 26 | ${BUILD_DIR}/bench_sssp $num_channels $spmv_out_buf_len $spmspv_out_buf_len $vec_buf_len \ 27 | $bitstream $DATASET_PATH/${DATSETS[i]} ${NUM_ITER[i]} 28 | done 29 | -------------------------------------------------------------------------------- 
/generate_bitstream/Makefile: -------------------------------------------------------------------------------- 1 | HOST_ARCH = x86 2 | 3 | CXXFLAGS += -Wall -O3 -g -std=c++11 4 | CXXFLAGS += -I$(GRAPHLILY_ROOT_PATH) 5 | 6 | LDFLAGS += -lrt -lstdc++ 7 | 8 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/xcl2/xcl2.mk 9 | CXXFLAGS += $(xcl2_CXXFLAGS) 10 | LDFLAGS += $(xcl2_LDFLAGS) 11 | 12 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/opencl/opencl.mk 13 | CXXFLAGS += $(opencl_CXXFLAGS) 14 | LDFLAGS += $(opencl_LDFLAGS) 15 | 16 | BUILD_DIR = ./build 17 | synthesize: synthesize.cpp $(xcl2_SRCS) 18 | g++ $(CXXFLAGS) synthesize.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 19 | mkdir -p $(BUILD_DIR) 20 | mv synthesize $(BUILD_DIR)/ 21 | cd $(BUILD_DIR); ./synthesize 22 | -------------------------------------------------------------------------------- /generate_bitstream/synthesize.cpp: -------------------------------------------------------------------------------- 1 | #include "graphlily/synthesizer/overlay_synthesizer.h" 2 | 3 | 4 | int main(int argc, char *argv[]) { 5 | uint32_t spmv_out_buf_len = 1000 * 1024; 6 | uint32_t spmspv_out_buf_len = 250 * 1024; 7 | uint32_t vec_buf_len = 30 * 1024; 8 | uint32_t num_hbm_channels = 16; 9 | 10 | graphlily::synthesizer::OverlaySynthesizer synthesizer(num_hbm_channels, 11 | spmv_out_buf_len, 12 | spmspv_out_buf_len, 13 | vec_buf_len); 14 | synthesizer.set_target("hw"); 15 | synthesizer.synthesize(); 16 | } 17 | -------------------------------------------------------------------------------- /graphlily/app/module_collection.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_MODULE_COLLECTION_H_ 2 | #define GRAPHLILY_MODULE_COLLECTION_H_ 3 | 4 | #include "graphlily/global.h" 5 | #include "graphlily/module/base_module.h" 6 | 7 | 8 | namespace graphlily { 9 | namespace app { 10 | 11 | using namespace module; 12 | 13 | class ModuleCollection { 14 | protected: 15 | /*! 
\brief The modules */ 16 | std::vector modules_; 17 | /*! \brief The number of modules */ 18 | uint32_t num_modules_ = 0; 19 | /*! \brief The kernel names */ 20 | std::vector kernel_names_; 21 | /*! \brief The target; can be sw_emu, hw_emu, hw */ 22 | std::string target_; 23 | 24 | // OpenCL runtime 25 | cl::Device device_; 26 | cl::Context context_; 27 | std::vector kernels_; 28 | std::vector command_queues_; 29 | 30 | public: 31 | ModuleCollection() {}; 32 | 33 | /*! 34 | * \brief Free up resources in the destructor. 35 | */ 36 | ~ModuleCollection() { 37 | for (size_t i = 0; i < this->num_modules_; i++) { 38 | delete this->modules_[i]; 39 | } 40 | } 41 | 42 | /*! 43 | * \brief Add a module. 44 | * \param module The module to be added. 45 | */ 46 | void add_module(BaseModule *module) { 47 | this->modules_.push_back(module); 48 | std::string kernel_name = module->get_kernel_name(); 49 | this->kernel_names_.push_back(kernel_name); 50 | this->num_modules_++; 51 | } 52 | 53 | /*! 54 | * \brief Set the target. 55 | */ 56 | void set_target(std::string target) { 57 | assert(target == "sw_emu" || target == "hw_emu" || target == "hw"); 58 | this->target_ = target; 59 | } 60 | 61 | /*! 62 | * \brief Load the xclbin file and set up runtime. 63 | * \param xclbin_file_path The xclbin file path. 
64 | */ 65 | void set_up_runtime(std::string xclbin_file_path); 66 | }; 67 | 68 | 69 | void ModuleCollection::set_up_runtime(std::string xclbin_file_path) { 70 | this->kernels_.resize(this->num_modules_); 71 | this->command_queues_.resize(this->num_modules_); 72 | cl_int err; 73 | // Set this->device_ and this->context_ 74 | if (this->target_ == "sw_emu" || this->target_ == "hw_emu") { 75 | setenv("XCL_EMULATION_MODE", this->target_.c_str(), true); 76 | } 77 | this->device_ = graphlily::find_device(); 78 | this->context_ = cl::Context(this->device_, NULL, NULL, NULL); 79 | // Set this->kernels_ 80 | auto file_buf = xcl::read_binary_file(xclbin_file_path); 81 | cl::Program::Binaries binaries{{file_buf.data(), file_buf.size()}}; 82 | cl::Program program(this->context_, {this->device_}, binaries, NULL, &err); 83 | if (err != CL_SUCCESS) { 84 | std::cout << "Failed to program device with xclbin file\n"; 85 | } else { 86 | std::cout << "Successfully programmed device with xclbin file\n"; 87 | } 88 | for (size_t i = 0; i < this->num_modules_; i++) { 89 | OCL_CHECK(err, this->kernels_[i] = cl::Kernel(program, this->kernel_names_[i].c_str(), &err)); 90 | } 91 | // Set this->command_queues_ 92 | for (size_t i = 0; i < this->num_modules_; i++) { 93 | OCL_CHECK(err, this->command_queues_[i] = cl::CommandQueue(this->context_, 94 | this->device_, 95 | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | 96 | CL_QUEUE_PROFILING_ENABLE, 97 | &err)); 98 | } 99 | // Set up runtime for each module 100 | for (size_t i = 0; i < this->num_modules_; i++) { 101 | this->modules_[i]->set_device(this->device_); 102 | this->modules_[i]->set_context(this->context_); 103 | this->modules_[i]->set_kernel(this->kernels_[i]); 104 | this->modules_[i]->set_command_queue(this->command_queues_[i]); 105 | } 106 | // Set unused arguments for each module 107 | for (size_t i = 0; i < this->num_modules_; i++) { 108 | this->modules_[i]->set_unused_args(); 109 | } 110 | // Set the mode for each module 111 | for (size_t 
i = 0; i < this->num_modules_; i++) { 112 | this->modules_[i]->set_mode(); 113 | } 114 | } 115 | 116 | } // namespace app 117 | } // namespace graphlily 118 | 119 | #endif // GRAPHLILY_MODULE_COLLECTION_H_ 120 | -------------------------------------------------------------------------------- /graphlily/app/pagerank.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_APP_PAGERANK_H_ 2 | #define GRAPHLILY_APP_PAGERANK_H_ 3 | 4 | #include "graphlily/app/module_collection.h" 5 | #include "graphlily/module/spmv_module.h" 6 | #include "graphlily/module/add_scalar_vector_dense_module.h" 7 | #include "graphlily/io/data_loader.h" 8 | #include "graphlily/io/data_formatter.h" 9 | 10 | #include 11 | #include 12 | 13 | 14 | namespace graphlily { 15 | namespace app { 16 | 17 | class PageRank : public app::ModuleCollection { 18 | private: 19 | // modules 20 | graphlily::module::SpMVModule *SpMV_; 21 | graphlily::module::eWiseAddModule *eWiseAdd_; 22 | // Sparse matrix size 23 | uint32_t matrix_num_rows_; 24 | uint32_t matrix_num_cols_; 25 | // SpMV kernel configuration 26 | uint32_t num_channels_; 27 | uint32_t spmv_out_buf_len_; 28 | uint32_t vec_buf_len_; 29 | // Semiring 30 | graphlily::SemiringType semiring_ = graphlily::ArithmeticSemiring; 31 | // Data types 32 | using aligned_dense_vec_t = graphlily::aligned_dense_vec_t; 33 | using aligned_sparse_vec_t = graphlily::aligned_sparse_vec_t; 34 | using aligned_dense_float_vec_t = graphlily::aligned_dense_float_vec_t; 35 | 36 | public: 37 | PageRank(uint32_t num_channels, uint32_t spmv_out_buf_len, uint32_t vec_buf_len) { 38 | this->num_channels_ = num_channels; 39 | this->spmv_out_buf_len_ = spmv_out_buf_len; 40 | this->vec_buf_len_ = vec_buf_len; 41 | 42 | this->SpMV_ = new graphlily::module::SpMVModule( 43 | this->num_channels_, 44 | this->spmv_out_buf_len_, 45 | this->vec_buf_len_); 46 | this->SpMV_->set_semiring(semiring_); 47 | 
this->SpMV_->set_mask_type(graphlily::kNoMask); 48 | this->add_module(this->SpMV_); 49 | 50 | this->eWiseAdd_ = new graphlily::module::eWiseAddModule(); 51 | this->add_module(this->eWiseAdd_); 52 | } 53 | 54 | 55 | uint32_t get_nnz() { 56 | return this->SpMV_->get_nnz(); 57 | } 58 | 59 | 60 | void load_and_format_matrix(std::string csr_float_npz_path, float damping, bool skip_empty_rows) { 61 | CSRMatrix csr_matrix = graphlily::io::load_csr_matrix_from_float_npz(csr_float_npz_path); 62 | graphlily::io::util_round_csr_matrix_dim( 63 | csr_matrix, 64 | this->num_channels_ * graphlily::pack_size, 65 | this->num_channels_ * graphlily::pack_size); 66 | graphlily::io::util_normalize_csr_matrix_by_outdegree(csr_matrix); 67 | for (auto &x : csr_matrix.adj_data) x = x * damping; 68 | this->SpMV_->load_and_format_matrix(csr_matrix, skip_empty_rows); 69 | this->matrix_num_rows_ = this->SpMV_->get_num_rows(); 70 | this->matrix_num_cols_ = this->SpMV_->get_num_cols(); 71 | assert(this->matrix_num_rows_ == this->matrix_num_cols_); 72 | } 73 | 74 | 75 | void send_matrix_host_to_device() { 76 | this->SpMV_->send_matrix_host_to_device(); 77 | } 78 | 79 | 80 | aligned_dense_vec_t pull(graphlily::val_t damping, uint32_t num_iterations) { 81 | aligned_dense_vec_t rank(this->matrix_num_rows_, 1.0 / this->matrix_num_rows_); 82 | this->SpMV_->send_vector_host_to_device(rank); 83 | this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf); 84 | this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf); 85 | for (size_t iter = 1; iter <= num_iterations; iter++) { 86 | this->SpMV_->run(); 87 | this->eWiseAdd_->run(this->matrix_num_rows_, (1 - damping) / this->matrix_num_rows_); 88 | } 89 | return this->SpMV_->send_vector_device_to_host(); 90 | } 91 | 92 | 93 | aligned_dense_vec_t pull_time_breakdown(graphlily::val_t damping, uint32_t num_iterations) { 94 | float total_time_ms = 0.0; 95 | float spmv_time_ms = 0.0; 96 | float ewise_time_ms = 0.0; 97 | float data_transfer_time_ms = 0.0; 98 | // 
Initialize 99 | auto total_time_start = std::chrono::high_resolution_clock::now(); 100 | auto spmv_time_start = std::chrono::high_resolution_clock::now(); 101 | auto ewise_time_start = std::chrono::high_resolution_clock::now(); 102 | auto data_transfer_time_start = std::chrono::high_resolution_clock::now(); 103 | auto total_time_end = std::chrono::high_resolution_clock::now(); 104 | auto spmv_time_end = std::chrono::high_resolution_clock::now(); 105 | auto ewise_time_end = std::chrono::high_resolution_clock::now(); 106 | auto data_transfer_time_end = std::chrono::high_resolution_clock::now(); 107 | 108 | data_transfer_time_start = std::chrono::high_resolution_clock::now(); 109 | aligned_dense_vec_t rank(this->matrix_num_rows_, 1.0 / this->matrix_num_rows_); 110 | this->SpMV_->send_vector_host_to_device(rank); 111 | this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf); 112 | this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf); 113 | data_transfer_time_end = std::chrono::high_resolution_clock::now(); 114 | data_transfer_time_ms += float(std::chrono::duration_cast( 115 | data_transfer_time_end - data_transfer_time_start).count()) / 1000; 116 | 117 | for (size_t iter = 1; iter <= num_iterations; iter++) { 118 | spmv_time_start = std::chrono::high_resolution_clock::now(); 119 | this->SpMV_->run(); 120 | spmv_time_end = std::chrono::high_resolution_clock::now(); 121 | spmv_time_ms += float(std::chrono::duration_cast( 122 | spmv_time_end - spmv_time_start).count()) / 1000; 123 | 124 | ewise_time_start = std::chrono::high_resolution_clock::now(); 125 | this->eWiseAdd_->run(this->matrix_num_rows_, (1 - damping) / this->matrix_num_rows_); 126 | ewise_time_end = std::chrono::high_resolution_clock::now(); 127 | ewise_time_ms += float(std::chrono::duration_cast( 128 | ewise_time_end - ewise_time_start).count()) / 1000; 129 | } 130 | 131 | data_transfer_time_start = std::chrono::high_resolution_clock::now(); 132 | auto result = this->SpMV_->send_mask_device_to_host(); // 
the mask of SpMV on the host is not valid 133 | data_transfer_time_end = std::chrono::high_resolution_clock::now(); 134 | data_transfer_time_ms += float(std::chrono::duration_cast( 135 | data_transfer_time_end - data_transfer_time_start).count()) / 1000; 136 | 137 | total_time_end = std::chrono::high_resolution_clock::now(); 138 | total_time_ms = float(std::chrono::duration_cast( 139 | total_time_end - total_time_start).count()) / 1000; 140 | 141 | std::cout << "total_time_ms per iteration: " << total_time_ms / num_iterations << std::endl; 142 | std::cout << "spmv_time_ms per iteration: " << spmv_time_ms / num_iterations << std::endl; 143 | std::cout << "ewise_time_ms per iteration: " << ewise_time_ms / num_iterations << std::endl; 144 | std::cout << "data_transfer_time_ms per iteration: " << data_transfer_time_ms / num_iterations << std::endl; 145 | 146 | return result; 147 | } 148 | 149 | 150 | aligned_dense_float_vec_t compute_reference_results(float damping, uint32_t num_iterations) { 151 | aligned_dense_float_vec_t rank(this->matrix_num_rows_, 1.0 / this->matrix_num_rows_); 152 | for (size_t iter = 1; iter <= num_iterations; iter++) { 153 | rank = this->SpMV_->compute_reference_results(rank); 154 | rank = this->eWiseAdd_->compute_reference_results(rank, 155 | this->matrix_num_rows_, 156 | (1 - damping) / this->matrix_num_rows_); 157 | } 158 | return rank; 159 | } 160 | }; 161 | 162 | } // namespace app 163 | } // namespace graphlily 164 | 165 | #endif // GRAPHLILY_APP_PAGERANK_H_ 166 | -------------------------------------------------------------------------------- /graphlily/app/sssp.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_APP_SSSP_H_ 2 | #define GRAPHLILY_APP_SSSP_H_ 3 | 4 | #include "graphlily/app/module_collection.h" 5 | #include "graphlily/module/spmv_module.h" 6 | #include "graphlily/module/spmspv_module.h" 7 | #include "graphlily/module/assign_vector_dense_module.h" 8 | #include 
"graphlily/module/assign_vector_sparse_module.h" 9 | #include "graphlily/module/add_scalar_vector_dense_module.h" 10 | #include "graphlily/io/data_loader.h" 11 | #include "graphlily/io/data_formatter.h" 12 | 13 | 14 | namespace { 15 | 16 | void _preprocess(CSRMatrix& csr_matrix) { 17 | // randomly initialize edge weights 18 | for (uint32_t i = 0; i < csr_matrix.adj_data.size(); i++) { 19 | // csr_matrix.adj_data[i] = i % 10 + 1; 20 | csr_matrix.adj_data[i] = 1; // When all edge weights are 1, SSSP becomes BFS 21 | } 22 | // add self edges and set their weights to zero 23 | uint32_t num_rows = csr_matrix.adj_indptr.size() - 1; 24 | std::vector nnz_each_row(num_rows); 25 | for (size_t i = 0; i < num_rows; i++) { 26 | nnz_each_row[i] = csr_matrix.adj_indptr[i + 1] - csr_matrix.adj_indptr[i]; 27 | } 28 | csr_matrix.adj_data.reserve(csr_matrix.adj_data.size() + num_rows); 29 | csr_matrix.adj_indices.reserve(csr_matrix.adj_indices.size() + num_rows); 30 | for (size_t row_idx = 0; row_idx < num_rows; row_idx++) { 31 | uint32_t start = csr_matrix.adj_indptr[row_idx]; 32 | uint32_t end = csr_matrix.adj_indptr[row_idx + 1]; 33 | if (start == end) { 34 | csr_matrix.adj_data.insert(csr_matrix.adj_data.begin() + start, float(0)); 35 | csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin() + start, row_idx); 36 | nnz_each_row[row_idx]++; 37 | } else { 38 | bool add_self_edge = false; 39 | for (size_t i = start; i < end; i++) { 40 | uint32_t col_idx = csr_matrix.adj_indices[i]; 41 | if (col_idx == row_idx) { 42 | csr_matrix.adj_data[i] = float(0); 43 | break; 44 | } else if (col_idx > row_idx) { 45 | add_self_edge = true; 46 | csr_matrix.adj_data.insert(csr_matrix.adj_data.begin() + i, float(0)); 47 | csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin() + i, row_idx); 48 | break; 49 | } else if (i == (end - 1)) { 50 | add_self_edge = true; 51 | csr_matrix.adj_data.insert(csr_matrix.adj_data.begin() + i, float(0)); 52 | 
csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin() + i, row_idx); 53 | break; 54 | } 55 | } 56 | if (add_self_edge) { 57 | nnz_each_row[row_idx]++; 58 | } 59 | } 60 | csr_matrix.adj_indptr[row_idx + 1] = csr_matrix.adj_indptr[row_idx] + nnz_each_row[row_idx]; 61 | } 62 | } 63 | 64 | } // namespace 65 | 66 | 67 | namespace graphlily { 68 | namespace app { 69 | 70 | class SSSP : public app::ModuleCollection { 71 | private: 72 | // modules 73 | module::SpMVModule *SpMV_; 74 | module::AssignVectorDenseModule *DenseAssign_; 75 | module::SpMSpVModule *SpMSpV_; 76 | module::AssignVectorSparseModule *SparseAssign_; 77 | module::eWiseAddModule *eWiseAdd_; // for on-device data transfer 78 | // Sparse matrix size 79 | uint32_t matrix_num_rows_; 80 | uint32_t matrix_num_cols_; 81 | // SpMV kernel configuration 82 | uint32_t num_channels_; 83 | uint32_t spmv_out_buf_len_; 84 | uint32_t spmspv_out_buf_len_; 85 | uint32_t vec_buf_len_; 86 | // Semiring 87 | graphlily::SemiringType semiring_ = graphlily::TropicalSemiring; 88 | // Data types 89 | using aligned_dense_vec_t = graphlily::aligned_dense_vec_t; 90 | using aligned_sparse_vec_t = graphlily::aligned_sparse_vec_t; 91 | using aligned_dense_float_vec_t = graphlily::aligned_dense_float_vec_t; 92 | 93 | public: 94 | SSSP(uint32_t num_channels, uint32_t spmv_out_buf_len, 95 | uint32_t spmspv_out_buf_len, uint32_t vec_buf_len) { 96 | this->num_channels_ = num_channels; 97 | this->spmv_out_buf_len_ = spmv_out_buf_len; 98 | this->spmspv_out_buf_len_ = spmspv_out_buf_len; 99 | this->vec_buf_len_ = vec_buf_len; 100 | 101 | this->SpMV_ = new module::SpMVModule( 102 | this->num_channels_, 103 | this->spmv_out_buf_len_, 104 | this->vec_buf_len_); 105 | this->SpMV_->set_semiring(semiring_); 106 | this->SpMV_->set_mask_type(graphlily::kNoMask); 107 | this->add_module(this->SpMV_); 108 | 109 | this->SpMSpV_ = new module::SpMSpVModule( 110 | spmspv_out_buf_len); 111 | this->SpMSpV_->set_semiring(semiring_); 112 | 
this->SpMSpV_->set_mask_type(graphlily::kNoMask); 113 | this->add_module(this->SpMSpV_); 114 | 115 | bool generate_new_frontier = true; 116 | this->SparseAssign_ = new module::AssignVectorSparseModule( 117 | generate_new_frontier); 118 | this->add_module(this->SparseAssign_); 119 | 120 | this->eWiseAdd_ = new module::eWiseAddModule(); 121 | this->add_module(this->eWiseAdd_); 122 | } 123 | 124 | 125 | uint32_t get_nnz() { 126 | return this->SpMV_->get_nnz(); 127 | } 128 | 129 | 130 | void load_and_format_matrix(std::string csr_float_npz_path, bool skip_empty_rows) { 131 | CSRMatrix csr_matrix = graphlily::io::load_csr_matrix_from_float_npz(csr_float_npz_path); 132 | _preprocess(csr_matrix); 133 | graphlily::io::util_round_csr_matrix_dim( 134 | csr_matrix, 135 | this->num_channels_ * graphlily::pack_size, 136 | this->num_channels_ * graphlily::pack_size); 137 | CSCMatrix csc_matrix = graphlily::io::csr2csc(csr_matrix); 138 | this->SpMV_->load_and_format_matrix(csr_matrix, skip_empty_rows); 139 | this->SpMSpV_->load_and_format_matrix(csc_matrix); 140 | this->matrix_num_rows_ = this->SpMV_->get_num_rows(); 141 | this->matrix_num_cols_ = this->SpMV_->get_num_cols(); 142 | assert(this->matrix_num_rows_ == this->matrix_num_cols_); 143 | } 144 | 145 | 146 | void send_matrix_host_to_device() { 147 | this->SpMV_->send_matrix_host_to_device(); 148 | this->SpMSpV_->send_matrix_host_to_device(); 149 | } 150 | 151 | 152 | aligned_dense_vec_t pull(uint32_t source, uint32_t num_iterations) { 153 | aligned_dense_vec_t input(this->matrix_num_rows_, semiring_.zero); 154 | input[source] = 0; 155 | this->SpMV_->send_vector_host_to_device(input); 156 | this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf); 157 | this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf); 158 | for (size_t iter = 1; iter <= num_iterations; iter++) { 159 | this->SpMV_->run(); 160 | // this->SpMV_->copy_buffer_device_to_device(this->SpMV_->results_buf, 161 | // this->SpMV_->vector_buf, 162 | // 
sizeof(graphlily::val_t) * this->matrix_num_rows_); 163 | this->eWiseAdd_->run(this->matrix_num_rows_, 0); 164 | } 165 | return this->SpMV_->send_vector_device_to_host(); 166 | } 167 | 168 | 169 | aligned_dense_vec_t push(uint32_t source, uint32_t num_iterations) { 170 | // The sparse input vector 171 | aligned_sparse_vec_t spmspv_input(2); 172 | idx_val_t head; 173 | graphlily::idx_t nnz = 1; // one source vertex 174 | head.index = nnz; 175 | spmspv_input[0] = head; 176 | spmspv_input[1] = {source, 0}; 177 | 178 | // The dense distance vector 179 | aligned_dense_vec_t distance(this->matrix_num_rows_, semiring_.zero); 180 | distance[source] = 0; 181 | 182 | // Push 183 | this->SpMSpV_->send_vector_host_to_device(spmspv_input); 184 | this->SpMSpV_->send_mask_host_to_device(distance); 185 | this->SparseAssign_->bind_mask_buf(this->SpMSpV_->results_buf); 186 | this->SparseAssign_->bind_inout_buf(this->SpMSpV_->mask_buf); 187 | this->SparseAssign_->bind_new_frontier_buf(this->SpMSpV_->vector_buf); 188 | for (size_t iter = 1; iter <= num_iterations; iter++) { 189 | this->SpMSpV_->run(); 190 | this->SparseAssign_->run(); 191 | } 192 | 193 | return this->SpMSpV_->send_mask_device_to_host(); 194 | } 195 | 196 | 197 | aligned_dense_vec_t pull_push(uint32_t source, uint32_t num_iterations, float threshold = 0.05) { 198 | // The sparse input vector 199 | aligned_sparse_vec_t spmspv_input(2); 200 | idx_val_t head; 201 | graphlily::idx_t nnz = 1; // one source vertex 202 | head.index = nnz; 203 | spmspv_input[0] = head; 204 | spmspv_input[1] = {source, 0}; 205 | 206 | // The dense distance vector 207 | aligned_dense_vec_t distance(this->matrix_num_rows_, semiring_.zero); 208 | distance[source] = 0; 209 | 210 | // Push 211 | this->SpMSpV_->send_vector_host_to_device(spmspv_input); 212 | this->SpMSpV_->send_mask_host_to_device(distance); 213 | this->SparseAssign_->bind_mask_buf(this->SpMSpV_->results_buf); 214 | this->SparseAssign_->bind_inout_buf(this->SpMSpV_->mask_buf); 215 | 
this->SparseAssign_->bind_new_frontier_buf(this->SpMSpV_->vector_buf); 216 | uint32_t iter = 1; 217 | uint32_t vector_nnz; 218 | do { 219 | this->SpMSpV_->run(); 220 | this->SparseAssign_->run(); 221 | vector_nnz = this->SpMSpV_->get_results_nnz(); 222 | iter++; 223 | } while (iter < num_iterations && (float(vector_nnz) / this->matrix_num_rows_ < threshold)); 224 | 225 | std::cout << "SpMSpV runs for " << (iter - 1) << " iterations" << std::endl; 226 | 227 | // Switch from push to pull 228 | aligned_dense_vec_t spmv_input = this->SpMSpV_->send_mask_device_to_host(); 229 | this->SpMV_->send_vector_host_to_device(spmv_input); 230 | this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf); 231 | this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf); 232 | 233 | // Pull 234 | for ( ; iter <= num_iterations; iter++) { 235 | this->SpMV_->run(); 236 | // this->SpMV_->copy_buffer_device_to_device(this->SpMV_->results_buf, 237 | // this->SpMV_->vector_buf, 238 | // sizeof(graphlily::val_t) * this->matrix_num_rows_); 239 | this->eWiseAdd_->run(this->matrix_num_rows_, 0); 240 | } 241 | 242 | return this->SpMV_->send_vector_device_to_host(); 243 | } 244 | 245 | 246 | aligned_dense_float_vec_t compute_reference_results(uint32_t source, uint32_t num_iterations) { 247 | aligned_dense_float_vec_t input(this->matrix_num_rows_, semiring_.zero); 248 | input[source] = 0; 249 | for (size_t iter = 1; iter <= num_iterations; iter++) { 250 | input = this->SpMV_->compute_reference_results(input); 251 | } 252 | return input; 253 | } 254 | }; 255 | 256 | } // namespace app 257 | } // namespace graphlily 258 | 259 | #endif // GRAPHLILY_APP_SSSP_H_ 260 | -------------------------------------------------------------------------------- /graphlily/global.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_GLOBAL_H_ 2 | #define GRAPHLILY_GLOBAL_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "ap_fixed.h" 8 | #include "xcl2.hpp" 9 | 10 | 11 | 
namespace { 12 | 13 | std::string get_root_path() { 14 | char* root_path = getenv("GRAPHLILY_ROOT_PATH"); 15 | return root_path == NULL ? std::string("") : std::string(root_path); 16 | } 17 | 18 | } // namespace 19 | 20 | 21 | namespace graphlily { 22 | 23 | // The root path 24 | const std::string root_path = get_root_path(); 25 | 26 | // The device 27 | const std::string device_name = "xilinx_u280_xdma_201920_3"; 28 | 29 | // Find the device 30 | cl::Device find_device() { 31 | auto devices = xcl::get_xil_devices(); 32 | for (size_t i = 0; i < devices.size(); i++) { 33 | cl::Device device = devices[i]; 34 | if (device.getInfo() == device_name) { 35 | return device; 36 | } 37 | } 38 | std::cout << "Failed to find " << device_name << ", exit!\n"; 39 | exit(EXIT_FAILURE); 40 | } 41 | 42 | // HBM channels 43 | #define MAX_HBM_CHANNEL_COUNT 32 44 | #define CHANNEL_NAME(n) n | XCL_MEM_TOPOLOGY 45 | const int HBM[MAX_HBM_CHANNEL_COUNT] = { 46 | CHANNEL_NAME(0), CHANNEL_NAME(1), CHANNEL_NAME(2), CHANNEL_NAME(3), CHANNEL_NAME(4), 47 | CHANNEL_NAME(5), CHANNEL_NAME(6), CHANNEL_NAME(7), CHANNEL_NAME(8), CHANNEL_NAME(9), 48 | CHANNEL_NAME(10), CHANNEL_NAME(11), CHANNEL_NAME(12), CHANNEL_NAME(13), CHANNEL_NAME(14), 49 | CHANNEL_NAME(15), CHANNEL_NAME(16), CHANNEL_NAME(17), CHANNEL_NAME(18), CHANNEL_NAME(19), 50 | CHANNEL_NAME(20), CHANNEL_NAME(21), CHANNEL_NAME(22), CHANNEL_NAME(23), CHANNEL_NAME(24), 51 | CHANNEL_NAME(25), CHANNEL_NAME(26), CHANNEL_NAME(27), CHANNEL_NAME(28), CHANNEL_NAME(29), 52 | CHANNEL_NAME(30), CHANNEL_NAME(31)}; 53 | 54 | const int DDR[2] = {CHANNEL_NAME(32), CHANNEL_NAME(33)}; 55 | 56 | // Kernel configurations 57 | const uint32_t pack_size = 8; 58 | const uint32_t spmv_row_interleave_factor = 1; 59 | const uint32_t num_hbm_channels = 16; 60 | 61 | // Data types (please change this according to the kernel!) 
62 | // using val_t = unsigned; 63 | using val_t = ap_ufixed<32, 8, AP_RND, AP_SAT>; 64 | // using val_t = float; 65 | typedef uint32_t idx_t; 66 | const uint32_t idx_marker = 0xffffffff; 67 | typedef struct {idx_t data[pack_size];} packed_idx_t; 68 | 69 | typedef struct {idx_t index; val_t val;} idx_val_t; 70 | typedef struct {idx_t index; float val;} idx_float_t; 71 | 72 | using aligned_dense_vec_t = std::vector>; 73 | using aligned_sparse_vec_t = std::vector>; 74 | 75 | using aligned_dense_float_vec_t = std::vector>; 76 | using aligned_sparse_float_vec_t = std::vector>; 77 | 78 | const val_t UINT_INF = 0xffffffff; 79 | const val_t UFIXED_INF = 255; 80 | const val_t FLOAT_INF = 999999999; 81 | 82 | // Operation type, named as k 83 | enum OperationType { 84 | kMulAdd = 0, 85 | kLogicalAndOr = 1, 86 | kAddMin = 2, 87 | }; 88 | 89 | // Semiring definition 90 | struct SemiringType { 91 | OperationType op; 92 | val_t one; // identity element for operator (a one = a) 93 | val_t zero; // identity element for operator <+> (a <+> zero = a) 94 | }; 95 | 96 | const SemiringType ArithmeticSemiring = {kMulAdd, 1, 0}; 97 | const SemiringType LogicalSemiring = {kLogicalAndOr, 1, 0}; 98 | // const SemiringType TropicalSemiring = {kAddMin, 0, UINT_INF}; 99 | const SemiringType TropicalSemiring = {kAddMin, 0, UFIXED_INF}; 100 | // const SemiringType TropicalSemiring = {kAddMin, 0, FLOAT_INF}; 101 | 102 | // Mask type 103 | enum MaskType { 104 | kNoMask = 0, 105 | kMaskWriteToZero = 1, 106 | kMaskWriteToOne = 2, 107 | }; 108 | 109 | // Makefile for synthesizing xclbin 110 | const std::string makefile_prologue = 111 | "DEVICE = /opt/xilinx/platforms/" + device_name + "/" + device_name + ".xpfm\n" 112 | "\n" 113 | "TEMP_DIR := ./_x.$(TARGET)\n" 114 | "BUILD_DIR := ./build_dir.$(TARGET)\n" 115 | "\n" 116 | "VPP := v++\n" 117 | "\n" 118 | "CLFLAGS += -t $(TARGET) --platform $(DEVICE) --kernel_frequency 200 --save-temps\n" 119 | "\n" 120 | "FUSED_KERNEL = $(BUILD_DIR)/fused.xclbin\n" 
121 | "\n" 122 | "emconfig.json:\n" 123 | "\temconfigutil --platform $(DEVICE)\n" 124 | "\n" 125 | "build: $(FUSED_KERNEL) emconfig.json\n" 126 | "\n"; 127 | 128 | const std::string makefile_epilogue = 129 | "$(FUSED_KERNEL): $(KERNEL_OBJS)\n" 130 | "\tmkdir -p $(BUILD_DIR)\n" 131 | "\t$(VPP) $(CLFLAGS) --temp_dir $(BUILD_DIR) -l $(LDCLFLAGS) -o'$@' $(+)\n"; 132 | 133 | std::string add_kernel_to_makefile(std::string kernel_name) { 134 | std::string makefile_body; 135 | makefile_body += ("LDCLFLAGS += --config " + kernel_name + ".ini" + "\n"); 136 | makefile_body += ("KERNEL_OBJS += $(TEMP_DIR)/" + kernel_name + ".xo" + "\n"); 137 | makefile_body += "\n"; 138 | makefile_body += ("$(TEMP_DIR)/" + kernel_name + ".xo: " + kernel_name + ".cpp" + "\n"); 139 | makefile_body += ("\tmkdir -p $(TEMP_DIR)\n"); 140 | makefile_body += ("\t$(VPP) $(CLFLAGS) --temp_dir $(TEMP_DIR) -c -k " + kernel_name + " -I'$( 154 | dense_vec_t convert_sparse_vec_to_dense_vec(const sparse_vec_t &sparse_vector, 155 | uint32_t range, 156 | val_t zero) { 157 | dense_vec_t dense_vector(range); 158 | std::fill(dense_vector.begin(), dense_vector.end(), zero); 159 | int nnz = sparse_vector[0].index; 160 | for (int i = 1; i < nnz + 1; i++) { 161 | dense_vector[sparse_vector[i].index] = sparse_vector[i].val; 162 | } 163 | return dense_vector; 164 | } 165 | 166 | // used to calculate BANK_ID_NBITS 167 | unsigned log2(unsigned x) { 168 | switch (x) { 169 | case 1: return 0; 170 | case 2: return 1; 171 | case 4: return 2; 172 | case 8: return 3; 173 | case 16: return 4; 174 | default : return 0; 175 | } 176 | } 177 | 178 | } // namespace graphlily 179 | 180 | #endif // GRAPHLILY_GLOBAL_H_ 181 | -------------------------------------------------------------------------------- /graphlily/hw/kernel_add_scalar_vector_dense_impl.h: -------------------------------------------------------------------------------- 1 | #include "./overlay.h" 2 | 3 | #include 4 | 5 | 6 | void kernel_add_scalar_vector_dense( 7 | const 
PACKED_VAL_T *in, // The input vector 8 | PACKED_VAL_T *out, // The output vector 9 | unsigned length, // The length of the in/out vector 10 | VAL_T val // The value to be added 11 | ) { 12 | assert(length % PACK_SIZE == 0); 13 | unsigned size = length / PACK_SIZE; 14 | PACKED_VAL_T tmp_in; 15 | PACKED_VAL_T tmp_out; 16 | 17 | loop_kernel_add_scalar_vector_dense: 18 | for (int i = 0; i < size; i++) { 19 | #pragma HLS PIPELINE II=1 20 | tmp_in = in[i]; 21 | for (int k = 0; k < PACK_SIZE; k++) { 22 | #pragma HLS UNROLL 23 | tmp_out.data[k] = tmp_in.data[k] + val; 24 | } 25 | out[i] = tmp_out; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /graphlily/hw/kernel_assign_vector_dense_impl.h: -------------------------------------------------------------------------------- 1 | #include "./overlay.h" 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | void kernel_assign_vector_dense( 9 | const PACKED_VAL_T *mask, // The mask vector 10 | PACKED_VAL_T *in, // The input vector 11 | PACKED_VAL_T *out, // The output vector 12 | unsigned length, // The length of the mask/inout vector 13 | VAL_T val, // The value to be assigned to the inout vector 14 | MASK_T mask_type // The mask type 15 | ) { 16 | assert(length % PACK_SIZE == 0); 17 | unsigned size = length / PACK_SIZE; 18 | PACKED_VAL_T tmp_mask; 19 | PACKED_VAL_T tmp_inout; 20 | 21 | loop_kernel_assign_vector_dense: 22 | for (int i = 0; i < size; i++) { 23 | #pragma HLS PIPELINE II=1 24 | 25 | tmp_mask = mask[i]; 26 | tmp_inout = in[i]; 27 | for (int k = 0; k < PACK_SIZE; k++) { 28 | #pragma HLS UNROLL 29 | if (mask_type == WRITETOZERO) { 30 | if (tmp_mask.data[k] == 0) { 31 | tmp_inout.data[k] = val; 32 | } 33 | } else if (mask_type == WRITETOONE) { 34 | if (tmp_mask.data[k] != 0) { 35 | tmp_inout.data[k] = val; 36 | } 37 | } else { 38 | tmp_inout.data[k] = 0; 39 | #ifndef __SYNTHESIS__ 40 | std::cout << "Invalid mask type" << std::endl; 41 | exit(EXIT_FAILURE); 42 | 
#endif 43 | } 44 | } 45 | out[i] = tmp_inout; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /graphlily/hw/kernel_assign_vector_sparse_new_frontier_impl.h: -------------------------------------------------------------------------------- 1 | #include "./overlay.h" 2 | 3 | 4 | void kernel_assign_vector_sparse_new_frontier( 5 | const IDX_VAL_T *mask, // The sparse mask vector. The index field of the first element is the length. 6 | VAL_T *inout, // The inout vector. 7 | IDX_VAL_T *new_frontier // The new frontier. The index field of the first element is the length. 8 | ) { 9 | // local buffer 10 | VAL_T local_inout_buf[BATCH_SIZE]; 11 | IDX_VAL_T local_mask_buf[BATCH_SIZE]; 12 | IDX_VAL_T local_new_frontier_buf[BATCH_SIZE]; 13 | #pragma HLS DATA_PACK variable=local_mask_buf 14 | #pragma HLS DATA_PACK variable=local_new_frontier_buf 15 | 16 | IDX_T length = mask[0].index; 17 | unsigned num_batches = (length + BATCH_SIZE - 1) / BATCH_SIZE; 18 | unsigned remain = length; 19 | 20 | unsigned new_frontier_length = 0; 21 | 22 | loop_over_batches: 23 | for (unsigned batch_cnt = 0; batch_cnt < num_batches; batch_cnt++) { 24 | #pragma HLS pipeline off 25 | unsigned batch_new_frontier_length = 0; 26 | 27 | // read stage 28 | loop_read_inout_val: 29 | for (unsigned i = 0; i < BATCH_SIZE; i++) { 30 | #pragma HLS pipeline II=1 31 | if (i < remain) { 32 | IDX_VAL_T tmp_mask = mask[i + 1 + batch_cnt * BATCH_SIZE]; 33 | local_mask_buf[i].index = tmp_mask.index; 34 | local_mask_buf[i].val = tmp_mask.val; 35 | local_inout_buf[i] = inout[tmp_mask.index]; 36 | } 37 | } 38 | 39 | // process stage 40 | loop_process: 41 | for (unsigned i = 0; i < BATCH_SIZE; i++) { 42 | #pragma HLS pipeline II=1 43 | if (i < remain) { 44 | if (local_inout_buf[i] > local_mask_buf[i].val) { 45 | local_inout_buf[i] = local_mask_buf[i].val; 46 | local_new_frontier_buf[batch_new_frontier_length] = local_mask_buf[i]; 47 | batch_new_frontier_length++; 48 | } 49 | 
} 50 | } 51 | 52 | // write inout 53 | loop_write_inout_val: 54 | for (unsigned i = 0; i < BATCH_SIZE; i++) { 55 | #pragma HLS pipeline II=1 56 | if (i < remain) { 57 | inout[local_mask_buf[i].index] = local_inout_buf[i]; 58 | } 59 | } 60 | 61 | // write new_frontier 62 | loop_write_new_frontier: 63 | for (unsigned i = 0; i < batch_new_frontier_length; i++) { 64 | #pragma HLS pipeline II=1 65 | new_frontier[i + 1 + new_frontier_length] = local_new_frontier_buf[i]; 66 | } 67 | new_frontier_length += batch_new_frontier_length; 68 | 69 | // update progress 70 | remain -= BATCH_SIZE; 71 | } 72 | 73 | // attach head to new_frontier 74 | IDX_VAL_T new_frontier_head; 75 | new_frontier_head.index = new_frontier_length; 76 | new_frontier_head.val = 0; 77 | new_frontier[0] = new_frontier_head; 78 | } 79 | -------------------------------------------------------------------------------- /graphlily/hw/kernel_assign_vector_sparse_no_new_frontier_impl.h: -------------------------------------------------------------------------------- 1 | #include "./overlay.h" 2 | 3 | 4 | void kernel_assign_vector_sparse_no_new_frontier( 5 | const IDX_VAL_T *mask, // The sparse mask vector. The index field of the first element is the length. 6 | VAL_T *inout, // The inout vector. 7 | VAL_T val // The value to be assigned to the inout vector. 
8 | ) { 9 | // local buffer 10 | VAL_T local_inout_buf[BATCH_SIZE]; 11 | IDX_VAL_T local_mask_buf[BATCH_SIZE]; 12 | #pragma HLS DATA_PACK variable=local_mask_buf 13 | 14 | IDX_T length = mask[0].index; 15 | unsigned num_batches = (length + BATCH_SIZE - 1) / BATCH_SIZE; 16 | unsigned remain = length; 17 | 18 | loop_over_batches: 19 | for (unsigned batch_cnt = 0; batch_cnt < num_batches; batch_cnt++) { 20 | #pragma HLS pipeline off 21 | 22 | // read stage 23 | loop_read_inout_val: 24 | for (unsigned i = 0; i < BATCH_SIZE; i++) { 25 | #pragma HLS pipeline II=1 26 | if (i < remain) { 27 | IDX_VAL_T tmp_mask = mask[i + 1 + batch_cnt * BATCH_SIZE]; 28 | local_mask_buf[i].index = tmp_mask.index; 29 | local_mask_buf[i].val = tmp_mask.val; 30 | local_inout_buf[i] = inout[tmp_mask.index]; 31 | } 32 | } 33 | 34 | // process stage 35 | loop_process: 36 | for (unsigned i = 0; i < BATCH_SIZE; i++) { 37 | #pragma HLS pipeline II=1 38 | if (i < remain) { 39 | local_inout_buf[i] = val; 40 | } 41 | } 42 | 43 | // write inout 44 | loop_write_inout_val: 45 | for (unsigned i = 0; i < BATCH_SIZE; i++) { 46 | #pragma HLS pipeline II=1 47 | if (i < remain) { 48 | inout[local_mask_buf[i].index] = local_inout_buf[i]; 49 | } 50 | } 51 | 52 | // update progress 53 | remain -= BATCH_SIZE; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /graphlily/hw/math_constants.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_HW_MATH_CONSTANTS_H_ 2 | #define GRAPHLILY_HW_MATH_CONSTANTS_H_ 3 | 4 | #include "ap_fixed.h" 5 | 6 | const unsigned UINT_INF = 0xffffffff; 7 | const ap_ufixed<32, 8, AP_RND, AP_SAT> UFIXED_INF = 255; 8 | const float FLOAT_INF = 999999999; 9 | 10 | #endif // GRAPHLILY_HW_MATH_CONSTANTS_H_ 11 | -------------------------------------------------------------------------------- /graphlily/hw/overlay.h: -------------------------------------------------------------------------------- 1 
// Shared type and constant definitions for the GraphLily overlay kernels.
// The kernel implementation headers in graphlily/hw pull this file in with
// #include "./overlay.h" and use the names declared here (PACK_SIZE,
// BATCH_SIZE, VAL_T, PACKED_VAL_T, IDX_VAL_T, MASK_T, ...).
//
// NOTE(review): the include guard opened below is NOT closed in this file; the
// matching #endif at the bottom is commented out. Together with the note
// "Below kernel configurations will be overwritten by the compiler" this
// suggests the file is a template that a generator completes (presumably the
// code under graphlily/synthesizer) -- confirm before editing the tail.
#ifndef GRAPHLILY_HW_OVERLAY_H_
#define GRAPHLILY_HW_OVERLAY_H_

#include "ap_fixed.h"
#include "./math_constants.h"

// All-ones 32-bit index value. Its use is not visible in this header;
// presumably a sentinel/marker index -- TODO confirm against the kernels.
#define IDX_MARKER 0xffffffff

// Number of elements carried by one packed word (see PACKED_IDX_T and
// PACKED_VAL_T). The dense kernels assert that vector lengths are a multiple
// of PACK_SIZE.
const unsigned PACK_SIZE = 8;
const unsigned NUM_PORT_PER_BANK = 1;
// Evaluates to 8 with the values above.
const unsigned NUM_BANK_PER_HBM_CHANNEL = PACK_SIZE / NUM_PORT_PER_BANK;
// Hand-maintained values: 3 bits and mask 0b111 correspond to 8 banks.
// NOTE(review): nothing here ties them to NUM_BANK_PER_HBM_CHANNEL, so they
// must be updated together if PACK_SIZE or NUM_PORT_PER_BANK changes.
const unsigned BANK_ID_NBITS = 3;
const unsigned BANK_ID_MASK = 7;

const unsigned SPMV_ROW_INTERLEAVE_FACTOR = 1;

// data types
// Index type and a packed word of PACK_SIZE indices.
typedef unsigned IDX_T;
typedef struct {IDX_T data[PACK_SIZE];} PACKED_IDX_T;

// Value type. Exactly one of the three alternatives is active; the others are
// kept commented out. The active one is a 32-bit unsigned fixed-point number
// with 8 integer bits (24 fractional bits), using the AP_RND rounding mode and
// AP_SAT saturation on overflow.
// typedef unsigned VAL_T;
typedef ap_ufixed<32, 8, AP_RND, AP_SAT> VAL_T;
// typedef float VAL_T;
// Packed word of PACK_SIZE values.
typedef struct {VAL_T data[PACK_SIZE];} PACKED_VAL_T;

// One matrix packet: PACK_SIZE indices together with PACK_SIZE values.
typedef struct {
    PACKED_IDX_T indices;
    PACKED_VAL_T vals;
} SPMV_MAT_PKT_T;

// SpMSpV uses the same packet layout as SpMV.
typedef SPMV_MAT_PKT_T SPMSPV_MAT_PKT_T;

// (index, value) pair. The sparse-vector kernels use an array of these in
// which the index field of element 0 holds the number of entries that follow.
typedef struct {IDX_T index; VAL_T val;} IDX_VAL_T;

// semiring
// Operation selector; takes one of the macro values below.
typedef char OP_T;
#define MULADD 0
#define ANDOR  1
#define ADDMIN 2

// Per-semiring "zero" constants. The active AddMinZero must match the active
// VAL_T alternative above (UFIXED_INF for ap_ufixed, from math_constants.h).
// Presumably these are the identities of the semiring's reduction operator
// (e.g. "infinity" for min) -- confirm against the SpMV/SpMSpV kernels.
const VAL_T MulAddZero = 0;
const VAL_T AndOrZero = 0;
// const VAL_T AddMinZero = UINT_INF;
const VAL_T AddMinZero = UFIXED_INF;
// const VAL_T AddMinZero = FLOAT_INF;

// Per-semiring "one" constants; presumably the identities of the semiring's
// combine operator (e.g. 0 for add in ADDMIN) -- confirm against the kernels.
const VAL_T MulAddOne = 1;
const VAL_T AndOrOne = 1;
const VAL_T AddMinOne = 0;

// mask type
// In kernel_assign_vector_dense, WRITETOZERO writes where the mask element is
// zero and WRITETOONE writes where it is non-zero; any other value (including
// NOMASK) is reported there as "Invalid mask type".
typedef char MASK_T;
#define NOMASK      0
#define WRITETOZERO 1
#define WRITETOONE  2

// Kernel configurations
const unsigned FIFO_DEPTH = 64;
// Number of entries handled per batch by the sparse assign kernels; also the
// length of their local batch buffers.
const unsigned BATCH_SIZE = 128;

// Below kernel configurations will be overwritten by the compiler
// const unsigned SPMV_OUT_BUF_LEN =;
// const unsigned SPMSPV_OUT_BUF_LEN =;
// const unsigned VEC_BUF_LEN =;
// #define NUM_HBM_CHANNEL
// #define SPMV_NUM_PE_TOTAL

// #endif  // GRAPHLILY_HW_OVERLAY_H_

-------------------------------------------------------------------------------- /graphlily/hw/shuffle.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_HW_SHUFFLE_H_ 2 | #define GRAPHLILY_HW_SHUFFLE_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "hls_stream.h" 8 | 9 | #include "./util.h" 10 | 11 | 12 | #ifndef __SYNTHESIS__ 13 | bool line_tracing_shuffle_1p = false; 14 | #endif 15 | 16 | // pipeline register 17 | // Its latency is 5. 18 | // Only use this function to wrap signals that travels along stage A! 19 | template 20 | PayloadT pipereg_stage_A (PayloadT in) { 21 | #pragma HLS pipeline II=1 22 | #pragma HLS latency min=5 max=5 23 | return in; 24 | } 25 | 26 | //------------------------------------------------------------ 27 | // arbiters 28 | //------------------------------------------------------------ 29 | 30 | // TODO: Do we really need to expose addr_mask as an argument? Can it be inferred? 31 | // TODO: Is num_in_lane always equal to num_out_lane? 
32 | template 33 | unsigned arbiter_1p( 34 | unsigned in_addr[num_in_lane], 35 | bool in_valid[num_in_lane], 36 | // bool in_granted[num_in_lane], 37 | bool in_resend[num_in_lane], 38 | unsigned xbar_sel[num_out_lane], 39 | bool out_valid[num_out_lane], 40 | unsigned rotate_priority 41 | ) { 42 | #pragma HLS pipeline II=1 43 | #pragma HLS latency min=5 max=5 44 | // #pragma HLS inline 45 | 46 | // bool in_granted[num_in_lane]; 47 | // #pragma HLS array_partition variable=in_granted complete 48 | 49 | // static unsigned in_addr[num_in_lane]; 50 | // #pragma HLS array_partition variable=in_addr complete 51 | // loop_A_extract_addr: 52 | // for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 53 | // in_addr[ILid] = in_payload[ILid].index; 54 | // } 55 | 56 | // prioritized valid and addr 57 | bool arb_p_in_valid[num_in_lane]; 58 | #pragma HLS array_partition variable=arb_p_in_valid complete 59 | unsigned arb_p_in_addr[num_in_lane]; 60 | #pragma HLS array_partition variable=arb_p_in_addr complete 61 | 62 | array_shift_left(in_addr, arb_p_in_addr, rotate_priority); 63 | array_shift_left(in_valid, arb_p_in_valid, rotate_priority); 64 | 65 | loop_A_arbsearch: 66 | for (unsigned OLid = 0; OLid < num_out_lane; OLid++) { 67 | #pragma HLS unroll 68 | bool found = false; 69 | unsigned chosen_port = 0; 70 | 71 | loop_ab_logic_encoder_unroll: 72 | for (unsigned ILid_plus_1 = num_in_lane; ILid_plus_1 > 0; ILid_plus_1--) { 73 | #pragma HLS unroll 74 | if (arb_p_in_valid[ILid_plus_1 - 1] && ((arb_p_in_addr[ILid_plus_1 - 1] & addr_mask) == OLid)) { 75 | chosen_port = ILid_plus_1 - 1; 76 | found = true; 77 | } 78 | } 79 | if (!found) { 80 | out_valid[OLid] = false; 81 | xbar_sel[OLid] = 0; 82 | } else { 83 | out_valid[OLid] = true; 84 | xbar_sel[OLid] = chosen_port; 85 | } 86 | } 87 | 88 | array_cyclic_add(xbar_sel, out_valid, rotate_priority); 89 | 90 | unsigned grant_count = 0; 91 | loop_A_grant: 92 | for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 93 | #pragma HLS unroll 
94 | unsigned requested_olid = in_addr[ILid] & addr_mask; 95 | bool in_granted = (in_valid[ILid] 96 | && out_valid[requested_olid] 97 | && (xbar_sel[requested_olid] == ILid)); 98 | in_resend[ILid] = in_valid[ILid] && !in_granted; 99 | if (in_granted) grant_count++; 100 | } 101 | 102 | // loop_A_resend: 103 | // for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 104 | // #pragma HLS unroll 105 | // // resend path 106 | // in_resend[ILid] = in_valid[ILid] && !in_granted[ILid]; 107 | // } 108 | 109 | // loop_A_pass: 110 | // for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 111 | // out_payload[ILid] = in_payload[ILid]; 112 | // } 113 | 114 | return grant_count; 115 | } 116 | 117 | 118 | /* shuffler-1p 119 | * 1 downstream entitiy could process 1 payload per cycle 120 | * type PayloadT must be a struct of 2 fields: 121 | * unsigned index 122 | * data 123 | */ 124 | template 126 | void shuffler_1p( 127 | // fifos 128 | hls::stream input_lanes[num_in_lane], 129 | hls::stream output_lanes[num_out_lane], 130 | // total number of payloads (avaliable after all payloads are put into the input lanes) 131 | hls::stream &num_payloads_in, 132 | // all outputs are in the output lanes 133 | hls::stream &num_payloads_out 134 | ) { 135 | // pipeline control variables 136 | bool prev_finish = false; 137 | unsigned payload_cnt = 0; 138 | unsigned num_granted_A = 0; 139 | unsigned num_granted_C = 0; 140 | unsigned process_cnt = 0; 141 | 142 | // pipeline data registers before arbiter 143 | PayloadT payload_F[num_in_lane]; 144 | PayloadT payload_A[num_in_lane]; 145 | unsigned addr_A[num_in_lane]; 146 | #pragma HLS array_partition variable=payload_F complete 147 | #pragma HLS array_partition variable=payload_A complete 148 | #pragma HLS array_partition variable=addr_A complete 149 | 150 | // pipeline data registers after arbiter 151 | PayloadT payload_C[num_in_lane]; 152 | #pragma HLS array_partition variable=payload_C complete 153 | 154 | // pipeline valid registers before 
arbiter 155 | bool valid_F[num_in_lane]; 156 | bool valid_A[num_in_lane]; 157 | #pragma HLS array_partition variable=valid_F complete 158 | #pragma HLS array_partition variable=valid_A complete 159 | 160 | // pipeline valid registers after arbiter 161 | unsigned xbar_sel_C[num_out_lane]; 162 | bool xbar_valid_C[num_out_lane]; 163 | bool valid_C[num_in_lane]; 164 | #pragma HLS array_partition variable=xbar_sel_C complete 165 | #pragma HLS array_partition variable=xbar_valid_C complete 166 | #pragma HLS array_partition variable=valid_C complete 167 | 168 | // resend control 169 | PayloadT payload_resend[num_in_lane]; 170 | bool resend[num_in_lane]; 171 | #pragma HLS data_pack variable=payload_resend 172 | #pragma HLS array_partition variable=payload_resend complete 173 | #pragma HLS array_partition variable=resend complete 174 | 175 | // loop control 176 | bool loop_exit = false; 177 | 178 | // arbiter inputs 179 | unsigned arbiter_in_addr[num_in_lane]; 180 | bool arbiter_in_valid[num_in_lane]; 181 | #pragma HLS array_partition variable=arbiter_in_addr complete 182 | #pragma HLS array_partition variable=arbiter_in_valid complete 183 | 184 | // arbiter outputs 185 | // bool arbiter_in_granted[num_in_lane]; 186 | unsigned xbar_sel_A[num_out_lane]; 187 | bool xbar_valid_A[num_out_lane]; 188 | // #pragma HLS array_partition variable=arbiter_in_granted complete 189 | #pragma HLS array_partition variable=xbar_sel_A complete 190 | #pragma HLS array_partition variable=xbar_valid_A complete 191 | 192 | // arbiter priority rotation 193 | unsigned rotate_priority = 0; 194 | unsigned next_rotate_priority = 0; 195 | 196 | // reset 197 | loop_rst_IL: 198 | for (unsigned i = 0; i < num_in_lane; i++) { 199 | #pragma HLS unroll 200 | 201 | payload_F[i].index = 0; 202 | payload_A[i].index = 0; 203 | addr_A[i] = 0; 204 | payload_C[i].index = 0; 205 | payload_F[i].data = (PayloadValT){0, 0}; 206 | payload_A[i].data = (PayloadValT){0, 0}; 207 | payload_C[i].data = (PayloadValT){0, 0}; 
208 | payload_resend[i].index = 0; 209 | payload_resend[i].data = (PayloadValT){0, 0}; 210 | 211 | valid_A[i] = false; 212 | valid_F[i] = false; 213 | valid_C[i] = false; 214 | 215 | resend[i] = false; 216 | } 217 | 218 | loop_reset_OL: 219 | for (unsigned i = 0; i < num_out_lane; i++) { 220 | xbar_sel_A[i] = 0; 221 | xbar_valid_A[i] = false; 222 | xbar_sel_C[i] = 0; 223 | xbar_valid_C[i] = false; 224 | } 225 | 226 | #ifndef __SYNTHESIS__ 227 | int cnt = 0; 228 | #endif 229 | 230 | loop_shuffle_pipeline: 231 | while (!loop_exit) { 232 | #pragma HLS pipeline II=1 233 | #pragma HLS latency min=7 max=7 234 | #pragma HLS dependence variable=resend inter distance=6 RAW True 235 | #pragma HLS dependence variable=payload_resend inter distance=6 RAW True 236 | #pragma HLS dependence variable=loop_exit inter distance=8 RAW True 237 | 238 | // Fetch stage (F) 239 | loop_F: 240 | for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 241 | #pragma HLS unroll 242 | PayloadT payload; 243 | if (!resend[ILid]) { 244 | valid_F[ILid] = input_lanes[ILid].read_nb(payload); 245 | } else { 246 | payload.data = (PayloadValT){0, 0}; 247 | payload.index = 0; 248 | valid_F[ILid] = true; 249 | } 250 | payload_F[ILid] = resend[ILid] ? 
payload_resend[ILid] : payload; 251 | } 252 | // ------- end of F stage 253 | 254 | // Arbiter stage (A) 255 | loop_A_pass: 256 | for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 257 | #pragma HLS unroll 258 | // prepare arbiter inputs 259 | payload_A[ILid] = payload_F[ILid]; 260 | valid_A[ILid] = valid_F[ILid]; 261 | addr_A[ILid] = payload_F[ILid].index; 262 | arbiter_in_valid[ILid] = valid_F[ILid]; 263 | } 264 | rotate_priority = next_rotate_priority; 265 | // pipeline arbiter, depth = 6 266 | num_granted_A = arbiter_1p( 267 | addr_A, 268 | arbiter_in_valid, 269 | // arbiter_in_granted, 270 | resend, 271 | xbar_sel_A, 272 | xbar_valid_A, 273 | rotate_priority 274 | ); 275 | next_rotate_priority = (rotate_priority + 1) % num_in_lane; 276 | loop_A_fwd: 277 | for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 278 | #pragma HLS unroll 279 | payload_resend[ILid] = pipereg_stage_A(payload_A[ILid]); 280 | // payload_resend[ILid].index = pipereg_stage_A(payload_A[ILid].index); 281 | // payload_resend[ILid].data = pipereg_stage_A(payload_A[ILid].data); 282 | } 283 | // ------- end of A stage 284 | 285 | // crossbar stage (C) 286 | loop_C_pass_il: 287 | for (unsigned ILid = 0; ILid < num_in_lane; ILid++) { 288 | #pragma HLS unroll 289 | payload_C[ILid] = payload_A[ILid]; 290 | valid_C[ILid] = valid_A[ILid]; 291 | } 292 | num_granted_C = num_granted_A; 293 | loop_C_pass_ol: 294 | for (unsigned OLid = 0; OLid < num_in_lane; OLid++) { 295 | #pragma HLS unroll 296 | xbar_sel_C[OLid] = xbar_sel_A[OLid]; 297 | xbar_valid_C[OLid] = xbar_valid_A[OLid]; 298 | } 299 | loop_C_xbar: 300 | for (unsigned OLid = 0; OLid < num_out_lane; OLid++) { 301 | #pragma HLS unroll 302 | if (xbar_valid_C[OLid]) { 303 | if (valid_C[xbar_sel_C[OLid]]) { 304 | output_lanes[OLid].write(payload_C[xbar_sel_C[OLid]]); 305 | } 306 | } 307 | } 308 | // ------- end of C stage 309 | 310 | if (!prev_finish) { prev_finish = num_payloads_in.read_nb(payload_cnt); } 311 | process_cnt += num_granted_C; 312 | 
bool all_processed = (process_cnt == payload_cnt); 313 | loop_exit = all_processed && prev_finish; 314 | } 315 | 316 | num_payloads_out.write(payload_cnt); 317 | } 318 | 319 | #endif // GRAPHLILY_HW_SHUFFLE_H_ 320 | -------------------------------------------------------------------------------- /graphlily/hw/util.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_HW_UTIL_H_ 2 | #define GRAPHLILY_HW_UTIL_H_ 3 | 4 | 5 | template 6 | void array_shift_left(T array[len], T array_dest[len], unsigned rotate) { 7 | #pragma HLS inline 8 | // #pragma HLS latency min=0 max=0 9 | #pragma HLS array_partition variable=array complete 10 | #pragma HLS array_partition variable=array_dest complete 11 | for (unsigned i = 0; i < len; i++) { 12 | #pragma HLS unroll 13 | array_dest[i] = array[(i + rotate) % len]; 14 | } 15 | } 16 | 17 | 18 | template 19 | void array_cyclic_add(T array[len], bool array_valid[len], unsigned inc) { 20 | #pragma HLS inline 21 | // #pragma HLS latency min=0 max=0 22 | #pragma HLS array_partition variable=array complete 23 | for (unsigned i = 0; i < len; i++) { 24 | #pragma HLS unroll 25 | if (array_valid[i]) { 26 | array[i] = (array[i] + inc) % maximum; 27 | } 28 | } 29 | } 30 | 31 | 32 | template 33 | bool array_and_reduction(bool array[len]) { 34 | #pragma HLS inline 35 | #pragma HLS expression_balance 36 | bool result = true; 37 | for (unsigned i = 0; i < len; i++) { 38 | #pragma HLS unroll 39 | result = result && array[i]; 40 | } 41 | return result; 42 | } 43 | 44 | 45 | template 46 | bool array_or_reduction(bool array[len]) { 47 | #pragma HLS inline 48 | #pragma HLS expression_balance 49 | bool result = false; 50 | for (unsigned i = 0; i < len; i++) { 51 | #pragma HLS unroll 52 | result = result || array[i]; 53 | } 54 | return result; 55 | } 56 | 57 | 58 | template 59 | unsigned array_popcount(bool array[len]) { 60 | #pragma HLS pipeline II = 1 61 | #pragma HLS latency min=1 max=1 62 | #pragma HLS 
array_partition variable=array complete 63 | #pragma HLS expression_balance 64 | unsigned cnt = 0; 65 | for (unsigned i = 0; i < len; i++) { 66 | #pragma HLS unroll 67 | if (array[i]) { 68 | cnt++; 69 | } 70 | } 71 | return cnt; 72 | } 73 | 74 | 75 | template 76 | T array_sum(T array[len]) { 77 | #pragma HLS inline 78 | #pragma HLS expression_balance 79 | T result = 0; 80 | for (unsigned i = 0; i < len; i++) { 81 | #pragma HLS unroll 82 | result += array[i]; 83 | } 84 | return result; 85 | } 86 | 87 | 88 | template 89 | T array_max(T array[len]) { 90 | #pragma HLS inline 91 | #pragma HLS expression_balance 92 | T result = 0; 93 | for (unsigned i = 0; i < len; i++) { 94 | #pragma HLS unroll 95 | result = (array[i] > result)? array[i] : result; 96 | } 97 | return result; 98 | } 99 | 100 | 101 | // force a register 102 | template 103 | T HLS_REG(T in) { 104 | #pragma HLS pipeline 105 | #pragma HLS inline off 106 | #pragma HLS interface port=return register 107 | return in; 108 | } 109 | 110 | 111 | // // Cyclic partitioning 112 | // unsigned get_bank_idx(unsigned full_addr) { 113 | // return full_addr & BANK_ID_MASK; 114 | // } 115 | 116 | 117 | // // Cyclic partitioning 118 | // unsigned get_bank_address(unsigned full_addr) { 119 | // return full_addr >> BANK_ID_NBITS; 120 | // } 121 | 122 | #endif // GRAPHLILY_HW_UTIL_H_ 123 | -------------------------------------------------------------------------------- /graphlily/io/data_loader.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_IO_DATA_LOADER_H_ 2 | #define GRAPHLILY_IO_DATA_LOADER_H_ 3 | 4 | #include 5 | #include 6 | 7 | #include "cnpy.h" 8 | 9 | 10 | namespace graphlily { 11 | namespace io { 12 | 13 | //-------------------------------------------------- 14 | // Compressed Sparse Row (CSR) format support 15 | //-------------------------------------------------- 16 | 17 | // Data structure for csr matrix. 18 | template 19 | struct CSRMatrix { 20 | /*! 
\brief The number of rows of the sparse matrix */ 21 | uint32_t num_rows; 22 | /*! \brief The number of columns of the sparse matrix */ 23 | uint32_t num_cols; 24 | /*! \brief The non-zero data of the sparse matrix */ 25 | std::vector adj_data; 26 | /*! \brief The column indices of the sparse matrix */ 27 | std::vector adj_indices; 28 | /*! \brief The index pointers of the sparse matrix */ 29 | std::vector adj_indptr; 30 | }; 31 | 32 | 33 | // Create a csr matrix from raw input. 34 | template 35 | CSRMatrix create_csr_matrix(uint32_t num_rows, 36 | uint32_t num_cols, 37 | std::vector const &adj_data, 38 | std::vector const &adj_indices, 39 | std::vector const &adj_indptr) { 40 | CSRMatrix csr_matrix; 41 | csr_matrix.num_rows = num_rows; 42 | csr_matrix.num_cols = num_cols; 43 | csr_matrix.adj_data = adj_data; 44 | csr_matrix.adj_indices = adj_indices; 45 | csr_matrix.adj_indptr = adj_indptr; 46 | return csr_matrix; 47 | } 48 | 49 | 50 | // Load a csr matrix from a scipy sparse npz file. The sparse matrix should have float data type. 
51 | CSRMatrix load_csr_matrix_from_float_npz(std::string csr_float_npz_path) { 52 | CSRMatrix csr_matrix; 53 | cnpy::npz_t npz = cnpy::npz_load(csr_float_npz_path); 54 | cnpy::NpyArray npy_shape = npz["shape"]; 55 | uint32_t num_rows = npy_shape.data()[0]; 56 | uint32_t num_cols = npy_shape.data()[2]; 57 | csr_matrix.num_rows = num_rows; 58 | csr_matrix.num_cols = num_cols; 59 | cnpy::NpyArray npy_data = npz["data"]; 60 | uint32_t nnz = npy_data.shape[0]; 61 | cnpy::NpyArray npy_indices = npz["indices"]; 62 | cnpy::NpyArray npy_indptr = npz["indptr"]; 63 | csr_matrix.adj_data.insert(csr_matrix.adj_data.begin(), &npy_data.data()[0], 64 | &npy_data.data()[nnz]); 65 | csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin(), &npy_indices.data()[0], 66 | &npy_indices.data()[nnz]); 67 | csr_matrix.adj_indptr.insert(csr_matrix.adj_indptr.begin(), &npy_indptr.data()[0], 68 | &npy_indptr.data()[num_rows + 1]); 69 | return csr_matrix; 70 | } 71 | 72 | 73 | // Convert a float csr matrix to another data type. 74 | // TODO: does ap_int make formatting slower than float? 75 | template 76 | CSRMatrix csr_matrix_convert_from_float(CSRMatrix const &in) { 77 | CSRMatrix out; 78 | out.num_rows = in.num_rows; 79 | out.num_cols = in.num_cols; 80 | std::copy(in.adj_data.begin(), in.adj_data.end(), std::back_inserter(out.adj_data)); 81 | out.adj_indices = in.adj_indices; 82 | out.adj_indptr = in.adj_indptr; 83 | return out; 84 | } 85 | 86 | 87 | //-------------------------------------------------- 88 | // Compressed Sparse Colunm (CSC) format support 89 | //-------------------------------------------------- 90 | 91 | // Data structure for csc matrix. 92 | template 93 | struct CSCMatrix { 94 | /*! \brief The number of rows of the sparse matrix */ 95 | uint32_t num_rows; 96 | /*! \brief The number of columns of the sparse matrix */ 97 | uint32_t num_cols; 98 | /*! \brief The non-zero data of the sparse matrix */ 99 | std::vector adj_data; 100 | /*! 
\brief The row indices of the sparse matrix */ 101 | std::vector adj_indices; 102 | /*! \brief The index pointers of the sparse matrix */ 103 | std::vector adj_indptr; 104 | }; 105 | 106 | 107 | // Convert csr to csc. 108 | template 109 | CSCMatrix csr2csc(CSRMatrix const &csr_matrix) { 110 | CSCMatrix csc_matrix; 111 | csc_matrix.num_rows = csr_matrix.num_rows; 112 | csc_matrix.num_cols = csr_matrix.num_cols; 113 | csc_matrix.adj_data = std::vector(csr_matrix.adj_data.size()); 114 | csc_matrix.adj_indices = std::vector(csr_matrix.adj_indices.size()); 115 | csc_matrix.adj_indptr = std::vector(csc_matrix.num_cols + 1); 116 | // Convert adj_indptr 117 | uint32_t nnz = csr_matrix.adj_indptr[csr_matrix.num_rows]; 118 | std::vector nnz_each_col(csc_matrix.num_cols); 119 | std::fill(nnz_each_col.begin(), nnz_each_col.end(), 0); 120 | for (size_t n = 0; n < nnz; n++) { 121 | nnz_each_col[csr_matrix.adj_indices[n]]++; 122 | } 123 | csc_matrix.adj_indptr[0] = 0; 124 | for (size_t col_idx = 0; col_idx < csc_matrix.num_cols; col_idx++) { 125 | csc_matrix.adj_indptr[col_idx + 1] = csc_matrix.adj_indptr[col_idx] + nnz_each_col[col_idx]; 126 | } 127 | assert(csc_matrix.adj_indptr[csc_matrix.num_cols] == nnz); 128 | // Convert adj_data and adj_indices 129 | std::vector nnz_consumed_each_col(csc_matrix.num_cols); 130 | std::fill(nnz_consumed_each_col.begin(), nnz_consumed_each_col.end(), 0); 131 | for (size_t row_idx = 0; row_idx < csr_matrix.num_rows; row_idx++){ 132 | for (size_t i = csr_matrix.adj_indptr[row_idx]; i < csr_matrix.adj_indptr[row_idx + 1]; i++){ 133 | uint32_t col_idx = csr_matrix.adj_indices[i]; 134 | uint32_t dest = csc_matrix.adj_indptr[col_idx] + nnz_consumed_each_col[col_idx]; 135 | csc_matrix.adj_indices[dest] = row_idx; 136 | csc_matrix.adj_data[dest] = csr_matrix.adj_data[i]; 137 | nnz_consumed_each_col[col_idx]++; 138 | } 139 | } 140 | for (size_t col_idx = 0; col_idx < csc_matrix.num_cols; col_idx++) { 141 | assert(nnz_consumed_each_col[col_idx] == 
nnz_each_col[col_idx]); 142 | } 143 | return csc_matrix; 144 | } 145 | 146 | 147 | // Convert a float csc matrix to another data type. 148 | template 149 | CSCMatrix csc_matrix_convert_from_float(CSCMatrix const &in) { 150 | CSCMatrix out; 151 | out.num_rows = in.num_rows; 152 | out.num_cols = in.num_cols; 153 | std::copy(in.adj_data.begin(), in.adj_data.end(), std::back_inserter(out.adj_data)); 154 | out.adj_indices = in.adj_indices; 155 | out.adj_indptr = in.adj_indptr; 156 | return out; 157 | } 158 | 159 | } // namespace io 160 | } // namespace graphlily 161 | 162 | #endif // GRAPHLILY_IO_DATA_LOADER_H_ 163 | -------------------------------------------------------------------------------- /graphlily/module/add_scalar_vector_dense_module.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_EWISE_ADD_MODULE_H_ 2 | #define GRAPHLILY_EWISE_ADD_MODULE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "xcl2.hpp" 10 | 11 | #include "graphlily/global.h" 12 | #include "graphlily/module/base_module.h" 13 | 14 | 15 | namespace graphlily { 16 | namespace module { 17 | 18 | template 19 | class eWiseAddModule : public BaseModule { 20 | private: 21 | using packed_val_t = struct {vector_data_t data[graphlily::pack_size];}; 22 | using aligned_dense_vec_t = std::vector>; 23 | 24 | /*! \brief Internal copy of the input vector */ 25 | aligned_dense_vec_t in_; 26 | /*! \brief Internal copy of the output vector */ 27 | aligned_dense_vec_t out_; 28 | 29 | public: 30 | // Device buffers 31 | cl::Buffer in_buf; 32 | cl::Buffer out_buf; 33 | 34 | public: 35 | eWiseAddModule() : BaseModule("overlay") {} 36 | 37 | /*Overlay argument list: 38 | * (H = num_hbm_channels) 39 | * Index Argument used in this module? 
40 | * 0 ~ H-1 matrix for spmv n 41 | * H+0 vector for spmv y 42 | * H+1 mask for spmv (read port) n 43 | * H+2 mask for spmv (write port) n 44 | * H+3 output for spmv y 45 | * 46 | * H+4 ~ +6 matrix for spmspv n 47 | * H+7 vector for spmspv n 48 | * H+8 mask for spmspv n 49 | * H+9 output for spmspv n 50 | * 51 | * H+10 # of rows n 52 | * H+11 # of columns n 53 | * 54 | * H+12 operation type n 55 | * H+13 mask type n 56 | * 57 | * H+14 overlay mode select y 58 | * 59 | * H+15 apply vector length y 60 | * H+16 input value for assign y 61 | */ 62 | void set_unused_args() override { 63 | // Set unused arguments for SpMV 64 | for (uint32_t i = 0; i < graphlily::num_hbm_channels; i++) { 65 | this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4)); 66 | } 67 | this->kernel_.setArg(graphlily::num_hbm_channels + 1, cl::Buffer(this->context_, 0, 4)); 68 | this->kernel_.setArg(graphlily::num_hbm_channels + 2, cl::Buffer(this->context_, 0, 4)); 69 | // Set unused arguments for SpMSpV 70 | for (uint32_t i = graphlily::num_hbm_channels + 4; i <= graphlily::num_hbm_channels + 9; i++) { 71 | this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4)); 72 | } 73 | // Set unused scalar arguments 74 | this->kernel_.setArg(graphlily::num_hbm_channels + 10, (unsigned)NULL); 75 | this->kernel_.setArg(graphlily::num_hbm_channels + 11, (unsigned)NULL); 76 | this->kernel_.setArg(graphlily::num_hbm_channels + 12, (char)NULL); 77 | this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)NULL); 78 | } 79 | 80 | void set_mode() override { 81 | this->kernel_.setArg(graphlily::num_hbm_channels + 14, 3); // 3 is kernel_add_scalar_vector_dense 82 | } 83 | 84 | /*! 85 | * \brief Send the input vector from host to device. 86 | */ 87 | void send_in_host_to_device(aligned_dense_vec_t &in); 88 | 89 | /*! 90 | * \brief Allocate the output buffer. 91 | */ 92 | void allocate_out_buf(uint32_t len); 93 | 94 | /*! 95 | * \brief Bind the input buffer to an existing buffer. 
96 | */ 97 | void bind_in_buf(cl::Buffer src_buf) { 98 | this->in_buf = src_buf; 99 | this->kernel_.setArg(graphlily::num_hbm_channels + 3, this->in_buf); 100 | } 101 | 102 | /*! 103 | * \brief Bind the output buffer to an existing buffer. 104 | */ 105 | void bind_out_buf(cl::Buffer src_buf) { 106 | this->out_buf = src_buf; 107 | this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->out_buf); 108 | } 109 | 110 | /*! 111 | * \brief Run the module. 112 | * \param len The length of the in/out vector. 113 | * \param val The value to be added. 114 | */ 115 | void run(uint32_t len, vector_data_t val); 116 | 117 | /*! 118 | * \brief Send the output vector from device to host. 119 | * \return The output vector. 120 | */ 121 | aligned_dense_vec_t send_out_device_to_host() { 122 | this->command_queue_.enqueueMigrateMemObjects({this->out_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 123 | this->command_queue_.finish(); 124 | return this->out_; 125 | } 126 | 127 | /*! 128 | * \brief Compute reference results. 129 | * \param in The inout vector. 130 | * \param len The length of the mask/inout vector. 131 | * \param val The value to be assigned to the inout vector. 132 | * \return The output vector. 
133 | */ 134 | graphlily::aligned_dense_float_vec_t 135 | compute_reference_results(graphlily::aligned_dense_float_vec_t const &in, 136 | uint32_t len, 137 | float val); 138 | }; 139 | 140 | 141 | template 142 | void eWiseAddModule::send_in_host_to_device(aligned_dense_vec_t &in) { 143 | this->in_.assign(in.begin(), in.end()); 144 | cl_mem_ext_ptr_t in_ext; 145 | in_ext.obj = this->in_.data(); 146 | in_ext.param = 0; 147 | in_ext.flags = graphlily::HBM[22]; 148 | cl_int err; 149 | OCL_CHECK(err, this->in_buf = cl::Buffer(this->context_, 150 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 151 | sizeof(vector_data_t) * this->in_.size(), 152 | &in_ext, 153 | &err)); 154 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 3, this->in_buf)); 155 | OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->in_buf}, 0)); 156 | this->command_queue_.finish(); 157 | } 158 | 159 | 160 | template 161 | void eWiseAddModule::allocate_out_buf(uint32_t len) { 162 | this->out_.resize(len); 163 | cl_mem_ext_ptr_t out_ext; 164 | out_ext.obj = this->out_.data(); 165 | out_ext.param = 0; 166 | out_ext.flags = graphlily::HBM[20]; 167 | cl_int err; 168 | OCL_CHECK(err, this->out_buf = cl::Buffer(this->context_, 169 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 170 | sizeof(vector_data_t) * this->out_.size(), 171 | &out_ext, 172 | &err)); 173 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->out_buf)); 174 | OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->out_buf}, 0)); 175 | this->command_queue_.finish(); 176 | } 177 | 178 | 179 | template 180 | void eWiseAddModule::run(uint32_t len, vector_data_t val) { 181 | cl_int err; 182 | // TODO: is the overhead of setArg and enqueueTask large at run time? 
183 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 15, len)); 184 | // To avoid runtime error of invalid scalar argument size 185 | if (!(std::is_same::value || std::is_same::value)) { 186 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, 8, (void*)&val)); 187 | } else { 188 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, val)); 189 | } 190 | OCL_CHECK(err, err = this->command_queue_.enqueueTask(this->kernel_)); 191 | this->command_queue_.finish(); 192 | } 193 | 194 | 195 | template graphlily::aligned_dense_float_vec_t 196 | eWiseAddModule::compute_reference_results(graphlily::aligned_dense_float_vec_t const &in, 197 | uint32_t len, 198 | float val) { 199 | graphlily::aligned_dense_float_vec_t out(len); 200 | for (uint32_t i = 0; i < len; i++) { 201 | out[i] = in[i] + val; 202 | } 203 | return out; 204 | } 205 | 206 | } // namespace module 207 | } // namespace graphlily 208 | 209 | #endif // GRAPHLILY_EWISE_ADD_MODULE_H_ 210 | -------------------------------------------------------------------------------- /graphlily/module/assign_vector_dense_module.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_ASSIGN_VECTOR_DENSE_MODULE_H_ 2 | #define GRAPHLILY_ASSIGN_VECTOR_DENSE_MODULE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "xcl2.hpp" 10 | 11 | #include "graphlily/global.h" 12 | #include "graphlily/module/base_module.h" 13 | 14 | 15 | namespace graphlily { 16 | namespace module { 17 | 18 | template 19 | class AssignVectorDenseModule : public BaseModule { 20 | private: 21 | using packed_val_t = struct {vector_data_t data[graphlily::pack_size];}; 22 | using aligned_dense_vec_t = std::vector>; 23 | 24 | /*! \brief The mask type */ 25 | graphlily::MaskType mask_type_; 26 | /*! \brief Internal copy of mask */ 27 | aligned_dense_vec_t mask_; 28 | /*! 
\brief Internal copy of inout */ 29 | aligned_dense_vec_t inout_; 30 | 31 | public: 32 | // Device buffers 33 | cl::Buffer mask_buf; 34 | cl::Buffer inout_buf; 35 | 36 | public: 37 | AssignVectorDenseModule() : BaseModule("overlay") {} 38 | 39 | /*Overlay argument list: 40 | * (H = num_hbm_channels) 41 | * Index Argument used in this module? 42 | * 0 ~ H-1 matrix for spmv n 43 | * H+0 vector for spmv y 44 | * H+1 mask for spmv (read port) y 45 | * H+2 mask for spmv (write port) y 46 | * H+3 output for spmv n 47 | * 48 | * H+4 ~ +6 matrix for spmspv n 49 | * H+7 vector for spmspv n 50 | * H+8 mask for spmspv n 51 | * H+9 output for spmspv n 52 | * 53 | * H+10 # of rows n 54 | * H+11 # of columns n 55 | * 56 | * H+12 operation type n 57 | * H+13 mask type y 58 | * 59 | * H+14 overlay mode select y 60 | * 61 | * H+15 apply vector length y 62 | * H+16 input value for assign y 63 | */ 64 | void set_unused_args() override { 65 | // Set unused arguments for SpMV 66 | for (uint32_t i = 0; i < graphlily::num_hbm_channels; i++) { 67 | this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4)); 68 | } 69 | this->kernel_.setArg(graphlily::num_hbm_channels + 3, cl::Buffer(this->context_, 0, 4)); 70 | // Set unused arguments for SpMSpV 71 | for (uint32_t i = graphlily::num_hbm_channels + 4; i <= graphlily::num_hbm_channels + 9; i++) { 72 | this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4)); 73 | } 74 | // Set unused scalar arguments 75 | this->kernel_.setArg(graphlily::num_hbm_channels + 10, (unsigned)NULL); 76 | this->kernel_.setArg(graphlily::num_hbm_channels + 11, (unsigned)NULL); 77 | this->kernel_.setArg(graphlily::num_hbm_channels + 12, (char)NULL); 78 | } 79 | 80 | void set_mode() override { 81 | this->kernel_.setArg(graphlily::num_hbm_channels + 14, 4);; // 4 is kernel_assign_vector_dense 82 | } 83 | 84 | /*! 85 | * \brief Set the mask type. 86 | * \param mask_type The mask type. 
87 | */ 88 | void set_mask_type(graphlily::MaskType mask_type) { 89 | if (mask_type == graphlily::kNoMask) { 90 | std::cerr << "Please set the mask type" << std::endl; 91 | exit(EXIT_FAILURE); 92 | } else { 93 | this->mask_type_ = mask_type; 94 | } 95 | } 96 | 97 | /*! 98 | * \brief Send the mask from host to device. 99 | */ 100 | void send_mask_host_to_device(aligned_dense_vec_t &mask); 101 | 102 | /*! 103 | * \brief Send the inout from host to device. 104 | */ 105 | void send_inout_host_to_device(aligned_dense_vec_t &inout); 106 | 107 | /*! 108 | * \brief Bind the mask buffer to an existing buffer. 109 | */ 110 | void bind_mask_buf(cl::Buffer src_buf) { 111 | this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)this->mask_type_); 112 | this->mask_buf = src_buf; 113 | this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->mask_buf); 114 | } 115 | 116 | /*! 117 | * \brief Bind the inout buffer to an existing buffer. 118 | */ 119 | void bind_inout_buf(cl::Buffer src_buf) { 120 | this->inout_buf = src_buf; 121 | // set both read and write ports 122 | this->kernel_.setArg(graphlily::num_hbm_channels + 1, this->inout_buf); 123 | this->kernel_.setArg(graphlily::num_hbm_channels + 2, this->inout_buf); 124 | } 125 | 126 | /*! 127 | * \brief Run the module. 128 | * \param len The length of the mask/inout vector. 129 | * \param val The value to be assigned to the inout vector. 130 | */ 131 | void run(uint32_t len, vector_data_t val); 132 | 133 | /*! 134 | * \brief Send the mask from device to host. 135 | * \return The mask. 136 | */ 137 | aligned_dense_vec_t send_mask_device_to_host() { 138 | this->command_queue_.enqueueMigrateMemObjects({this->mask_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 139 | this->command_queue_.finish(); 140 | return this->mask_; 141 | } 142 | 143 | /*! 144 | * \brief Send the inout from device to host. 145 | * \return The inout. 
146 | */ 147 | aligned_dense_vec_t send_inout_device_to_host() { 148 | this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 149 | this->command_queue_.finish(); 150 | return this->inout_; 151 | } 152 | 153 | /*! 154 | * \brief Compute reference results. 155 | * \param mask The mask vector. 156 | * \param inout The inout vector. 157 | * \param len The length of the mask/inout vector. 158 | * \param val The value to be assigned to the inout vector. 159 | */ 160 | void compute_reference_results(graphlily::aligned_dense_float_vec_t &mask, 161 | graphlily::aligned_dense_float_vec_t &inout, 162 | uint32_t len, 163 | float val); 164 | }; 165 | 166 | 167 | template 168 | void AssignVectorDenseModule::send_mask_host_to_device(aligned_dense_vec_t &mask) { 169 | this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)this->mask_type_); 170 | this->mask_.assign(mask.begin(), mask.end()); 171 | cl_mem_ext_ptr_t mask_ext; 172 | mask_ext.obj = this->mask_.data(); 173 | mask_ext.param = 0; 174 | mask_ext.flags = graphlily::HBM[20]; 175 | cl_int err; 176 | OCL_CHECK(err, this->mask_buf = cl::Buffer(this->context_, 177 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 178 | sizeof(vector_data_t) * this->mask_.size(), 179 | &mask_ext, 180 | &err)); 181 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->mask_buf)); 182 | OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->mask_buf}, 0)); 183 | this->command_queue_.finish(); 184 | } 185 | 186 | 187 | template 188 | void AssignVectorDenseModule::send_inout_host_to_device(aligned_dense_vec_t &inout) { 189 | this->inout_.assign(inout.begin(), inout.end()); 190 | cl_mem_ext_ptr_t inout_ext; 191 | inout_ext.obj = this->inout_.data(); 192 | inout_ext.param = 0; 193 | inout_ext.flags = graphlily::HBM[21]; 194 | cl_int err; 195 | OCL_CHECK(err, this->inout_buf = cl::Buffer(this->context_, 196 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 197 | 
sizeof(vector_data_t) * this->inout_.size(), 198 | &inout_ext, 199 | &err)); 200 | // set both read and write ports 201 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 1, this->inout_buf)); 202 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 2, this->inout_buf)); 203 | OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, 0)); 204 | this->command_queue_.finish(); 205 | } 206 | 207 | 208 | template 209 | void AssignVectorDenseModule::run(uint32_t len, vector_data_t val) { 210 | cl_int err; 211 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 15, len)); 212 | // To avoid runtime error of invalid scalar argument size 213 | if (!(std::is_same::value || std::is_same::value)) { 214 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, 8, (void*)&val)); 215 | } else { 216 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, val)); 217 | } 218 | OCL_CHECK(err, err = this->command_queue_.enqueueTask(this->kernel_)); 219 | this->command_queue_.finish(); 220 | } 221 | 222 | 223 | template 224 | void AssignVectorDenseModule::compute_reference_results( 225 | graphlily::aligned_dense_float_vec_t &mask, 226 | graphlily::aligned_dense_float_vec_t &inout, 227 | uint32_t len, 228 | float val 229 | ) { 230 | if (this->mask_type_ == graphlily::kMaskWriteToZero) { 231 | for (size_t i = 0; i < len; i++) { 232 | if (mask[i] == 0) { 233 | inout[i] = val; 234 | } 235 | } 236 | } else if (this->mask_type_ == graphlily::kMaskWriteToOne) { 237 | for (size_t i = 0; i < len; i++) { 238 | if (mask[i] != 0) { 239 | inout[i] = val; 240 | } 241 | } 242 | } else { 243 | std::cout << "Invalid mask type" << std::endl; 244 | exit(EXIT_FAILURE); 245 | } 246 | } 247 | 248 | } // namespace module 249 | } // namespace graphlily 250 | 251 | #endif // GRAPHLILY_ASSIGN_VECTOR_DENSE_MODULE_H_ 252 | 
-------------------------------------------------------------------------------- /graphlily/module/assign_vector_sparse_module.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_ASSIGN_VECTOR_SPARSE_MODULE_H_ 2 | #define GRAPHLILY_ASSIGN_VECTOR_SPARSE_MODULE_H_ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "xcl2.hpp" 10 | 11 | #include "graphlily/global.h" 12 | #include "graphlily/module/base_module.h" 13 | 14 | 15 | namespace graphlily { 16 | namespace module { 17 | 18 | template 19 | class AssignVectorSparseModule : public BaseModule { 20 | private: 21 | using aligned_mask_t = std::vector>; 22 | using aligned_dense_vec_t = std::vector>; 23 | 24 | /*! \brief Generate new frontier (used in SSSP) or not (used in BFS) */ 25 | bool generate_new_frontier_; 26 | /*! \brief Internal copy of mask */ 27 | aligned_mask_t mask_; 28 | /*! \brief Internal copy of inout */ 29 | aligned_dense_vec_t inout_; 30 | /*! \brief Internal copy of new_frontier */ 31 | aligned_mask_t new_frontier_; 32 | 33 | public: 34 | // Device buffers 35 | cl::Buffer mask_buf; 36 | cl::Buffer inout_buf; 37 | cl::Buffer new_frontier_buf; 38 | 39 | public: 40 | AssignVectorSparseModule(bool generate_new_frontier) : BaseModule("overlay") { 41 | this->generate_new_frontier_ = generate_new_frontier; 42 | } 43 | 44 | /*Overlay argument list: 45 | * (H = num_hbm_channels) 46 | * Index Argument used in this module? 
47 | * 0 ~ H-1 matrix for spmv n 48 | * H+0 vector for spmv n 49 | * H+1 mask for spmv (read port) n 50 | * H+2 mask for spmv (write port) n 51 | * H+3 output for spmv n 52 | * 53 | * H+4 ~ +6 matrix for spmspv n 54 | * H+7 vector for spmspv y 55 | * H+8 mask for spmspv y 56 | * H+9 output for spmspv n 57 | * 58 | * H+10 # of rows n 59 | * H+11 # of columns n 60 | * 61 | * H+12 operation type n 62 | * H+13 mask type n 63 | * 64 | * H+14 overlay mode select y 65 | * 66 | * H+15 apply vector length n 67 | * H+16 input value for assign y 68 | */ 69 | void set_unused_args() override { 70 | // Set unused arguments for SpMV 71 | for (uint32_t i = 0; i < graphlily::num_hbm_channels + 4; i++) { 72 | this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4)); 73 | } 74 | // Set unused arguments for SpMSpV 75 | for (uint32_t i = graphlily::num_hbm_channels + 4; i < graphlily::num_hbm_channels + 7; i++) { 76 | this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4)); 77 | } 78 | // Set unused scalar arguments 79 | this->kernel_.setArg(graphlily::num_hbm_channels + 15, (unsigned)NULL); 80 | this->kernel_.setArg(graphlily::num_hbm_channels + 10, (unsigned)NULL); 81 | this->kernel_.setArg(graphlily::num_hbm_channels + 11, (unsigned)NULL); 82 | this->kernel_.setArg(graphlily::num_hbm_channels + 12, (char)NULL); 83 | this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)NULL); 84 | if (!this->generate_new_frontier_) { 85 | this->kernel_.setArg(graphlily::num_hbm_channels + 9, cl::Buffer(this->context_, 0, 4)); 86 | } 87 | if (this->generate_new_frontier_) { 88 | if (!(std::is_same::value || std::is_same::value)) { 89 | this->kernel_.setArg(graphlily::num_hbm_channels + 16, (long long)NULL); 90 | } else { 91 | this->kernel_.setArg(graphlily::num_hbm_channels + 16, (unsigned)NULL); 92 | } 93 | } 94 | } 95 | 96 | void set_mode() override { 97 | if (this->generate_new_frontier_) { 98 | this->kernel_.setArg(graphlily::num_hbm_channels + 14, 6); 99 | } else { 100 | 
this->kernel_.setArg(graphlily::num_hbm_channels + 14, 5); 101 | } 102 | } 103 | 104 | /*! 105 | * \brief Send the mask from host to device. 106 | */ 107 | void send_mask_host_to_device(aligned_mask_t &mask); 108 | 109 | /*! 110 | * \brief Send the inout from host to device. 111 | */ 112 | void send_inout_host_to_device(aligned_dense_vec_t &inout); 113 | 114 | /*! 115 | * \brief Bind the mask buffer to an existing buffer. 116 | */ 117 | void bind_mask_buf(cl::Buffer src_buf) { 118 | this->mask_buf = src_buf; 119 | if (this->generate_new_frontier_) { 120 | this->kernel_.setArg(graphlily::num_hbm_channels + 9, this->mask_buf); 121 | } else { 122 | this->kernel_.setArg(graphlily::num_hbm_channels + 7, this->mask_buf); 123 | } 124 | } 125 | 126 | /*! 127 | * \brief Bind the inout buffer to an existing buffer. 128 | */ 129 | void bind_inout_buf(cl::Buffer src_buf) { 130 | this->inout_buf = src_buf; 131 | this->kernel_.setArg(graphlily::num_hbm_channels + 8, this->inout_buf); 132 | } 133 | 134 | /*! 135 | * \brief Bind the new_frontier buffer to an existing buffer. 136 | */ 137 | void bind_new_frontier_buf(cl::Buffer src_buf) { 138 | if (!this->generate_new_frontier_) { 139 | std::cout << "[ERROR]: this->generate_new_frontier_ should be true" << std::endl; 140 | exit(EXIT_FAILURE); 141 | } 142 | this->new_frontier_buf = src_buf; 143 | this->kernel_.setArg(graphlily::num_hbm_channels + 7, this->new_frontier_buf); 144 | } 145 | 146 | /*! 147 | * \brief Run the module when this->generate_new_frontier_ is false (BFS mode). 148 | * \param val The value to be assigned to the inout vector. 149 | */ 150 | void run(vector_data_t val); 151 | 152 | /*! 153 | * \brief Run the module when this->generate_new_frontier_ is true (SSSP mode). 154 | */ 155 | void run(); 156 | 157 | /*! 158 | * \brief Send the mask from device to host. 159 | * \return The mask. 
160 | */ 161 | aligned_mask_t send_mask_device_to_host() { 162 | this->command_queue_.enqueueMigrateMemObjects({this->mask_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 163 | this->command_queue_.finish(); 164 | return this->mask_; 165 | } 166 | 167 | /*! 168 | * \brief Send the inout from device to host. 169 | * \return The inout. 170 | */ 171 | aligned_dense_vec_t send_inout_device_to_host() { 172 | this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 173 | this->command_queue_.finish(); 174 | return this->inout_; 175 | } 176 | 177 | /*! 178 | * \brief Send the new_frontier from device to host. 179 | * \return The inout. 180 | */ 181 | aligned_mask_t send_new_frontier_device_to_host() { 182 | if (!this->generate_new_frontier_) { 183 | std::cout << "[ERROR]: this->generate_new_frontier_ should be true" << std::endl; 184 | exit(EXIT_FAILURE); 185 | } 186 | this->command_queue_.enqueueMigrateMemObjects({this->new_frontier_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 187 | this->command_queue_.finish(); 188 | return this->new_frontier_; 189 | } 190 | 191 | /*! 192 | * \brief Compute reference results when this->generate_new_frontier_ is false (BFS mode). 193 | * \param mask The mask vector. 194 | * \param inout The inout vector. 195 | * \param val The value to be assigned to the inout vector. 196 | */ 197 | void compute_reference_results(graphlily::aligned_sparse_float_vec_t &mask, 198 | graphlily::aligned_dense_float_vec_t &inout, 199 | float val); 200 | 201 | /*! 202 | * \brief Compute reference results when this->generate_new_frontier_ is true (SSSP mode). 203 | * \param mask The mask vector. 204 | * \param inout The inout vector. 205 | * \param new_frontier The new frontier. 
206 | */ 207 | void compute_reference_results(graphlily::aligned_sparse_float_vec_t &mask, 208 | graphlily::aligned_dense_float_vec_t &inout, 209 | graphlily::aligned_sparse_float_vec_t &new_frontier); 210 | }; 211 | 212 | 213 | template 214 | void AssignVectorSparseModule::send_mask_host_to_device( 215 | aligned_mask_t &mask 216 | ) { 217 | cl_int err; 218 | // handle mask 219 | this->mask_.assign(mask.begin(), mask.end()); 220 | cl_mem_ext_ptr_t mask_ext; 221 | mask_ext.obj = this->mask_.data(); 222 | mask_ext.param = 0; 223 | if (this->generate_new_frontier_) { 224 | mask_ext.flags = graphlily::HBM[22]; 225 | } else { 226 | mask_ext.flags = graphlily::HBM[20]; 227 | } 228 | OCL_CHECK(err, this->mask_buf = cl::Buffer(this->context_, 229 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 230 | sizeof(sparse_vector_data_t) * this->mask_.size(), 231 | &mask_ext, 232 | &err)); 233 | if (this->generate_new_frontier_) { 234 | this->kernel_.setArg(graphlily::num_hbm_channels + 9, this->mask_buf); 235 | } else { 236 | this->kernel_.setArg(graphlily::num_hbm_channels + 7, this->mask_buf); 237 | } 238 | if (this->generate_new_frontier_) { 239 | // allocate memory for new_frontier 240 | this->new_frontier_.resize(this->mask_.size()); 241 | cl_mem_ext_ptr_t new_frontier_ext; 242 | new_frontier_ext.obj = this->new_frontier_.data(); 243 | new_frontier_ext.param = 0; 244 | new_frontier_ext.flags = graphlily::HBM[20]; 245 | OCL_CHECK(err, this->new_frontier_buf = cl::Buffer(this->context_, 246 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 247 | sizeof(sparse_vector_data_t) * this->new_frontier_.size(), 248 | &new_frontier_ext, 249 | &err)); 250 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 7, this->new_frontier_buf)); 251 | OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->new_frontier_buf}, 0)); 252 | this->command_queue_.finish(); 253 | } 254 | } 255 | 256 | 257 | template 258 | void 
AssignVectorSparseModule::send_inout_host_to_device( 259 | aligned_dense_vec_t &inout 260 | ) { 261 | this->inout_.assign(inout.begin(), inout.end()); 262 | cl_mem_ext_ptr_t inout_ext; 263 | inout_ext.obj = this->inout_.data(); 264 | inout_ext.param = 0; 265 | inout_ext.flags = graphlily::HBM[21]; 266 | cl_int err; 267 | OCL_CHECK(err, this->inout_buf = cl::Buffer(this->context_, 268 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 269 | sizeof(vector_data_t) * this->inout_.size(), 270 | &inout_ext, 271 | &err)); 272 | OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 8, this->inout_buf)); 273 | OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, 0)); 274 | this->command_queue_.finish(); 275 | } 276 | 277 | 278 | template 279 | void AssignVectorSparseModule::run(vector_data_t val) { 280 | if (this->generate_new_frontier_) { 281 | std::cout << "[ERROR]: this->generate_new_frontier_ should be false" << std::endl; 282 | exit(EXIT_FAILURE); 283 | } 284 | // To avoid runtime error of invalid scalar argument size 285 | if (!(std::is_same::value || std::is_same::value)) { 286 | this->kernel_.setArg(graphlily::num_hbm_channels + 16, 8, (void*)&val); 287 | } else { 288 | this->kernel_.setArg(graphlily::num_hbm_channels + 16, val); 289 | } 290 | this->command_queue_.enqueueTask(this->kernel_); 291 | this->command_queue_.finish(); 292 | } 293 | 294 | 295 | template 296 | void AssignVectorSparseModule::run() { 297 | if (!this->generate_new_frontier_) { 298 | std::cout << "[ERROR]: this->generate_new_frontier_ should be true" << std::endl; 299 | exit(EXIT_FAILURE); 300 | } 301 | this->command_queue_.enqueueTask(this->kernel_); 302 | this->command_queue_.finish(); 303 | } 304 | 305 | 306 | template 307 | void AssignVectorSparseModule::compute_reference_results( 308 | graphlily::aligned_sparse_float_vec_t &mask, 309 | graphlily::aligned_dense_float_vec_t &inout, 310 | float val 311 | ) { 312 | for (size_t i = 0; i < 
mask[0].index; i++) { 313 | inout[mask[i + 1].index] = val; 314 | } 315 | } 316 | 317 | 318 | template 319 | void AssignVectorSparseModule::compute_reference_results( 320 | graphlily::aligned_sparse_float_vec_t &mask, 321 | graphlily::aligned_dense_float_vec_t &inout, 322 | graphlily::aligned_sparse_float_vec_t &new_frontier 323 | ) { 324 | new_frontier.clear(); 325 | for (size_t i = 0; i < mask[0].index; i++) { 326 | if (inout[mask[i + 1].index] > mask[i + 1].val) { 327 | inout[mask[i + 1].index] = mask[i + 1].val; 328 | new_frontier.push_back(mask[i + 1]); 329 | } 330 | } 331 | graphlily::idx_float_t new_frontier_head; 332 | new_frontier_head.index = new_frontier.size(); 333 | new_frontier_head.val = 0; 334 | new_frontier.insert(new_frontier.begin(), new_frontier_head); 335 | } 336 | 337 | } // namespace module 338 | } // namespace graphlily 339 | 340 | #endif // GRAPHLILY_ASSIGN_VECTOR_SPARSE_MODULE_H_ 341 | -------------------------------------------------------------------------------- /graphlily/module/base_module.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_BASE_MODULE_H_ 2 | #define GRAPHLILY_BASE_MODULE_H_ 3 | 4 | #include "graphlily/global.h" 5 | 6 | 7 | namespace graphlily { 8 | namespace module { 9 | 10 | class BaseModule { 11 | protected: 12 | /*! \brief The kernel name */ 13 | std::string kernel_name_; 14 | /*! \brief The target; can be sw_emu, hw_emu, hw */ 15 | std::string target_; 16 | 17 | // OpenCL runtime 18 | cl::Device device_; 19 | cl::Context context_; 20 | cl::Kernel kernel_; 21 | cl::CommandQueue command_queue_; 22 | 23 | public: 24 | BaseModule(std::string kernel_name) { 25 | this->kernel_name_ = kernel_name; 26 | } 27 | 28 | virtual ~BaseModule() { 29 | this->device_ = nullptr; 30 | this->context_ = nullptr; 31 | this->kernel_ = nullptr; 32 | this->command_queue_ = nullptr; 33 | } 34 | 35 | /*! 36 | * \brief Get the kernel name. 37 | * \return The kernel name. 
38 | */ 39 | std::string get_kernel_name() { 40 | return this->kernel_name_; 41 | } 42 | 43 | /*! 44 | * \brief Set the device. 45 | */ 46 | void set_device(cl::Device device) { 47 | this->device_ = device; 48 | } 49 | 50 | /*! 51 | * \brief Set the context. 52 | */ 53 | void set_context(cl::Context context) { 54 | this->context_ = context; 55 | } 56 | 57 | /*! 58 | * \brief Set the kernel. 59 | */ 60 | void set_kernel(cl::Kernel kernel) { 61 | this->kernel_ = kernel; 62 | } 63 | 64 | /*! 65 | * \brief Set the command queue. 66 | */ 67 | void set_command_queue(cl::CommandQueue command_queue) { 68 | this->command_queue_ = command_queue; 69 | } 70 | 71 | /*! 72 | * \brief Set the target. 73 | */ 74 | void set_target(std::string target) { 75 | assert(target == "sw_emu" || target == "hw_emu" || target == "hw"); 76 | this->target_ = target; 77 | } 78 | 79 | /*! 80 | * \brief Copy the contents of a buffer into another buffer without going through the host. 81 | */ 82 | void copy_buffer_device_to_device(cl::Buffer src, cl::Buffer dst, size_t bytes) { 83 | this->command_queue_.enqueueCopyBuffer(src, dst, 0, 0, bytes); 84 | this->command_queue_.finish(); 85 | } 86 | 87 | /*! 88 | * \brief Set unused arguments 89 | */ 90 | virtual void set_unused_args() = 0; 91 | 92 | /*! 93 | * \brief Set the mode. SpMV and SpMSpV are merged into a single kernel; we need to select 94 | * one of them, so called the mode. Similarly, all apply functions are merged into one kernel. 95 | */ 96 | virtual void set_mode() = 0; 97 | 98 | /*! 99 | * \brief Load the xclbin file and set up runtime. 100 | * \param xclbin_file_path The xclbin file path. 
101 | */ 102 | void set_up_runtime(std::string xclbin_file_path); 103 | }; 104 | 105 | 106 | void BaseModule::set_up_runtime(std::string xclbin_file_path) { 107 | cl_int err; 108 | // Set this->device_ and this->context_ 109 | if (this->target_ == "sw_emu" || this->target_ == "hw_emu") { 110 | setenv("XCL_EMULATION_MODE", this->target_.c_str(), true); 111 | } 112 | this->device_ = graphlily::find_device(); 113 | this->context_ = cl::Context(this->device_, NULL, NULL, NULL); 114 | // Set this->kernel_ 115 | auto file_buf = xcl::read_binary_file(xclbin_file_path); 116 | cl::Program::Binaries binaries{{file_buf.data(), file_buf.size()}}; 117 | cl::Program program(this->context_, {this->device_}, binaries, NULL, &err); 118 | if (err != CL_SUCCESS) { 119 | std::cout << "Failed to program device with xclbin file\n"; 120 | } else { 121 | std::cout << "Successfully programmed device with xclbin file\n"; 122 | } 123 | OCL_CHECK(err, this->kernel_ = cl::Kernel(program, this->kernel_name_.c_str(), &err)); 124 | // Set this->command_queue_ 125 | OCL_CHECK(err, this->command_queue_ = cl::CommandQueue(this->context_, 126 | this->device_, 127 | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, 128 | &err)); 129 | // Set unused arguments 130 | this->set_unused_args(); 131 | // Set the mode 132 | this->set_mode(); 133 | } 134 | 135 | } // namespace module 136 | } // namespace graphlily 137 | 138 | #endif // GRAPHLILY_BASE_MODULE_H_ 139 | -------------------------------------------------------------------------------- /graphlily/synthesizer/base_synthesizer.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_BASE_SYNTHESIZER_H_ 2 | #define GRAPHLILY_BASE_SYNTHESIZER_H_ 3 | 4 | #include "graphlily/global.h" 5 | 6 | 7 | namespace graphlily { 8 | namespace synthesizer { 9 | 10 | template 11 | void _generate_makefile_impl(T* t) { 12 | std::string command = "mkdir -p " + graphlily::proj_folder_name; 13 | std::cout << 
command << std::endl; 14 | system(command.c_str()); 15 | std::ofstream makefile(graphlily::proj_folder_name + "/makefile"); 16 | makefile << "TARGET := " << t->target_ << "\n" << std::endl; 17 | makefile << graphlily::makefile_prologue << t->makefile_body_ << graphlily::makefile_epilogue; 18 | makefile.close(); 19 | } 20 | 21 | 22 | template 23 | void _synthesize_impl(T* t) { 24 | std::string command = "mkdir -p " + graphlily::proj_folder_name; 25 | std::cout << command << std::endl; 26 | system(command.c_str()); 27 | t->link_kernel_code(); 28 | t->generate_kernel_header(); 29 | t->generate_kernel_ini(); 30 | t->generate_makefile(); 31 | command = "cd " + graphlily::proj_folder_name + "; " + "make build"; 32 | std::cout << command << std::endl; 33 | system(command.c_str()); 34 | if (t->target_ == "sw_emu" || t->target_ == "hw_emu") { 35 | command = "cp " + graphlily::proj_folder_name + "/emconfig.json " + "."; 36 | std::cout << command << std::endl; 37 | system(command.c_str()); 38 | } 39 | } 40 | 41 | 42 | class BaseSynthesizer { 43 | protected: 44 | /*! \brief The kernel name */ 45 | std::string kernel_name_; 46 | /*! \brief The makefile body */ 47 | std::string makefile_body_; 48 | /*! \brief The target; can be sw_emu, hw_emu, hw */ 49 | std::string target_; 50 | 51 | private: 52 | template friend void _generate_makefile_impl(T* t); 53 | template friend void _synthesize_impl(T* t); 54 | 55 | public: 56 | BaseSynthesizer(std::string kernel_name) { 57 | this->kernel_name_ = kernel_name; 58 | this->makefile_body_ = graphlily::add_kernel_to_makefile(this->kernel_name_); 59 | } 60 | 61 | /*! 62 | * \brief Get the kernel name. 63 | * \return The kernel name. 64 | */ 65 | std::string get_kernel_name() { 66 | return this->kernel_name_; 67 | } 68 | 69 | /*! 70 | * \brief Set the target. 71 | */ 72 | void set_target(std::string target) { 73 | assert(target == "sw_emu" || target == "hw_emu" || target == "hw"); 74 | this->target_ = target; 75 | } 76 | 77 | /*! 
78 | * \brief Generate the kernel header file. 79 | */ 80 | virtual void generate_kernel_header() = 0; 81 | 82 | /*! 83 | * \brief Generate the kernel .ini configuration file. 84 | */ 85 | virtual void generate_kernel_ini() = 0; 86 | 87 | /*! 88 | * \brief Link the kernel cpp file to the proj directory. 89 | */ 90 | virtual void link_kernel_code(); 91 | 92 | /*! 93 | * \brief Generate the Makefile. 94 | */ 95 | virtual void generate_makefile() { 96 | _generate_makefile_impl(this); 97 | } 98 | 99 | /*! 100 | * \brief Synthesize the kernel according to this->target_. 101 | */ 102 | virtual void synthesize() { 103 | _synthesize_impl(this); 104 | } 105 | }; 106 | 107 | 108 | void BaseSynthesizer::link_kernel_code() { 109 | std::string command = "cp " + graphlily::root_path + "/graphlily/hw/" + "*.h" 110 | + " " + graphlily::proj_folder_name + "/"; 111 | std::cout << command << std::endl; 112 | system(command.c_str()); 113 | 114 | command = "cp " + graphlily::root_path + "/graphlily/hw/" + this->kernel_name_ + ".cpp" 115 | + " " + graphlily::proj_folder_name + "/" + this->kernel_name_ + ".cpp"; 116 | std::cout << command << std::endl; 117 | system(command.c_str()); 118 | 119 | command = "cp " + graphlily::root_path + "/graphlily/hw/" + this->kernel_name_ + ".ini" 120 | + " " + graphlily::proj_folder_name + "/" + this->kernel_name_ + ".ini"; 121 | std::cout << command << std::endl; 122 | system(command.c_str()); 123 | } 124 | 125 | } // namespace synthesizer 126 | } // namespace graphlily 127 | 128 | #endif // GRAPHLILY_BASE_SYNTHESIZER_H_ 129 | -------------------------------------------------------------------------------- /graphlily/synthesizer/overlay_synthesizer.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_OVERLAY_SYNTHESIZER_H_ 2 | #define GRAPHLILY_OVERLAY_SYNTHESIZER_H_ 3 | 4 | #include "graphlily/synthesizer/base_synthesizer.h" 5 | 6 | 7 | namespace graphlily { 8 | namespace synthesizer { 9 | 10 | 
class OverlaySynthesizer : public BaseSynthesizer { 11 | private: 12 | // Kernel configuration 13 | uint32_t num_channels_; 14 | uint32_t spmv_out_buf_len_; 15 | uint32_t spmspv_out_buf_len_; 16 | uint32_t vec_buf_len_; 17 | 18 | public: 19 | OverlaySynthesizer(uint32_t num_channels, 20 | uint32_t spmv_out_buf_len, 21 | uint32_t spmspv_out_buf_len, 22 | uint32_t vec_buf_len) : BaseSynthesizer("overlay") { 23 | this->num_channels_ = num_channels; 24 | this->spmv_out_buf_len_ = spmv_out_buf_len; 25 | this->spmspv_out_buf_len_ = spmspv_out_buf_len; 26 | this->vec_buf_len_ = vec_buf_len; 27 | } 28 | 29 | void generate_kernel_header() override; 30 | 31 | void generate_kernel_ini() override; 32 | }; 33 | 34 | 35 | void OverlaySynthesizer::generate_kernel_header() { 36 | std::string command = "mkdir -p " + graphlily::proj_folder_name; 37 | std::cout << command << std::endl; 38 | system(command.c_str()); 39 | std::ofstream header(graphlily::proj_folder_name + "/" + this->kernel_name_ + ".h", std::ios_base::app); 40 | header << "const unsigned SPMV_OUT_BUF_LEN = " << this->spmv_out_buf_len_ << ";" << std::endl; 41 | header << "const unsigned SPMSPV_OUT_BUF_LEN = " << this->spmspv_out_buf_len_ << ";" << std::endl; 42 | header << "const unsigned VEC_BUF_LEN = " << this->vec_buf_len_ << ";" << std::endl; 43 | header << "#define NUM_HBM_CHANNEL " << this->num_channels_ << std::endl; 44 | header << "#define SPMV_NUM_PE_TOTAL " << this->num_channels_ * graphlily::pack_size << std::endl; 45 | header << std::endl; 46 | header << "#endif // GRAPHLILY_HW_OVERLAY_H_" << std::endl; 47 | header.close(); 48 | } 49 | 50 | 51 | void OverlaySynthesizer::generate_kernel_ini() { 52 | std::string command = "mkdir -p " + graphlily::proj_folder_name; 53 | std::cout << command << std::endl; 54 | system(command.c_str()); 55 | std::ofstream ini(graphlily::proj_folder_name + "/" + this->kernel_name_ + ".ini"); 56 | ini << "[connectivity]" << std::endl; 57 | // SpMV 58 | for (size_t hbm_idx = 0; 
hbm_idx < this->num_channels_; hbm_idx++) { 59 | ini << "sp=overlay_1.spmv_channel_" << hbm_idx << "_matrix:HBM[" 60 | << hbm_idx << "]" << std::endl; 61 | } 62 | ini << "sp=overlay_1.spmv_vector:HBM[20]" << std::endl; 63 | ini << "sp=overlay_1.spmv_mask:HBM[21]" << std::endl; 64 | ini << "sp=overlay_1.spmv_mask_w:HBM[21]" << std::endl; 65 | ini << "sp=overlay_1.spmv_out:HBM[22]" << std::endl; 66 | // SpMSpV 67 | ini << "sp=overlay_1.spmspv_matrix:DDR[0]" << std::endl; 68 | ini << "sp=overlay_1.spmspv_matrix_indptr:DDR[0]" << std::endl; 69 | ini << "sp=overlay_1.spmspv_matrix_partptr:DDR[0]" << std::endl; 70 | ini << "sp=overlay_1.spmspv_vector:HBM[20]" << std::endl; 71 | ini << "sp=overlay_1.spmspv_mask:HBM[21]" << std::endl; 72 | ini << "sp=overlay_1.spmspv_out:HBM[22]" << std::endl; 73 | // enable retiming 74 | ini << "[vivado]" << std::endl; 75 | ini << "prop=run.__KERNEL__.{STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS}={-retiming}" << std::endl; 76 | ini.close(); 77 | } 78 | 79 | 80 | } // namespace synthesizer 81 | } // namespace graphlily 82 | 83 | #endif // GRAPHLILY_OVERLAY_SYNTHESIZER_H_ 84 | -------------------------------------------------------------------------------- /tests/Makefile: -------------------------------------------------------------------------------- 1 | BUILD_DIR = ./build 2 | 3 | HOST_ARCH = x86 4 | 5 | CXXFLAGS += -Wall -O3 -g -std=c++11 6 | CXXFLAGS += -I$(GRAPHLILY_ROOT_PATH) 7 | 8 | LDFLAGS += -lrt -lstdc++ 9 | 10 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/xcl2/xcl2.mk 11 | CXXFLAGS += $(xcl2_CXXFLAGS) 12 | LDFLAGS += $(xcl2_LDFLAGS) 13 | 14 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/opencl/opencl.mk 15 | CXXFLAGS += $(opencl_CXXFLAGS) 16 | LDFLAGS += $(opencl_LDFLAGS) 17 | 18 | CXXFLAGS += -I/work/shared/common/project_build/graphblas/software/cnpy 19 | LDFLAGS += -L/work/shared/common/project_build/graphblas/software/cnpy/build -lcnpy 20 | 21 | CXXFLAGS += 
-I/work/shared/common/project_build/graphblas/software/googletest/googletest/include 22 | LDFLAGS += -L/work/shared/common/project_build/graphblas/software/googletest/build/lib -lgtest 23 | 24 | test_io: test_io.cpp $(xcl2_SRCS) 25 | g++ $(CXXFLAGS) test_io.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 26 | mkdir -p $(BUILD_DIR) 27 | mv test_io $(BUILD_DIR)/ 28 | cd $(BUILD_DIR); ./test_io 29 | 30 | test_module_spmv_spmspv: test_module_spmv_spmspv.cpp $(xcl2_SRCS) 31 | g++ $(CXXFLAGS) test_module_spmv_spmspv.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 32 | mkdir -p $(BUILD_DIR) 33 | mv test_module_spmv_spmspv $(BUILD_DIR)/ 34 | cd $(BUILD_DIR); ./test_module_spmv_spmspv 35 | 36 | test_module_apply: test_module_apply.cpp $(xcl2_SRCS) 37 | g++ $(CXXFLAGS) test_module_apply.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 38 | mkdir -p $(BUILD_DIR) 39 | mv test_module_apply $(BUILD_DIR)/ 40 | cd $(BUILD_DIR); ./test_module_apply 41 | 42 | test_app: test_app.cpp $(xcl2_SRCS) 43 | g++ $(CXXFLAGS) test_app.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 44 | mkdir -p $(BUILD_DIR) 45 | mv test_app $(BUILD_DIR)/ 46 | cd $(BUILD_DIR); ./test_app 47 | 48 | test_pe_cluster: test_pe_cluster.cpp $(xcl2_SRCS) 49 | g++ $(CXXFLAGS) test_pe_cluster.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 50 | mkdir -p $(BUILD_DIR) 51 | mv test_pe_cluster $(BUILD_DIR)/ 52 | cd $(BUILD_DIR); ./test_pe_cluster 53 | 54 | test_shuffle: test_shuffle.cpp $(xcl2_SRCS) 55 | g++ $(CXXFLAGS) test_shuffle.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS) 56 | mkdir -p $(BUILD_DIR) 57 | mv test_shuffle $(BUILD_DIR)/ 58 | cd $(BUILD_DIR); ./test_shuffle 59 | 60 | all: test_io test_module_spmv_spmspv test_module_apply test_app test_pe_cluster 61 | 62 | clean: 63 | rm -rf $(BUILD_DIR) 64 | -------------------------------------------------------------------------------- /tests/test_app.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context" 3 | #pragma GCC diagnostic 
ignored "-Wuninitialized" 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" 5 | 6 | 7 | #include "graphlily/synthesizer/overlay_synthesizer.h" 8 | #include "graphlily/app/bfs.h" 9 | #include "graphlily/app/pagerank.h" 10 | #include "graphlily/app/sssp.h" 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | 17 | std::string target = "sw_emu"; 18 | uint32_t spmv_out_buf_len = 1024; 19 | uint32_t spmspv_out_buf_len = 512; 20 | uint32_t vec_buf_len = 256; 21 | 22 | 23 | void clean_proj_folder() { 24 | std::string command = "rm -rf ./" + graphlily::proj_folder_name; 25 | std::cout << command << std::endl; 26 | system(command.c_str()); 27 | } 28 | 29 | 30 | template 31 | void verify(std::vector> &reference_results, 32 | std::vector> &kernel_results) { 33 | ASSERT_EQ(reference_results.size(), kernel_results.size()); 34 | float epsilon = 0.0001; 35 | for (size_t i = 0; i < reference_results.size(); i++) { 36 | ASSERT_TRUE(abs(float(kernel_results[i]) - reference_results[i]) < epsilon); 37 | } 38 | } 39 | 40 | 41 | TEST(Synthesize, NULL) { 42 | graphlily::synthesizer::OverlaySynthesizer synthesizer(graphlily::num_hbm_channels, 43 | spmv_out_buf_len, 44 | spmspv_out_buf_len, 45 | vec_buf_len); 46 | synthesizer.set_target(target); 47 | synthesizer.synthesize(); 48 | } 49 | 50 | 51 | TEST(BFS, PullPush) { 52 | graphlily::app::BFS bfs(graphlily::num_hbm_channels, spmv_out_buf_len, 53 | spmspv_out_buf_len, vec_buf_len); 54 | bfs.set_target(target); 55 | bfs.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." 
+ target + "/fused.xclbin"); 56 | 57 | std::string csr_float_npz_path = "/work/shared/common/project_build/graphblas/" 58 | "data/sparse_matrix_graph/uniform_10K_10_csr_float32.npz"; 59 | bool skip_empty_rows = true; 60 | bfs.load_and_format_matrix(csr_float_npz_path, skip_empty_rows); 61 | bfs.send_matrix_host_to_device(); 62 | 63 | uint32_t source = 0; 64 | uint32_t num_iterations = 10; 65 | 66 | auto reference_results = bfs.compute_reference_results(source, num_iterations); 67 | 68 | // pull push 69 | float threshold = 0.1; 70 | auto kernel_results = bfs.pull_push(source, num_iterations, threshold); 71 | // for (int i = 0; i < 10; i++) { 72 | // std::cout << reference_results[i] << " " << kernel_results[i] << std::endl; 73 | // } 74 | verify(reference_results, kernel_results); 75 | 76 | // pull 77 | kernel_results = bfs.pull(source, num_iterations); 78 | verify(reference_results, kernel_results); 79 | 80 | // push 81 | kernel_results = bfs.push(source, num_iterations); 82 | verify(reference_results, kernel_results); 83 | } 84 | 85 | 86 | TEST(PageRank, Pull) { 87 | graphlily::app::PageRank pagerank(graphlily::num_hbm_channels, spmv_out_buf_len, vec_buf_len); 88 | pagerank.set_target(target); 89 | pagerank.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." 
+ target + "/fused.xclbin"); 90 | 91 | std::string csr_float_npz_path = "/work/shared/common/project_build/graphblas/" 92 | "data/sparse_matrix_graph/uniform_10K_10_csr_float32.npz"; 93 | float damping = 0.9; 94 | bool skip_empty_rows = true; 95 | pagerank.load_and_format_matrix(csr_float_npz_path, damping, skip_empty_rows); 96 | pagerank.send_matrix_host_to_device(); 97 | 98 | uint32_t num_iterations = 10; 99 | auto kernel_results = pagerank.pull(damping, num_iterations); 100 | auto reference_results = pagerank.compute_reference_results(damping, num_iterations); 101 | verify(reference_results, kernel_results); 102 | } 103 | 104 | 105 | TEST(SSSP, PullPush) { 106 | graphlily::app::SSSP sssp(graphlily::num_hbm_channels, spmv_out_buf_len, 107 | spmspv_out_buf_len, vec_buf_len); 108 | sssp.set_target(target); 109 | sssp.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin"); 110 | 111 | std::string csr_float_npz_path = "/work/shared/common/project_build/graphblas/" 112 | "data/sparse_matrix_graph/uniform_10K_10_csr_float32.npz"; 113 | 114 | bool skip_empty_rows = true; 115 | sssp.load_and_format_matrix(csr_float_npz_path, skip_empty_rows); 116 | sssp.send_matrix_host_to_device(); 117 | 118 | uint32_t source = 0; 119 | uint32_t num_iterations = 10; 120 | 121 | auto reference_results = sssp.compute_reference_results(source, num_iterations); 122 | 123 | // pull push 124 | float threshold = 0.1; 125 | auto kernel_results = sssp.pull_push(source, num_iterations, threshold); 126 | verify(reference_results, kernel_results); 127 | 128 | // pull 129 | kernel_results = sssp.pull(source, num_iterations); 130 | verify(reference_results, kernel_results); 131 | 132 | // push 133 | kernel_results = sssp.push(source, num_iterations); 134 | verify(reference_results, kernel_results); 135 | } 136 | 137 | 138 | TEST(CleanOverlay, NULL) { 139 | clean_proj_folder(); 140 | } 141 | 142 | 143 | int main(int argc, char ** argv) { 144 | 
testing::InitGoogleTest(&argc, argv); 145 | return RUN_ALL_TESTS(); 146 | } 147 | 148 | #pragma GCC diagnostic pop 149 | -------------------------------------------------------------------------------- /tests/test_data/create_csr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.sparse 3 | 4 | num_rows = 10 5 | num_cols = 10 6 | nnz_per_row = 1 7 | nnz = nnz_per_row * num_rows 8 | 9 | indptr = np.array([i * nnz_per_row for i in range(num_rows + 1)], dtype='uint32') 10 | # indices = np.array([i * num_cols / nnz_per_row % num_cols for i in range(nnz)], dtype='uint32') 11 | indices = np.array([i for i in range(nnz)], dtype='uint32') 12 | data = np.ones(nnz) 13 | 14 | M = scipy.sparse.csr_matrix((data, indices, indptr), shape=(num_rows, num_cols), dtype=np.float32) 15 | # print(M.toarray()) 16 | scipy.sparse.save_npz("eye_10_csr_float32.npz", M) 17 | -------------------------------------------------------------------------------- /tests/test_data/eye_10_csr_float32.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cornell-zhang/GraphLily/f3438f9d8de0e5fe47b4348a2b5b2bb6c91c76ce/tests/test_data/eye_10_csr_float32.npz -------------------------------------------------------------------------------- /tests/test_data/line_8_csr_float32.npz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cornell-zhang/GraphLily/f3438f9d8de0e5fe47b4348a2b5b2bb6c91c76ce/tests/test_data/line_8_csr_float32.npz -------------------------------------------------------------------------------- /tests/test_module_apply.cpp: -------------------------------------------------------------------------------- 1 | #pragma GCC diagnostic push 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context" 3 | #pragma GCC diagnostic ignored "-Wuninitialized" 4 | #pragma GCC diagnostic ignored 
"-Wmaybe-uninitialized" 5 | 6 | #include "graphlily/synthesizer/overlay_synthesizer.h" 7 | 8 | #include "graphlily/module/assign_vector_dense_module.h" 9 | #include "graphlily/module/assign_vector_sparse_module.h" 10 | #include "graphlily/module/add_scalar_vector_dense_module.h" 11 | 12 | #include 13 | #include 14 | 15 | #include "graphlily/global.h" 16 | #include "graphlily/io/data_loader.h" 17 | #include "graphlily/io/data_formatter.h" 18 | 19 | 20 | std::string target = "sw_emu"; 21 | uint32_t spmv_out_buf_len = 1024; 22 | uint32_t spmspv_out_buf_len = 512; 23 | uint32_t vec_buf_len = 256; 24 | 25 | 26 | void clean_proj_folder() { 27 | std::string command = "rm -rf ./" + graphlily::proj_folder_name; 28 | std::cout << command << std::endl; 29 | system(command.c_str()); 30 | } 31 | 32 | 33 | template 34 | void verify(std::vector> &reference_results, 35 | std::vector> &kernel_results) { 36 | ASSERT_EQ(reference_results.size(), kernel_results.size()); 37 | float epsilon = 0.0001; 38 | for (size_t i = 0; i < reference_results.size(); i++) { 39 | ASSERT_TRUE(abs(float(kernel_results[i]) - reference_results[i]) < epsilon); 40 | } 41 | } 42 | 43 | 44 | TEST(Synthesize, NULL) { 45 | graphlily::synthesizer::OverlaySynthesizer synthesizer(graphlily::num_hbm_channels, 46 | spmv_out_buf_len, 47 | spmspv_out_buf_len, 48 | vec_buf_len); 49 | synthesizer.set_target(target); 50 | synthesizer.synthesize(); 51 | } 52 | 53 | 54 | TEST(AddScalarVectorDense, Basic) { 55 | graphlily::module::eWiseAddModule module; 56 | module.set_target(target); 57 | module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." 
+ target + "/fused.xclbin"); 58 | 59 | uint32_t length = 128; 60 | graphlily::val_t val = 1; 61 | float val_float = float(val); 62 | std::vector> in_float(length); 63 | std::generate(in_float.begin(), in_float.end(), [&](){return float(rand() % 10) / 100;}); 64 | std::vector> in(in_float.begin(), in_float.end()); 65 | 66 | module.send_in_host_to_device(in); 67 | module.allocate_out_buf(length); 68 | module.run(length, val); 69 | std::vector> kernel_out = 70 | module.send_out_device_to_host(); 71 | std::vector> reference_out = 72 | module.compute_reference_results(in_float, length, val_float); 73 | 74 | verify(reference_out, kernel_out); 75 | } 76 | 77 | 78 | TEST(AssignVectorDense, Basic) { 79 | graphlily::module::AssignVectorDenseModule module; 80 | module.set_target(target); 81 | module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin"); 82 | 83 | uint32_t length = 128; 84 | graphlily::val_t val = 23; 85 | float val_float = float(val); 86 | std::vector> mask_float(length); 87 | std::generate(mask_float.begin(), mask_float.end(), [&](){return float(rand() % 2);}); 88 | std::vector> mask(mask_float.begin(), 89 | mask_float.end()); 90 | std::vector> reference_inout(length); 91 | std::generate(reference_inout.begin(), reference_inout.end(), [&](){return float(rand() % 2);}); 92 | std::vector> kernel_inout(reference_inout.begin(), 93 | reference_inout.end()); 94 | 95 | module.set_mask_type(graphlily::kMaskWriteToOne); 96 | module.send_mask_host_to_device(mask); 97 | module.send_inout_host_to_device(kernel_inout); 98 | module.run(length, val); 99 | kernel_inout = module.send_inout_device_to_host(); 100 | module.compute_reference_results(mask_float, reference_inout, length, val_float); 101 | 102 | verify(reference_inout, kernel_inout); 103 | } 104 | 105 | 106 | TEST(AssignVectorSparseNoNewFrontier, Basic) { 107 | bool generate_new_frontier = false; 108 | graphlily::module::AssignVectorSparseModule module(generate_new_frontier); 
110 | module.set_target(target); 111 | module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin"); 112 | 113 | float mask_sparsity = 0.9; 114 | uint32_t inout_size = 8192; 115 | graphlily::val_t val = 3; 116 | float val_float = float(val); 117 | unsigned length = (unsigned)floor(inout_size * (1 - mask_sparsity)); 118 | unsigned mask_indices_increment = inout_size / length; 119 | graphlily::aligned_sparse_float_vec_t mask_float(length + 1); 120 | for (size_t i = 0; i < length; i++) { 121 | mask_float[i + 1].val = float(rand() % 10); 122 | mask_float[i + 1].index = i * mask_indices_increment; 123 | } 124 | mask_float[0].val = 0; 125 | mask_float[0].index = length; 126 | std::vector> mask(length + 1); 127 | for (size_t i = 0; i < length + 1; i++) { 128 | mask[i].val = mask_float[i].val; 129 | mask[i].index = mask_float[i].index; 130 | } 131 | graphlily::aligned_dense_float_vec_t reference_inout(inout_size); 132 | std::generate(reference_inout.begin(), reference_inout.end(), [&](){return (rand() % 10);}); 133 | std::vector> kernel_inout(reference_inout.begin(), 134 | reference_inout.end()); 135 | 136 | module.send_mask_host_to_device(mask); 137 | module.send_inout_host_to_device(kernel_inout); 138 | module.run(val); 139 | kernel_inout = module.send_inout_device_to_host(); 140 | module.compute_reference_results(mask_float, reference_inout, val_float); 141 | 142 | verify(reference_inout, kernel_inout); 143 | } 144 | 145 | 146 | TEST(AssignVectorSparseNewFrontier, Basic) { 147 | bool generate_new_frontier = true; 148 | graphlily::module::AssignVectorSparseModule module(generate_new_frontier); 150 | module.set_target(target); 151 | module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." 
+ target + "/fused.xclbin"); 152 | 153 | float mask_sparsity = 0.9; 154 | uint32_t inout_size = 128; 155 | float inf; 156 | if (std::is_same::value) { 157 | inf = float(graphlily::FLOAT_INF); 158 | } else if (std::is_same::value) { 159 | inf = float(graphlily::UINT_INF); 160 | } else { 161 | inf = float(graphlily::UFIXED_INF); 162 | } 163 | unsigned length = (unsigned)floor(inout_size * (1 - mask_sparsity)); 164 | unsigned mask_indices_increment = inout_size / length; 165 | graphlily::aligned_sparse_float_vec_t mask_float(length + 1); 166 | for (size_t i = 0; i < length; i++) { 167 | mask_float[i + 1].val = float(rand() % 10); 168 | mask_float[i + 1].index = i * mask_indices_increment; 169 | } 170 | mask_float[0].val = 0; 171 | mask_float[0].index = length; 172 | std::vector> mask(length + 1); 173 | for (size_t i = 0; i < length + 1; i++) { 174 | mask[i].val = mask_float[i].val; 175 | mask[i].index = mask_float[i].index; 176 | } 177 | 178 | std::vector> kernel_inout(inout_size); 179 | std::generate(kernel_inout.begin(), kernel_inout.end(), 180 | [&](){return (((rand() % 10) > 5) ? 
5 : inf);}); 181 | graphlily::aligned_dense_float_vec_t reference_inout(kernel_inout.begin(), kernel_inout.end()); 182 | 183 | std::vector> kernel_new_frontier; 184 | graphlily::aligned_sparse_float_vec_t reference_new_frontier; 185 | 186 | module.send_mask_host_to_device(mask); 187 | module.send_inout_host_to_device(kernel_inout); 188 | module.run(); 189 | kernel_inout = module.send_inout_device_to_host(); 190 | kernel_new_frontier = module.send_new_frontier_device_to_host(); 191 | module.compute_reference_results(mask_float, reference_inout, reference_new_frontier); 192 | 193 | // Verify kernel_inout 194 | verify(reference_inout, kernel_inout); 195 | 196 | // Verify kernel_new_frontier 197 | graphlily::aligned_dense_float_vec_t dense_ref_nf = 198 | graphlily::convert_sparse_vec_to_dense_vec(reference_new_frontier, inout_size, 0); 200 | std::vector> dense_knl_nf = 201 | graphlily::convert_sparse_vec_to_dense_vec< 202 | std::vector>, 203 | std::vector>, graphlily::val_t>( 204 | kernel_new_frontier, inout_size, 0); 205 | verify(dense_ref_nf, dense_knl_nf); 206 | } 207 | 208 | 209 | TEST(CopyBufferBindBuffer, Basic) { 210 | graphlily::module::AssignVectorDenseModule module; 211 | module.set_target(target); 212 | module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." 
+ target + "/fused.xclbin"); 213 | 214 | uint32_t length = 128; 215 | std::vector> mask_float(length); 216 | std::generate(mask_float.begin(), mask_float.end(), [&](){return float(rand() % 2);}); 217 | std::vector> mask(mask_float.begin(), 218 | mask_float.end()); 219 | std::vector> inout_float(length); 220 | std::fill(inout_float.begin(), inout_float.end(), 0); 221 | std::vector> inout(inout_float.begin(), 222 | inout_float.end()); 223 | 224 | module.set_mask_type(graphlily::kMaskWriteToOne); 225 | 226 | /*----------------------------- Copy buffer -------------------------------*/ 227 | { 228 | module.send_mask_host_to_device(mask); 229 | module.send_inout_host_to_device(inout); 230 | module.copy_buffer_device_to_device(module.mask_buf, module.inout_buf, sizeof(graphlily::val_t) * length); 231 | inout = module.send_inout_device_to_host(); 232 | verify(mask_float, inout); 233 | } 234 | 235 | /*----------------------------- Bind buffer -------------------------------*/ 236 | { 237 | std::vector> x_float(length); 238 | std::fill(x_float.begin(), x_float.end(), 0); 239 | std::vector> x(x_float.begin(), x_float.end()); 240 | cl_mem_ext_ptr_t x_ext; 241 | x_ext.obj = x.data(); 242 | x_ext.param = 0; 243 | x_ext.flags = graphlily::HBM[graphlily::num_hbm_channels + 1]; 244 | cl::Device device = graphlily::find_device(); 245 | cl::Context context = cl::Context(device, NULL, NULL, NULL); 246 | cl::Buffer x_buf = cl::Buffer(context, 247 | CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR, 248 | sizeof(graphlily::val_t) * length, 249 | &x_ext); 250 | cl::CommandQueue command_queue = cl::CommandQueue(context, device); 251 | 252 | module.send_mask_host_to_device(mask); 253 | module.bind_inout_buf(x_buf); 254 | module.run(length, 2); 255 | command_queue.enqueueMigrateMemObjects({x_buf}, CL_MIGRATE_MEM_OBJECT_HOST); 256 | command_queue.finish(); 257 | 258 | module.compute_reference_results(mask_float, inout_float, length, 2); 259 | verify(inout_float, x); 260 | } 261 | } 262 | 263 | 
264 | TEST(Clean, NULL) { 265 | clean_proj_folder(); 266 | } 267 | 268 | 269 | int main(int argc, char ** argv) { 270 | testing::InitGoogleTest(&argc, argv); 271 | return RUN_ALL_TESTS(); 272 | } 273 | 274 | #pragma GCC diagnostic pop 275 | -------------------------------------------------------------------------------- /tests/testbench/pe_tb.cpp: -------------------------------------------------------------------------------- 1 | #include "pe_tb.h" 2 | #include "ufixed_pe_fwd.h" 3 | #include "hls_stream.h" 4 | #include 5 | #include 6 | 7 | template 8 | static void data_feeder( 9 | PE_I_T input_buffer[num_lanes][IN_BUF_SIZE], 10 | hls::stream output_stream[num_lanes], 11 | hls::stream &output_npld_stream 12 | ) { 13 | loop_data_feeder: 14 | for (unsigned i = 0; i < IN_BUF_SIZE; i++) { 15 | #pragma HLS pipeline II=1 16 | for (unsigned Lid = 0; Lid < num_lanes; Lid++) { 17 | #pragma HLS unroll 18 | output_stream[Lid].write(input_buffer[Lid][i]); 19 | } 20 | } 21 | output_npld_stream.write(IN_BUF_SIZE * num_lanes); 22 | } 23 | 24 | 25 | static void main_dataflow( 26 | PE_I_T input_buffer[NUM_PE][IN_BUF_SIZE], 27 | VAL_T output_buffer[NUM_PE][BANK_SIZE] 28 | ) { 29 | hls::stream DF_to_PE_stream[NUM_PE]; 30 | hls::stream DF_to_PE_npld_stream; 31 | #pragma HLS stream variable=DF_to_PE_stream depth=8 32 | #pragma HLS stream variable=DF_to_PE_npld_stream depth=2 33 | 34 | #pragma HLS dataflow 35 | 36 | data_feeder(input_buffer, DF_to_PE_stream, DF_to_PE_npld_stream); 37 | 38 | ufixed_pe_cluster_spmv_uram( 39 | DF_to_PE_stream, 40 | output_buffer, 41 | MULADD, 42 | 0, 43 | DF_to_PE_npld_stream 44 | ); 45 | } 46 | 47 | extern "C" { 48 | void pe_tb( 49 | const IDX_T *test_addr_gmem, //0 50 | const VAL_T *test_mat_gmem, //1 51 | const VAL_T *test_vec_gmem, //2 52 | VAL_T *result_gmem //3 53 | ) { 54 | #pragma HLS interface m_axi port=test_addr_gmem offset=slave bundle=gmem0 55 | #pragma HLS interface m_axi port=test_mat_gmem offset=slave bundle=gmem1 56 | #pragma HLS interface 
m_axi port=test_vec_gmem offset=slave bundle=gmem2 57 | #pragma HLS interface m_axi port=result_gmem offset=slave bundle=gmem3 58 | 59 | #pragma HLS interface s_axilite port=test_addr_gmem bundle=control 60 | #pragma HLS interface s_axilite port=test_mat_gmem bundle=control 61 | #pragma HLS interface s_axilite port=test_vec_gmem bundle=control 62 | #pragma HLS interface s_axilite port=result_gmem bundle=control 63 | 64 | #pragma HLS interface s_axilite port=return bundle=control 65 | 66 | // input buffer 67 | PE_I_T input_buffer[NUM_PE][IN_BUF_SIZE]; 68 | #pragma HLS array_partition variable=input_buffer dim=1 complete 69 | #pragma HLS resource variable=input_buffer core=RAM_1P 70 | 71 | // output buffer 72 | VAL_T output_buffer[NUM_PE][BANK_SIZE]; 73 | #pragma HLS array_partition variable=output_buffer dim=1 complete 74 | #pragma HLS resource variable=output_buffer core=RAM_2P latency=2 75 | 76 | // reset output buffer 77 | loop_reset_ob: 78 | for (unsigned i = 0; i < BANK_SIZE; i++) { 79 | #pragma HLS pipeline II=1 80 | for (unsigned PEid = 0; PEid < NUM_PE; PEid++) { 81 | #pragma HLS unroll 82 | output_buffer[PEid][i] = 0; 83 | } 84 | } 85 | 86 | // initialize input buffer 87 | loop_ini_ib: 88 | for (unsigned i = 0; i < NUM_PE * IN_BUF_SIZE; i++) { 89 | #pragma HLS pipeline II=1 90 | input_buffer[i / IN_BUF_SIZE][i % IN_BUF_SIZE].index = test_addr_gmem[i]; 91 | input_buffer[i / IN_BUF_SIZE][i % IN_BUF_SIZE].data.mat_val = test_mat_gmem[i]; 92 | input_buffer[i / IN_BUF_SIZE][i % IN_BUF_SIZE].data.vec_val = test_vec_gmem[i]; 93 | } 94 | 95 | // run main dataflow 96 | main_dataflow(input_buffer, output_buffer); 97 | 98 | // write back to results 99 | loop_wb_2: 100 | for (unsigned i = 0; i < BANK_SIZE; i++) { 101 | loop_wb_1: 102 | for (unsigned PEid = 0; PEid < NUM_PE; PEid++) { 103 | #pragma HLS pipeline II=1 104 | result_gmem[i * NUM_PE + PEid] = output_buffer[PEid][i]; 105 | } 106 | } 107 | 108 | } // extern "C" 109 | } // kernel 110 | 
-------------------------------------------------------------------------------- /tests/testbench/pe_tb.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_TEST_TESTBENCH_PE_TB_H_ 2 | #define GRAPHLILY_TEST_TESTBENCH_PE_TB_H_ 3 | 4 | #include "ap_fixed.h" 5 | 6 | #define MULADD 0 7 | #define ANDOR 1 8 | #define ADDMIN 2 9 | 10 | // data types 11 | typedef unsigned IDX_T; 12 | typedef ap_ufixed<32, 8, AP_RND, AP_SAT> VAL_T; 13 | 14 | typedef struct pe_input_val_type { 15 | VAL_T mat_val; 16 | VAL_T vec_val; 17 | } PE_I_VAL_T; 18 | 19 | typedef struct pe_input_type { 20 | IDX_T index; 21 | PE_I_VAL_T data; 22 | } PE_I_T; 23 | 24 | // Below configurations will be overwritten by the compiler 25 | // const unsigned NUM_PE = 26 | // const unsigned BANK_ID_NBITS = 27 | // const unsigned BANK_SIZE = 28 | // const unsigned IN_BUF_SIZE = 29 | // #endif // GRAPHLILY_TEST_TESTBENCH_PE_TB_H_ 30 | -------------------------------------------------------------------------------- /tests/testbench/shuffle_tb.h: -------------------------------------------------------------------------------- 1 | #ifndef GRAPHLILY_TEST_TESTBENCH_SHUFFLE_TB_H_ 2 | #define GRAPHLILY_TEST_TESTBENCH_SHUFFLE_TB_H_ 3 | 4 | // data types 5 | typedef struct shuffle_inout_data_type { 6 | unsigned uuid; 7 | unsigned padding; 8 | } SF_IO_DATA_T; 9 | 10 | typedef struct shuffle_inout_type { 11 | unsigned index; 12 | SF_IO_DATA_T data; 13 | } SF_IO_T; 14 | 15 | typedef struct testbench_interfece_type { 16 | unsigned index; 17 | unsigned uuid; 18 | } TB_IFC_T; 19 | 20 | const unsigned INVALID_UUID = 0; 21 | 22 | const unsigned NUM_IN_LANES = 8; 23 | const unsigned NUM_OUT_LANES = 8; 24 | const unsigned ADDR_MASK = 7; 25 | 26 | // Below configurations will be overwritten by the compiler 27 | // const unsigned IN_BUF_SIZE = 28 | // const unsigned OUT_BUF_SIZE = 29 | // #endif // GRAPHLILY_TEST_TESTBENCH_SHUFFLE_TB_H_ 30 | 
-------------------------------------------------------------------------------- /xrt/includes/cmdparser/cmdlineparser.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | **********/ 29 | #include "cmdlineparser.h" 30 | #include "logger.h" 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | namespace sda { 37 | namespace utils { 38 | 39 | bool is_file(const std::string &name) { 40 | ifstream f(name.c_str()); 41 | if (f.good()) { 42 | f.close(); 43 | return true; 44 | } else { 45 | f.close(); 46 | return false; 47 | } 48 | } 49 | 50 | bool is_number(const std::string &s) { 51 | std::string::const_iterator it = s.begin(); 52 | while (it != s.end() && std::isdigit(*it)) 53 | ++it; 54 | return !s.empty() && it == s.end(); 55 | } 56 | 57 | bool starts_with(const string &src, const string &sub) { 58 | return (src.find(sub) == 0); 59 | } 60 | 61 | CmdLineParser::CmdLineParser() { 62 | // TODO Auto-generated constructor stub 63 | m_strDefaultKey = ""; 64 | m_appname = "application.exe"; 65 | addSwitch("--help", "-h", "prints this help list", "", true); 66 | } 67 | 68 | /* 69 | CmdLineParser::CmdLineParser(int argc, char* argv[]) { 70 | // TODO Auto-generated constructor stub 71 | assert(parse(argc, argv) > 0); 72 | 73 | } 74 | */ 75 | 76 | CmdLineParser::~CmdLineParser() { 77 | // TODO Auto-generated destructor stub 78 | for (size_t i = 0; i < m_vSwitches.size(); i++) { 79 | delete m_vSwitches[i]; 80 | m_vSwitches[i] = NULL; 81 | } 82 | 83 | m_vSwitches.resize(0); 84 | } 85 | 86 | bool CmdLineParser::addSwitch(const CmdSwitch &s) { 87 | 88 | CmdSwitch cmd = s; 89 | 90 | if (cmd.desc.length() == 0) { 91 | LogError("No description provided!"); 92 | return false; 93 | } 94 | 95 | //check input 96 | if (cmd.key.find("--") != 0 || cmd.key.length() < 3) { 97 | LogError("The input key is invalid. 
Please start with -- and keep a " 98 | "length >= 3"); 99 | return false; 100 | } 101 | 102 | if (m_mapKeySwitch.find(cmd.key) != m_mapKeySwitch.end()) { 103 | LogError("This key %s is taken already!", cmd.key.c_str()); 104 | return false; 105 | } 106 | 107 | if (cmd.shortcut.length() == 0) { 108 | 109 | string temp = "-" + cmd.key[2]; 110 | 111 | int i = 3; 112 | while (m_mapShortcutKeys.find(temp) != m_mapShortcutKeys.end() && 113 | (size_t)i < cmd.key.length()) { 114 | temp = "-" + s.key[i]; 115 | i++; 116 | } 117 | 118 | cmd.shortcut = temp; 119 | LogInfo("Automatic shortcut assigned %s to %s", 120 | temp.c_str(), 121 | cmd.key.c_str()); 122 | } 123 | 124 | if (s.istoggle) { 125 | cmd.default_value = string("false"); 126 | cmd.value = cmd.default_value; 127 | cmd.isvalid = true; 128 | } else { 129 | cmd.value = cmd.default_value; 130 | cmd.isvalid = false; 131 | } 132 | 133 | //add 134 | CmdSwitch *pcmd = new CmdSwitch(cmd); 135 | m_vSwitches.push_back(pcmd); 136 | m_mapShortcutKeys[s.shortcut] = cmd.key; 137 | m_mapKeySwitch[s.key] = pcmd; 138 | 139 | return true; 140 | } 141 | 142 | bool CmdLineParser::addSwitch(const string &name, 143 | const string &shortcut, 144 | const string &desc, 145 | const string &default_value, 146 | bool istoggle) { 147 | 148 | CmdSwitch s; 149 | s.key = name; 150 | s.shortcut = shortcut; 151 | s.desc = desc; 152 | s.default_value = default_value; 153 | s.istoggle = istoggle; 154 | 155 | return addSwitch(s); 156 | } 157 | 158 | bool CmdLineParser::setDefaultKey(const char *key) { 159 | string strKey(key); 160 | if (!starts_with(strKey, "--")) 161 | strKey = "--" + strKey; 162 | 163 | if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end()) { 164 | CmdSwitch *pcmd = m_mapKeySwitch[m_strDefaultKey]; 165 | if (pcmd != NULL) { 166 | if (pcmd->istoggle) { 167 | LogError("Boolean command line options can not be used as " 168 | "default keys"); 169 | return false; 170 | } 171 | } 172 | 173 | //set default key 174 | m_strDefaultKey = 
strKey; 175 | return true; 176 | } else 177 | return false; 178 | } 179 | 180 | int CmdLineParser::parse(int argc, char *argv[]) { 181 | 182 | int i = 0; 183 | int ctOptions = 0; 184 | while (i < argc) { 185 | string key, val; 186 | bool iskey = false; 187 | string token = string(argv[i]); 188 | 189 | bool isNextTokenKey = false; 190 | if (i + 1 < argc) { 191 | string peeknext = string(argv[i + 1]); 192 | if (starts_with(peeknext, "-") || starts_with(peeknext, "--")) { 193 | string fullkey; 194 | isNextTokenKey = token_to_fullkeyname(peeknext, fullkey); 195 | } 196 | } 197 | 198 | //full-key 199 | if (starts_with(token, string("--"))) { 200 | if (m_mapKeySwitch.find(token) == m_mapKeySwitch.end()) { 201 | LogError("Unrecognized key passed %s", token.c_str()); 202 | printHelp(); 203 | return -1; 204 | } 205 | 206 | key = token; 207 | iskey = true; 208 | } 209 | //shortcut 210 | else if (starts_with(token, "-")) { 211 | if (m_mapShortcutKeys.find(token) == m_mapShortcutKeys.end()) { 212 | LogError("Unrecognized shortcut key passed %s", token.c_str()); 213 | printHelp(); 214 | return -1; 215 | } 216 | 217 | key = m_mapShortcutKeys[token]; 218 | iskey = true; 219 | } 220 | //default key, the value for default key is the last argument 221 | else if (isNextTokenKey == false && m_strDefaultKey.length() > 0 && 222 | i == argc - 2) { 223 | if (m_mapKeySwitch.find(m_strDefaultKey) == m_mapKeySwitch.end()) { 224 | LogError("Unrecognized default key %s", 225 | m_strDefaultKey.c_str()); 226 | printHelp(); 227 | return -1; 228 | } 229 | 230 | LogInfo("Using default key: %s", m_strDefaultKey.c_str()); 231 | key = m_strDefaultKey; 232 | iskey = true; 233 | } 234 | 235 | //if iskey and needs param then read it 236 | if (iskey) { 237 | ctOptions++; 238 | 239 | if (key == "--help") { 240 | printHelp(); 241 | return 1; 242 | } 243 | 244 | //fetch value 245 | CmdSwitch *pcmd = m_mapKeySwitch[key]; 246 | 247 | //read next 248 | if (pcmd->istoggle) { 249 | pcmd->value = string("true"); 
250 | pcmd->isvalid = true; 251 | } else { 252 | i++; 253 | pcmd->value = string(argv[i]); 254 | pcmd->isvalid = true; 255 | } 256 | } 257 | 258 | //next token 259 | i++; 260 | } 261 | 262 | //capture real app name 263 | if (argc > 0) { 264 | m_appname = string(argv[0]); 265 | } 266 | 267 | return ctOptions; 268 | } 269 | 270 | bool CmdLineParser::token_to_fullkeyname(const string &token, string &fullkey) { 271 | 272 | fullkey = ""; 273 | int ctDashes = 0; 274 | if (starts_with(token, string("--"))) 275 | ctDashes = 2; 276 | else if (starts_with(token, string("-"))) 277 | ctDashes = 1; 278 | 279 | if (ctDashes == 0) 280 | return false; 281 | 282 | if (ctDashes == 2) { 283 | if (m_mapKeySwitch.find(token) == m_mapKeySwitch.end()) { 284 | LogError("Unrecognized key passed %s", token.c_str()); 285 | return false; 286 | } 287 | fullkey = token; 288 | } else if (ctDashes == 1) { 289 | if (m_mapShortcutKeys.find(token) == m_mapShortcutKeys.end()) { 290 | LogError("Unrecognized shortcut key passed %s", token.c_str()); 291 | return false; 292 | } 293 | 294 | fullkey = m_mapShortcutKeys[token]; 295 | } 296 | 297 | return (fullkey.length() > 0); 298 | } 299 | 300 | string CmdLineParser::value(const char *key) { 301 | 302 | string strKey(key); 303 | if (!starts_with(strKey, "--")) 304 | strKey = "--" + strKey; 305 | 306 | if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end()) 307 | return m_mapKeySwitch[strKey]->value; 308 | else { 309 | LogWarn("The input key %s is not recognized!", strKey.c_str()); 310 | return string(""); 311 | } 312 | } 313 | 314 | int CmdLineParser::value_to_int(const char *key) { 315 | string strVal = value(key); 316 | if (strVal.length() == 0 || !is_number(strVal)) 317 | return -1; 318 | return atoi(strVal.c_str()); 319 | } 320 | 321 | double CmdLineParser::value_to_double(const char *key) { 322 | string strVal = value(key); 323 | if (strVal.length() == 0) 324 | return -1; 325 | return atof(strVal.c_str()); 326 | } 327 | 328 | bool 
CmdLineParser::isValid(const char *key) { 329 | string strKey(key); 330 | if (!starts_with(strKey, "--")) 331 | strKey = "--" + strKey; 332 | 333 | if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end()) 334 | return m_mapKeySwitch[strKey]->isvalid; 335 | else { 336 | LogWarn("The input key %s is not recognized!", strKey.c_str()); 337 | return false; 338 | } 339 | } 340 | 341 | void CmdLineParser::printHelp() { 342 | printf("===========================================================\n"); 343 | string strAllShortcuts = ""; 344 | for (size_t i = 0; i < m_vSwitches.size(); i++) { 345 | CmdSwitch *pcmd = m_vSwitches[i]; 346 | if (pcmd && pcmd->shortcut.length() > 0) 347 | strAllShortcuts = strAllShortcuts + pcmd->shortcut; 348 | } 349 | //example 350 | printf("Usage: %s -[%s]\n\n", m_appname.c_str(), strAllShortcuts.c_str()); 351 | 352 | //row by row 353 | for (size_t i = 0; i < m_vSwitches.size(); i++) { 354 | CmdSwitch *pcmd = m_vSwitches[i]; 355 | 356 | if (pcmd->default_value.length() > 0) 357 | printf("\t%s, %s\t\t%s\t Default: [%s]\n", 358 | pcmd->key.c_str(), 359 | pcmd->shortcut.c_str(), 360 | pcmd->desc.c_str(), 361 | pcmd->default_value.c_str()); 362 | else 363 | printf("\t%s, %s\t\t%s\n", 364 | pcmd->key.c_str(), 365 | pcmd->shortcut.c_str(), 366 | pcmd->desc.c_str()); 367 | } 368 | } 369 | 370 | CmdLineParser::CmdSwitch *CmdLineParser::getCmdSwitch(const char *key) { 371 | string strKey(key); 372 | if (!starts_with(strKey, "--")) 373 | strKey = "--" + strKey; 374 | 375 | if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end()) 376 | return m_mapKeySwitch[strKey]; 377 | else 378 | return NULL; 379 | } 380 | 381 | } // namespace utils 382 | } // namespace sda 383 | -------------------------------------------------------------------------------- /xrt/includes/cmdparser/cmdlineparser.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #ifndef CMDLINEPARSER_H_ 30 | #define CMDLINEPARSER_H_ 31 | 32 | #include 33 | #include 34 | #include 35 | 36 | using namespace std; 37 | 38 | namespace sda { 39 | namespace utils { 40 | 41 | bool is_file(const std::string& name); 42 | 43 | /*! 44 | * Synopsis: 45 | * 1.Parses the command line passed in from the user and stores all enabled 46 | * system options. 47 | * 2.Prints help for the user if an option is not valid. 
48 | * 3.Stores options and provides a mechanism to read those options 49 | */ 50 | class CmdLineParser { 51 | public: 52 | class CmdSwitch { 53 | public: 54 | CmdSwitch() {} 55 | CmdSwitch(const CmdSwitch& rhs) { 56 | copyfrom(rhs); 57 | } 58 | 59 | void copyfrom(const CmdSwitch& rhs) { 60 | this->key = rhs.key; 61 | this->shortcut = rhs.shortcut; 62 | this->default_value = rhs.default_value; 63 | this->value = rhs.value; 64 | this->desc = rhs.desc; 65 | this->istoggle = rhs.istoggle; 66 | this->isvalid = rhs.isvalid; 67 | } 68 | 69 | CmdSwitch& operator=(const CmdSwitch& rhs) { 70 | this->copyfrom(rhs); 71 | return *this; 72 | } 73 | public: 74 | string key; 75 | string shortcut; 76 | string default_value; 77 | string value; 78 | string desc; 79 | bool istoggle; 80 | bool isvalid; 81 | }; 82 | 83 | public: 84 | CmdLineParser(); 85 | //CmdLineParser(int argc, char* argv[]); 86 | virtual ~CmdLineParser(); 87 | 88 | 89 | bool addSwitch(const CmdSwitch& s); 90 | bool addSwitch(const string& name, const string& shortcut, 91 | const string& desc, const string& default_value = "", 92 | bool istoggle = false); 93 | 94 | /*! 95 | * sets default key to be able to read a 2 argumented call 96 | */ 97 | bool setDefaultKey(const char* key); 98 | 99 | /*! 100 | * parse and store command line 101 | */ 102 | int parse(int argc, char* argv[]); 103 | 104 | /*! 105 | * retrieve value using a key 106 | */ 107 | string value(const char* key); 108 | 109 | int value_to_int(const char* key); 110 | 111 | 112 | double value_to_double(const char* key); 113 | 114 | /*! 115 | * Returns true if a valid value is supplied by user 116 | */ 117 | bool isValid(const char* key); 118 | 119 | /*! 120 | * prints the help menu in case the options are not correct. 121 | */ 122 | virtual void printHelp(); 123 | 124 | protected: 125 | /*! 
126 | * Retrieve command switch 127 | */ 128 | CmdSwitch* getCmdSwitch(const char* key); 129 | 130 | bool token_to_fullkeyname(const string& token, string& fullkey); 131 | 132 | 133 | private: 134 | map m_mapKeySwitch; 135 | map m_mapShortcutKeys; 136 | vector m_vSwitches; 137 | string m_strDefaultKey; 138 | string m_appname; 139 | }; 140 | 141 | //bool starts_with(const string& src, const string& sub); 142 | 143 | } 144 | } 145 | #endif /* CMDLINEPARSER_H_ */ 146 | -------------------------------------------------------------------------------- /xrt/includes/cmdparser/cmdparser.mk: -------------------------------------------------------------------------------- 1 | cmdparser_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/cmdparser/cmdlineparser.cpp 2 | cmdparser_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/cmdparser/cmdlineparser.h 3 | cmdparser_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/cmdparser 4 | -------------------------------------------------------------------------------- /xrt/includes/logger/logger.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #include "logger.h" 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #ifdef WINDOWS 36 | #include 37 | #else 38 | #include 39 | #endif 40 | 41 | using namespace std; 42 | 43 | namespace sda { 44 | 45 | /////////////////////////////////////////////////////////////////////// 46 | string GetApplicationPath() { 47 | #ifdef WINDOWS 48 | #define GetCurrentDir _getcwd 49 | #else 50 | #define GetCurrentDir getcwd 51 | #endif 52 | 53 | char strCurrentPath[FILENAME_MAX]; 54 | 55 | if (!GetCurrentDir(strCurrentPath, sizeof(strCurrentPath))) { 56 | return string(""); 57 | } 58 | 59 | /* not really required */ 60 | strCurrentPath[sizeof(strCurrentPath) - 1] = '\0'; 61 | return string(strCurrentPath); 62 | } 63 | 64 | string ToLower(const string &s) { 65 | string result = s; 66 | std::transform(result.begin(), result.end(), result.begin(), ::tolower); 67 | return result; 68 | } 69 | 70 | string ToUpper(const string &s) { 71 | string result = s; 72 | std::transform(result.begin(), result.end(), result.begin(), ::toupper); 73 | return result; 74 | } 75 | 76 | string GetTimeStamp() { return ""; } 77 | 78 | // trim from start 79 | string 
<rim(std::string &s) { 80 | s.erase(s.begin(), 81 | std::find_if(s.begin(), 82 | s.end(), 83 | std::not1(std::ptr_fun(std::isspace)))); 84 | return s; 85 | } 86 | 87 | // trim from end 88 | string &rtrim(std::string &s) { 89 | s.erase(std::find_if(s.rbegin(), 90 | s.rend(), 91 | std::not1(std::ptr_fun(std::isspace))) 92 | .base(), 93 | s.end()); 94 | return s; 95 | } 96 | 97 | // trim from both ends 98 | string &trim(std::string &s) { return ltrim(rtrim(s)); } 99 | 100 | string GetFileExt(const string &s) { 101 | string strext = s.substr(s.find_last_of(".") + 1); 102 | return strext; 103 | } 104 | 105 | string GetFileTitleOnly(const string &s) { 106 | 107 | string temp = s; 108 | string::size_type d = temp.find_last_of("//"); 109 | if (d == string::npos) 110 | d = temp.find_last_of("\\"); 111 | if (d != string::npos) 112 | temp = temp.substr(d + 1); 113 | 114 | d = temp.find_last_of("."); 115 | if (d != string::npos) 116 | temp = temp.substr(0, d); 117 | 118 | return temp; 119 | } 120 | 121 | void LogWrapper(int etype, const char *file, int line, const char *desc, ...) 
{ 122 | 123 | //crop file name from full path 124 | string strFileLoc(file); 125 | strFileLoc = strFileLoc.substr(strFileLoc.find_last_of("\\/") + 1); 126 | 127 | string strHeader = ""; 128 | { 129 | char header[512]; 130 | //source 131 | switch (etype) { 132 | case (sda::etError): { 133 | snprintf(header, 134 | sizeof(header), 135 | "ERROR: [%s:%d]", 136 | strFileLoc.c_str(), 137 | line); 138 | break; 139 | } 140 | case (sda::etInfo): { 141 | snprintf(header, 142 | sizeof(header), 143 | "INFO: [%s:%d]", 144 | strFileLoc.c_str(), 145 | line); 146 | break; 147 | } 148 | case (sda::etWarning): { 149 | snprintf(header, 150 | sizeof(header), 151 | "WARN: [%s:%d]", 152 | strFileLoc.c_str(), 153 | line); 154 | break; 155 | } 156 | } 157 | strHeader = string(header); 158 | } 159 | 160 | //time 161 | string strTime = ""; 162 | #ifdef ENABLE_LOG_TIME 163 | { 164 | time_t rawtime; 165 | time(&rawtime); 166 | #ifdef ENABLE_SECURE_API 167 | char buffer[64]; 168 | struct tm timeinfo; 169 | localtime_s(&timeinfo, &rawtime); 170 | asctime_s(timeinfo, buffer, sizeof(buffer)) 171 | snprintf(buffer, sizeof(buffer), "TIME: [%s]", asctime(timeinfo)); 172 | strTime = string(buffer); 173 | #else 174 | char buffer[64]; 175 | struct tm *timeinfo = localtime(&rawtime); 176 | string temp = string(asctime(timeinfo)); 177 | temp = trim(temp); 178 | 179 | // strftime(buffer, sizeof(buffer), "TIME: []") 180 | snprintf(buffer, sizeof(buffer), "TIME: [%s]", temp.c_str()); 181 | strTime = string(buffer); 182 | #endif 183 | } 184 | #endif 185 | 186 | //format the message itself 187 | string strMsg = ""; 188 | { 189 | char msg[512]; 190 | va_list args; 191 | va_start(args, desc); 192 | vsnprintf(msg, sizeof(msg), desc, args); 193 | va_end(args); 194 | strMsg = string(msg); 195 | } 196 | 197 | //combine 198 | string strOut = 199 | strHeader + string(" ") + strTime + string(" ") + strMsg + string("\n"); 200 | 201 | //display 202 | cout << strOut; 203 | 204 | //store 205 | #ifdef ENABLE_LOG_TOFILE 206 
| std::ofstream outfile; 207 | outfile.open("benchapp.log", std::ios_base::app); 208 | outfile << strOut; 209 | #endif 210 | 211 | return; 212 | } 213 | 214 | } // namespace sda 215 | -------------------------------------------------------------------------------- /xrt/includes/logger/logger.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | **********/ 29 | #ifndef LOGGER_H_ 30 | #define LOGGER_H_ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | 38 | #define ENABLE_LOG_TOFILE 1 39 | #define ENABLE_LOG_TIME 1 40 | 41 | //global logging 42 | #define LogInfo(desc, ...) sda::LogWrapper(0, __FILE__, __LINE__, desc, ##__VA_ARGS__) 43 | #define LogWarn(desc, ...) sda::LogWrapper(1, __FILE__, __LINE__, desc, ##__VA_ARGS__) 44 | #define LogError(desc, ...) sda::LogWrapper(2, __FILE__, __LINE__, desc, ##__VA_ARGS__) 45 | 46 | using namespace std; 47 | 48 | namespace sda { 49 | 50 | enum LOGTYPE {etInfo, etWarning, etError}; 51 | 52 | //string 53 | string& ltrim(string& s); 54 | string& rtrim(string& s); 55 | string& trim(string& s); 56 | string GetFileExt(const string& s); 57 | string GetFileTitleOnly(const string& s); 58 | 59 | string ToLower(const string& s); 60 | string ToUpper(const string& s); 61 | 62 | //time 63 | string GetTimeStamp(); 64 | 65 | //paths 66 | string GetApplicationPath(); 67 | 68 | 69 | //debug 70 | template 71 | void PrintPOD(const vector& pod, size_t display_count = 0, const int precision = 4) { 72 | 73 | size_t count = pod.size(); 74 | if(display_count > 0) 75 | count = std::min(pod.size(), display_count); 76 | 77 | for(size_t i = 0; i < count; i++) { 78 | cout << std::setprecision(precision) << pod[i] << ", "; 79 | } 80 | cout << endl; 81 | } 82 | 83 | //logging 84 | void LogWrapper(int etype, const char* file, int line, const char* desc, ...); 85 | 86 | } 87 | 88 | 89 | 90 | #endif /* LOGGER_H_ */ 91 | -------------------------------------------------------------------------------- /xrt/includes/logger/logger.mk: -------------------------------------------------------------------------------- 1 | logger_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/logger/logger.cpp 2 | logger_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/logger/logger.h 3 | logger_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/logger 4 | 
-------------------------------------------------------------------------------- /xrt/includes/oclHelper/oclErrorCodes.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | **********/ 29 | #include 30 | #include 31 | 32 | #include 33 | 34 | #define TO_STRING(x) #x 35 | 36 | static const std::pair map_pairs[] = { 37 | std::make_pair(CL_SUCCESS, TO_STRING(CL_SUCCESS)), 38 | std::make_pair(CL_DEVICE_NOT_FOUND, TO_STRING(CL_DEVICE_NOT_FOUND)), 39 | std::make_pair(CL_DEVICE_NOT_AVAILABLE, TO_STRING(CL_DEVICE_NOT_AVAILABLE)), 40 | std::make_pair(CL_COMPILER_NOT_AVAILABLE, 41 | TO_STRING(CL_COMPILER_NOT_AVAILABLE)), 42 | std::make_pair(CL_MEM_OBJECT_ALLOCATION_FAILURE, 43 | TO_STRING(CL_MEM_OBJECT_ALLOCATION_FAILURE)), 44 | std::make_pair(CL_OUT_OF_RESOURCES, TO_STRING(CL_OUT_OF_RESOURCES)), 45 | std::make_pair(CL_OUT_OF_HOST_MEMORY, TO_STRING(CL_OUT_OF_HOST_MEMORY)), 46 | std::make_pair(CL_PROFILING_INFO_NOT_AVAILABLE, 47 | TO_STRING(CL_PROFILING_INFO_NOT_AVAILABLE)), 48 | std::make_pair(CL_MEM_COPY_OVERLAP, TO_STRING(CL_MEM_COPY_OVERLAP)), 49 | std::make_pair(CL_IMAGE_FORMAT_MISMATCH, 50 | TO_STRING(CL_IMAGE_FORMAT_MISMATCH)), 51 | std::make_pair(CL_IMAGE_FORMAT_NOT_SUPPORTED, 52 | TO_STRING(CL_IMAGE_FORMAT_NOT_SUPPORTED)), 53 | std::make_pair(CL_BUILD_PROGRAM_FAILURE, 54 | TO_STRING(CL_BUILD_PROGRAM_FAILURE)), 55 | std::make_pair(CL_MAP_FAILURE, TO_STRING(CL_MAP_FAILURE)), 56 | std::make_pair(CL_MISALIGNED_SUB_BUFFER_OFFSET, 57 | TO_STRING(CL_MISALIGNED_SUB_BUFFER_OFFSET)), 58 | std::make_pair(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST, 59 | TO_STRING(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_W)), 60 | std::make_pair(CL_INVALID_VALUE, TO_STRING(CL_INVALID_VALUE)), 61 | std::make_pair(CL_INVALID_DEVICE_TYPE, TO_STRING(CL_INVALID_DEVICE_TYPE)), 62 | std::make_pair(CL_INVALID_PLATFORM, TO_STRING(CL_INVALID_PLATFORM)), 63 | std::make_pair(CL_INVALID_DEVICE, TO_STRING(CL_INVALID_DEVICE)), 64 | std::make_pair(CL_INVALID_CONTEXT, TO_STRING(CL_INVALID_CONTEXT)), 65 | std::make_pair(CL_INVALID_QUEUE_PROPERTIES, 66 | TO_STRING(CL_INVALID_QUEUE_PROPERTIES)), 67 | std::make_pair(CL_INVALID_COMMAND_QUEUE, 68 | TO_STRING(CL_INVALID_COMMAND_QUEUE)), 69 
| std::make_pair(CL_INVALID_HOST_PTR, TO_STRING(CL_INVALID_HOST_PTR)), 70 | std::make_pair(CL_INVALID_MEM_OBJECT, TO_STRING(CL_INVALID_MEM_OBJECT)), 71 | std::make_pair(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, 72 | TO_STRING(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)), 73 | std::make_pair(CL_INVALID_IMAGE_SIZE, TO_STRING(CL_INVALID_IMAGE_SIZE)), 74 | std::make_pair(CL_INVALID_SAMPLER, TO_STRING(CL_INVALID_SAMPLER)), 75 | std::make_pair(CL_INVALID_BINARY, TO_STRING(CL_INVALID_BINARY)), 76 | std::make_pair(CL_INVALID_BUILD_OPTIONS, 77 | TO_STRING(CL_INVALID_BUILD_OPTIONS)), 78 | std::make_pair(CL_INVALID_PROGRAM, TO_STRING(CL_INVALID_PROGRAM)), 79 | std::make_pair(CL_INVALID_PROGRAM_EXECUTABLE, 80 | TO_STRING(CL_INVALID_PROGRAM_EXECUTABLE)), 81 | std::make_pair(CL_INVALID_KERNEL_NAME, TO_STRING(CL_INVALID_KERNEL_NAME)), 82 | std::make_pair(CL_INVALID_KERNEL_DEFINITION, 83 | TO_STRING(CL_INVALID_KERNEL_DEFINITION)), 84 | std::make_pair(CL_INVALID_KERNEL, TO_STRING(CL_INVALID_KERNEL)), 85 | std::make_pair(CL_INVALID_ARG_INDEX, TO_STRING(CL_INVALID_ARG_INDEX)), 86 | std::make_pair(CL_INVALID_ARG_VALUE, TO_STRING(CL_INVALID_ARG_VALUE)), 87 | std::make_pair(CL_INVALID_ARG_SIZE, TO_STRING(CL_INVALID_ARG_SIZE)), 88 | std::make_pair(CL_INVALID_KERNEL_ARGS, TO_STRING(CL_INVALID_KERNEL_ARGS)), 89 | std::make_pair(CL_INVALID_WORK_DIMENSION, 90 | TO_STRING(CL_INVALID_WORK_DIMENSION)), 91 | std::make_pair(CL_INVALID_WORK_GROUP_SIZE, 92 | TO_STRING(CL_INVALID_WORK_GROUP_SIZE)), 93 | std::make_pair(CL_INVALID_WORK_ITEM_SIZE, 94 | TO_STRING(CL_INVALID_WORK_ITEM_SIZE)), 95 | std::make_pair(CL_INVALID_GLOBAL_OFFSET, 96 | TO_STRING(CL_INVALID_GLOBAL_OFFSET)), 97 | std::make_pair(CL_INVALID_EVENT_WAIT_LIST, 98 | TO_STRING(CL_INVALID_EVENT_WAIT_LIST)), 99 | std::make_pair(CL_INVALID_EVENT, TO_STRING(CL_INVALID_EVENT)), 100 | std::make_pair(CL_INVALID_OPERATION, TO_STRING(CL_INVALID_OPERATION)), 101 | std::make_pair(CL_INVALID_GL_OBJECT, TO_STRING(CL_INVALID_GL_OBJECT)), 102 | 
std::make_pair(CL_INVALID_BUFFER_SIZE, TO_STRING(CL_INVALID_BUFFER_SIZE)), 103 | std::make_pair(CL_INVALID_MIP_LEVEL, TO_STRING(CL_INVALID_MIP_LEVEL)), 104 | std::make_pair(CL_INVALID_GLOBAL_WORK_SIZE, 105 | TO_STRING(CL_INVALID_GLOBAL_WORK_SIZE)), 106 | std::make_pair(CL_INVALID_PROPERTY, TO_STRING(CL_INVALID_PROPERTY))}; 107 | 108 | static const std::map 109 | oclErrorCodes(map_pairs, 110 | map_pairs + sizeof(map_pairs) / sizeof(map_pairs[0])); 111 | 112 | const char *oclErrorCode(cl_int code) { 113 | std::map::const_iterator iter = 114 | oclErrorCodes.find(code); 115 | if (iter == oclErrorCodes.end()) 116 | return "UNKNOWN ERROR"; 117 | else 118 | return iter->second.c_str(); 119 | } 120 | -------------------------------------------------------------------------------- /xrt/includes/oclHelper/oclHelper.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #include "oclHelper.h" 30 | #include 31 | #include 32 | #include 33 | #include 34 | 35 | // 36 | // Load file to memory 37 | // 38 | static int loadFile2Memory(const char *filename, char **result) { 39 | int size = 0; 40 | 41 | std::ifstream stream(filename, std::ifstream::binary); 42 | if (!stream) { 43 | return -1; 44 | } 45 | 46 | stream.seekg(0, stream.end); 47 | size = stream.tellg(); 48 | stream.seekg(0, stream.beg); 49 | 50 | *result = new char[size + 1]; 51 | stream.read(*result, size); 52 | if (!stream) { 53 | return -2; 54 | } 55 | stream.close(); 56 | (*result)[size] = 0; 57 | return size; 58 | } 59 | 60 | // 61 | // Get device version 62 | // 63 | static void getDeviceVersion(oclHardware &hardware) { 64 | char versionString[512]; 65 | size_t size = 0; 66 | cl_int err = clGetDeviceInfo( 67 | hardware.mDevice, CL_DEVICE_VERSION, 511, versionString, &size); 68 | if (err != CL_SUCCESS) { 69 | std::cout << oclErrorCode(err) << "\n"; 70 | return; 71 | } 72 | unsigned major = 0; 73 | unsigned minor = 0; 74 | unsigned state = 0; 75 | for (size_t i = 0; i < size; i++) { 76 | if (!versionString[i]) { 77 | break; 78 | } 79 | if (versionString[i] == ' ') { 80 | state++; 81 | continue; 82 | } 83 | if (versionString[i] == '.') { 84 | state++; 85 | continue; 86 | } 87 | if (state == 0) { 88 | continue; 89 | } 90 | if (state == 1) { 91 | major *= 10; 92 | major += (versionString[i] - '0'); 93 | continue; 94 | } 
95 | if (state == 2) { 96 | minor *= 10; 97 | minor += (versionString[i] - '0'); 98 | continue; 99 | } 100 | break; 101 | } 102 | hardware.mMajorVersion = major; 103 | hardware.mMinorVersion = minor; 104 | } 105 | 106 | // 107 | // Get OCL hardware 108 | // 109 | oclHardware getOclHardware(cl_device_type type) { 110 | oclHardware hardware = {0, 0, 0, 0, 0, 0}; 111 | cl_platform_id platforms[16] = {0}; 112 | cl_device_id devices[16]; 113 | char platformName[256]; 114 | char deviceName[256]; 115 | cl_uint platformCount = 0; 116 | cl_int err = clGetPlatformIDs(0, 0, &platformCount); 117 | err = clGetPlatformIDs(16, platforms, &platformCount); 118 | if (err != CL_SUCCESS) { 119 | std::cout << oclErrorCode(err) << "\n"; 120 | return hardware; 121 | } 122 | 123 | for (cl_uint i = 0; i < platformCount; i++) { 124 | err = clGetPlatformInfo( 125 | platforms[i], CL_PLATFORM_NAME, 256, platformName, 0); 126 | if (err != CL_SUCCESS) { 127 | std::cout << oclErrorCode(err) << "\n"; 128 | return hardware; 129 | } 130 | cl_uint deviceCount = 0; 131 | err = clGetDeviceIDs(platforms[i], type, 16, devices, &deviceCount); 132 | if ((err != CL_SUCCESS) || (deviceCount == 0)) { 133 | continue; 134 | } 135 | 136 | err = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 256, deviceName, 0); 137 | if (err != CL_SUCCESS) { 138 | std::cout << oclErrorCode(err) << "\n"; 139 | return hardware; 140 | } 141 | 142 | cl_context_properties contextData[3] = { 143 | CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0}; 144 | cl_context context = 145 | clCreateContextFromType(contextData, type, 0, 0, &err); 146 | if (err != CL_SUCCESS) { 147 | continue; 148 | } 149 | cl_command_queue queue = 150 | clCreateCommandQueue(context, devices[0], 0, &err); 151 | if (err != CL_SUCCESS) { 152 | std::cout << oclErrorCode(err) << "\n"; 153 | return hardware; 154 | } 155 | 156 | hardware.mPlatform = platforms[i]; 157 | hardware.mContext = context; 158 | hardware.mDevice = devices[0]; 159 | hardware.mQueue = 
queue; 160 | getDeviceVersion(hardware); 161 | std::cout << "Platform = " << platformName << "\n"; 162 | std::cout << "Device = " << deviceName << "\n"; 163 | std::cout << "OpenCL Version = " << hardware.mMajorVersion << '.' 164 | << hardware.mMinorVersion << "\n"; 165 | return hardware; 166 | } 167 | return hardware; 168 | } 169 | 170 | // 171 | // Get OCL software 172 | // 173 | int getOclSoftware(oclSoftware &software, const oclHardware &hardware) { 174 | cl_device_type deviceType = CL_DEVICE_TYPE_DEFAULT; 175 | cl_int err = clGetDeviceInfo( 176 | hardware.mDevice, CL_DEVICE_TYPE, sizeof(deviceType), &deviceType, 0); 177 | if (err != CL_SUCCESS) { 178 | std::cout << oclErrorCode(err) << "\n"; 179 | return -1; 180 | } 181 | 182 | unsigned char *kernelCode = 0; 183 | std::cout << "Loading " << software.mFileName << "\n"; 184 | 185 | int size = loadFile2Memory(software.mFileName, (char **)&kernelCode); 186 | if (size < 0) { 187 | std::cout << "Failed to load kernel\n"; 188 | return -2; 189 | } 190 | 191 | if (deviceType == CL_DEVICE_TYPE_ACCELERATOR) { 192 | size_t n = size; 193 | software.mProgram = 194 | clCreateProgramWithBinary(hardware.mContext, 195 | 1, 196 | &hardware.mDevice, 197 | &n, 198 | (const unsigned char **)&kernelCode, 199 | 0, 200 | &err); 201 | } else { 202 | software.mProgram = clCreateProgramWithSource( 203 | hardware.mContext, 1, (const char **)&kernelCode, 0, &err); 204 | } 205 | if (!software.mProgram || (err != CL_SUCCESS)) { 206 | std::cout << oclErrorCode(err) << "\n"; 207 | return -3; 208 | } 209 | 210 | software.mKernel = 211 | clCreateKernel(software.mProgram, software.mKernelName, NULL); 212 | if (software.mKernel == 0) { 213 | std::cout << oclErrorCode(err) << "\n"; 214 | return -4; 215 | } 216 | 217 | delete[] kernelCode; 218 | return 0; 219 | } 220 | 221 | // 222 | // Release software and hardware 223 | // 224 | void release(oclSoftware &software) { 225 | clReleaseKernel(software.mKernel); 226 | clReleaseProgram(software.mProgram); 
227 | } 228 | 229 | void release(oclHardware &hardware) { 230 | clReleaseCommandQueue(hardware.mQueue); 231 | clReleaseContext(hardware.mContext); 232 | if ((hardware.mMajorVersion >= 1) && (hardware.mMinorVersion > 1)) { 233 | // Only available in OpenCL >= 1.2 234 | clReleaseDevice(hardware.mDevice); 235 | } 236 | } 237 | -------------------------------------------------------------------------------- /xrt/includes/oclHelper/oclHelper.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #ifndef _OCL_HELP_H_ 30 | #define _OCL_HELP_H_ 31 | 32 | #include 33 | 34 | struct oclHardware { 35 | cl_platform_id mPlatform; 36 | cl_context mContext; 37 | cl_device_id mDevice; 38 | cl_command_queue mQueue; 39 | short mMajorVersion; 40 | short mMinorVersion; 41 | }; 42 | 43 | struct oclSoftware { 44 | cl_program mProgram; 45 | cl_kernel mKernel; 46 | char mKernelName[128]; 47 | char mFileName[1024]; 48 | char mCompileOptions[1024]; 49 | }; 50 | 51 | oclHardware getOclHardware(cl_device_type type); 52 | 53 | int getOclSoftware(oclSoftware &software, const oclHardware &hardware); 54 | 55 | void release(oclSoftware& software); 56 | 57 | void release(oclHardware& hardware); 58 | 59 | const char *oclErrorCode(cl_int code); 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /xrt/includes/oclHelper/oclHelper.mk: -------------------------------------------------------------------------------- 1 | oclHelper_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper/oclHelper.cpp ${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper/oclErrorCodes.cpp 2 | oclHelper_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper/oclHelper.h 3 | oclHelper_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper 4 | -------------------------------------------------------------------------------- /xrt/includes/opencl/opencl.mk: 
--------------------------------------------------------------------------------
# Definition of include file locations
#
# On an x86 host the XRT installation named by XILINX_XRT is used; for every
# other HOST_ARCH the headers and libraries are taken from $(SYSROOT)/usr/.
ifeq ($(HOST_ARCH), x86)
xrt_path = $(XILINX_XRT)
OPENCL_INCLUDE:= $(xrt_path)/include
else
xrt_path = $(SYSROOT)/usr/
OPENCL_INCLUDE:= $(xrt_path)/include/xrt
endif

# Compiler and linker flags exported to the makefiles that include this one.
VIVADO_INCLUDE:= $(XILINX_VIVADO)/include
opencl_CXXFLAGS=-I$(OPENCL_INCLUDE) -I$(VIVADO_INCLUDE)
OPENCL_LIB:= $(xrt_path)/lib
opencl_LDFLAGS=-L$(OPENCL_LIB) -lOpenCL -lpthread
--------------------------------------------------------------------------------
/xrt/includes/xcl2/xcl2.cpp:
--------------------------------------------------------------------------------
/**********
Copyright (c) 2019, Xilinx, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.

2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.

3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software
without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | 30 | #include "xcl2.hpp" 31 | #include 32 | #include 33 | #include 34 | 35 | namespace xcl { 36 | std::vector get_devices(const std::string &vendor_name) { 37 | size_t i; 38 | cl_int err; 39 | std::vector platforms; 40 | OCL_CHECK(err, err = cl::Platform::get(&platforms)); 41 | cl::Platform platform; 42 | for (i = 0; i < platforms.size(); i++) { 43 | platform = platforms[i]; 44 | OCL_CHECK(err, 45 | std::string platformName = 46 | platform.getInfo(&err)); 47 | if (platformName == vendor_name) { 48 | std::cout << "Found Platform" << std::endl; 49 | std::cout << "Platform Name: " << platformName.c_str() << std::endl; 50 | break; 51 | } 52 | } 53 | if (i == platforms.size()) { 54 | std::cout << "Error: Failed to find Xilinx platform" << std::endl; 55 | exit(EXIT_FAILURE); 56 | } 57 | //Getting ACCELERATOR Devices and selecting 1st such device 58 | std::vector devices; 59 | OCL_CHECK(err, 60 | err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); 61 | return devices; 62 | } 63 | 64 | std::vector get_xil_devices() { return get_devices("Xilinx"); } 65 | 66 | std::vector 67 | read_binary_file(const std::string &xclbin_file_name) { 68 | std::cout << "INFO: Reading " << xclbin_file_name << std::endl; 69 | 70 | if (access(xclbin_file_name.c_str(), R_OK) != 0) { 71 | printf("ERROR: %s xclbin not available please build\n", 72 | xclbin_file_name.c_str()); 73 | exit(EXIT_FAILURE); 74 | } 75 | //Loading XCL Bin 
into char buffer 76 | std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; 77 | std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 78 | bin_file.seekg(0, bin_file.end); 79 | auto nb = bin_file.tellg(); 80 | bin_file.seekg(0, bin_file.beg); 81 | std::vector buf; 82 | buf.resize(nb); 83 | bin_file.read(reinterpret_cast(buf.data()), nb); 84 | return buf; 85 | } 86 | 87 | bool is_emulation() { 88 | bool ret = false; 89 | char *xcl_mode = getenv("XCL_EMULATION_MODE"); 90 | if (xcl_mode != NULL) { 91 | ret = true; 92 | } 93 | return ret; 94 | } 95 | 96 | bool is_hw_emulation() { 97 | bool ret = false; 98 | char *xcl_mode = getenv("XCL_EMULATION_MODE"); 99 | if ((xcl_mode != NULL) && !strcmp(xcl_mode, "hw_emu")) { 100 | ret = true; 101 | } 102 | return ret; 103 | } 104 | 105 | bool is_xpr_device(const char *device_name) { 106 | const char *output = strstr(device_name, "xpr"); 107 | 108 | if (output == NULL) { 109 | return false; 110 | } else { 111 | return true; 112 | } 113 | } 114 | }; // namespace xcl 115 | -------------------------------------------------------------------------------- /xrt/includes/xcl2/xcl2.hpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2018, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. 
Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | 30 | 31 | #pragma once 32 | 33 | #define CL_HPP_CL_1_2_DEFAULT_BUILD 34 | #define CL_HPP_TARGET_OPENCL_VERSION 120 35 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 36 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 37 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 38 | 39 | //OCL_CHECK doesn't work if call has templatized function call 40 | #define OCL_CHECK(error,call) \ 41 | call; \ 42 | if (error != CL_SUCCESS) { \ 43 | printf("%s:%d Error calling " #call ", error code is: %d\n", \ 44 | __FILE__,__LINE__, error); \ 45 | exit(EXIT_FAILURE); \ 46 | } 47 | 48 | #include 49 | #include 50 | #include 51 | #include 52 | // When creating a buffer with user pointer (CL_MEM_USE_HOST_PTR), under the hood 53 | // User ptr is used if and only if it is properly aligned (page aligned). When not 54 | // aligned, runtime has no choice but to create its own host side buffer that backs 55 | // user ptr. 
This in turn implies that all operations that move data to and from 56 | // device incur an extra memcpy to move data to/from runtime's own host buffer 57 | // from/to user pointer. So it is recommended to use this allocator if user wish to 58 | // Create Buffer/Memory Object with CL_MEM_USE_HOST_PTR to align user buffer to the 59 | // page boundary. It will ensure that user buffer will be used when user create 60 | // Buffer/Mem Object with CL_MEM_USE_HOST_PTR. 61 | template 62 | struct aligned_allocator 63 | { 64 | using value_type = T; 65 | T* allocate(std::size_t num) 66 | { 67 | void* ptr = nullptr; 68 | if (posix_memalign(&ptr,4096,num*sizeof(T))) 69 | throw std::bad_alloc(); 70 | return reinterpret_cast(ptr); 71 | } 72 | void deallocate(T* p, std::size_t num) 73 | { 74 | free(p); 75 | } 76 | }; 77 | 78 | namespace xcl { 79 | std::vector get_xil_devices(); 80 | std::vector get_devices(const std::string& vendor_name); 81 | std::vector read_binary_file(const std::string &xclbin_file_name); 82 | bool is_emulation (); 83 | bool is_hw_emulation (); 84 | bool is_xpr_device (const char *device_name); 85 | class Stream{ 86 | public: 87 | static decltype(&clCreateStream) createStream; 88 | static decltype(&clReleaseStream) releaseStream; 89 | static decltype(&clReadStream) readStream; 90 | static decltype(&clWriteStream) writeStream; 91 | static decltype(&clPollStreams) pollStreams; 92 | static void init(const cl_platform_id& platform) { 93 | void *bar = clGetExtensionFunctionAddressForPlatform(platform, "clCreateStream"); 94 | createStream = (decltype(&clCreateStream))bar; 95 | bar = clGetExtensionFunctionAddressForPlatform(platform, "clReleaseStream"); 96 | releaseStream = (decltype(&clReleaseStream))bar; 97 | bar = clGetExtensionFunctionAddressForPlatform(platform, "clReadStream"); 98 | readStream = (decltype(&clReadStream))bar; 99 | bar = clGetExtensionFunctionAddressForPlatform(platform, "clWriteStream"); 100 | writeStream = (decltype(&clWriteStream))bar; 101 | 
bar = clGetExtensionFunctionAddressForPlatform(platform, "clPollStreams"); 102 | pollStreams = (decltype(&clPollStreams))bar; 103 | } 104 | }; 105 | } 106 | -------------------------------------------------------------------------------- /xrt/includes/xcl2/xcl2.mk: -------------------------------------------------------------------------------- 1 | xcl2_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/xcl2/xcl2.cpp 2 | xcl2_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/xcl2/xcl2.hpp 3 | 4 | xcl2_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/xcl2 5 | --------------------------------------------------------------------------------