├── .gitignore
├── LICENSE
├── README.md
├── benchmark
    ├── Makefile
    ├── analyze_load_balance_spmv.py
    ├── bench_bfs.cpp
    ├── bench_pagerank.cpp
    ├── bench_spmspv.cpp
    ├── bench_spmv.cpp
    ├── bench_sssp.cpp
    ├── run_bfs.sh
    ├── run_pagerank.sh
    ├── run_spmv.sh
    └── run_sssp.sh
├── generate_bitstream
    ├── Makefile
    └── synthesize.cpp
├── graphlily
    ├── app
    │   ├── bfs.h
    │   ├── module_collection.h
    │   ├── pagerank.h
    │   └── sssp.h
    ├── global.h
    ├── hw
    │   ├── float_pe.h
    │   ├── kernel_add_scalar_vector_dense_impl.h
    │   ├── kernel_assign_vector_dense_impl.h
    │   ├── kernel_assign_vector_sparse_new_frontier_impl.h
    │   ├── kernel_assign_vector_sparse_no_new_frontier_impl.h
    │   ├── kernel_spmspv_impl.h
    │   ├── kernel_spmv_impl.h
    │   ├── math_constants.h
    │   ├── overlay.cpp
    │   ├── overlay.h
    │   ├── shuffle.h
    │   ├── ufixed_pe_fwd.h
    │   └── util.h
    ├── io
    │   ├── data_formatter.h
    │   └── data_loader.h
    ├── module
    │   ├── add_scalar_vector_dense_module.h
    │   ├── assign_vector_dense_module.h
    │   ├── assign_vector_sparse_module.h
    │   ├── base_module.h
    │   ├── spmspv_module.h
    │   └── spmv_module.h
    └── synthesizer
    │   ├── base_synthesizer.h
    │   └── overlay_synthesizer.h
├── tests
    ├── Makefile
    ├── test_app.cpp
    ├── test_data
    │   ├── create_csr.py
    │   ├── eye_10_csr_float32.npz
    │   └── line_8_csr_float32.npz
    ├── test_io.cpp
    ├── test_module_apply.cpp
    ├── test_module_spmv_spmspv.cpp
    ├── test_pe_cluster.cpp
    ├── test_shuffle.cpp
    └── testbench
    │   ├── pe_tb.cpp
    │   ├── pe_tb.h
    │   ├── shuffle_tb.cpp
    │   └── shuffle_tb.h
└── xrt
    └── includes
        ├── cmdparser
            ├── cmdlineparser.cpp
            ├── cmdlineparser.h
            └── cmdparser.mk
        ├── logger
            ├── logger.cpp
            ├── logger.h
            └── logger.mk
        ├── oclHelper
            ├── oclErrorCodes.cpp
            ├── oclHelper.cpp
            ├── oclHelper.h
            └── oclHelper.mk
        ├── opencl
            └── opencl.mk
        └── xcl2
            ├── xcl2.cpp
            ├── xcl2.hpp
            └── xcl2.mk


/.gitignore:
--------------------------------------------------------------------------------
 1 | *.jou
 2 | */host
 3 | profile_summary.html
 4 | profile_summary.csv
 5 | xclbin.run_summary
 6 | _v++_*/
 7 | .Xil/
 8 | *.ll
 9 | *.exe
10 | emconfig.json
11 | xmltmp
12 | dltmp*
13 | runtime/
14 | *.log
15 | xclbin/
16 | lib/
17 | *.orig
18 | *_BACKUP_*
19 | *_BASE_*
20 | *_LOCAL_*
21 | *_REMOTE_*
22 | *.swp
23 | *_x.*
24 | *build/
25 | *build_dir.*
26 | *.run/
27 | .vscode
28 | data
29 | build*
30 | spmspv
31 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2021, Cornell Zhang Research Group
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright notice, this
10 |    list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 |    this list of conditions and the following disclaimer in the documentation
14 |    and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the names of its
17 |    contributors may be used to endorse or promote products derived from
18 |    this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | GraphLily: A Graph Linear Algebra Overlay on HBM-Equipped FPGAs
 2 | ===============================================================
 3 | 
 4 | GraphLily is the first FPGA overlay for graph processing.
 5 | GraphLily supports a rich set of graph algorithms by adopting the GraphBLAS programming interface, which formulates graph algorithms as sparse linear algebra kernels.
 6 | GraphLily effectively utilizes the high bandwidth of HBM to accelerate SpMV and SpMSpV, the two widely-used kernels in GraphBLAS, by co-designing the data layout and the accelerator architecture.
 7 | GraphLily further builds a middleware to provide runtime support, enabling users to easily port existing GraphBLAS programs from CPUs/GPUs to FPGAs.
 8 | 
 9 | For more information, refer to our [ICCAD'21 paper](https://www.csl.cornell.edu/~zhiruz/pdfs/graphlily-iccad2021.pdf).
10 | ```
11 | @article{hu2021graphlily,
12 |   title={GraphLily: Accelerating Graph Linear Algebra on HBM-Equipped FPGAs},
13 |   author={Hu, Yuwei and Du, Yixiao and Ustun, Ecenur and Zhang, Zhiru},
14 |   journal={International Conference On Computer Aided Design},
15 |   year={2021}
16 | }
17 | ```
18 | 
19 | ## Prerequisites
20 | - Platform: Xilinx Alveo U280
21 | - Tool: Xilinx Vitis 2019.2
22 | 
23 | ## Run Benchmarking
24 | 
25 | ### Clone the repo
26 | ```
27 | git clone git@github.com:cornell-zhang/GraphLily.git
28 | export GRAPHLILY_ROOT_PATH=/path/to/GraphLily
29 | ```
30 | 
31 | ### Get the bitstream
32 | - A pre-compiled bitstream (166 MHz) is provided [here](https://drive.google.com/file/d/1OGry0OtbvmGiSirhJy3tCPz51VMeV1HM/view?usp=sharing).
33 | - To generate a new bitstream:
34 | ```
35 | cd GraphLily/generate_bitstream
36 | make synthesize
37 | ```
38 | 
39 | ### Prepare datasets
40 | The input is an adjacency matrix in CSR format, stored as a SciPy `.npz` file. Please install [cnpy](https://github.com/rogersce/cnpy), which is required for data loading.
41 | 
42 | Our ICCAD'21 paper evaluated the following six graph datasets:
43 | 
44 | - [googleplus](https://drive.google.com/file/d/1Wv9C7s0lK0KdrRPUsTqjlENvbMMKfykg/view?usp=sharing)
45 | - [ogbl-ppa](https://drive.google.com/file/d/189Qp9h4BxXR8dAiQdmJWkW89y08eU5qR/view?usp=sharing)
46 | - [hollywood](https://drive.google.com/file/d/1irBTVuYdJaMXQTUGQh7AerBjs784ykeO/view?usp=sharing)
47 | - [pokec](https://drive.google.com/file/d/1UEwsIYgNWmm3ucBfatjg_lmG25oXWWI-/view?usp=sharing)
48 | - [ogbn-products](https://drive.google.com/file/d/1yBJjW5aRpJt2if32gOWSmaYcI10KDQj0/view?usp=sharing)
49 | - [orkut](https://drive.google.com/file/d/1Am0hPLhGNAwjYWt5nd_-XsIaKBiWcwqt/view?usp=sharing)
50 | 
51 | ### Run
52 | Go to the GraphLily/benchmark folder, edit the cnpy path in the Makefile and the bitstream and dataset paths in run_bfs.sh, then run:
53 | ```
54 | bash run_bfs.sh
55 | ```
56 | 


--------------------------------------------------------------------------------
/benchmark/Makefile:
--------------------------------------------------------------------------------
 1 | HOST_ARCH = x86
 2 | 
 3 | CXXFLAGS += -Wall -O3 -g -std=c++11
 4 | CXXFLAGS += -I$(GRAPHLILY_ROOT_PATH)
 5 | LDFLAGS += -lrt -lstdc++
 6 | 
 7 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/xcl2/xcl2.mk
 8 | CXXFLAGS += $(xcl2_CXXFLAGS)
 9 | LDFLAGS += $(xcl2_LDFLAGS)
10 | 
11 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/opencl/opencl.mk
12 | CXXFLAGS += $(opencl_CXXFLAGS)
13 | LDFLAGS += $(opencl_LDFLAGS)
14 | 
15 | # cnpy checkout (library expected in $(CNPY_PATH)/build); override: make CNPY_PATH=/path/to/cnpy <target>
16 | CNPY_PATH ?= /work/shared/common/project_build/graphblas/software/cnpy
17 | CXXFLAGS += -I$(CNPY_PATH)
18 | LDFLAGS += -L$(CNPY_PATH)/build -lcnpy
19 | BUILD_DIR = ./build
20 | 
21 | bench_spmv: bench_spmv.cpp $(xcl2_SRCS)
22 | 	g++ $(CXXFLAGS) bench_spmv.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS)
23 | 	mkdir -p $(BUILD_DIR)
24 | 	mv bench_spmv $(BUILD_DIR)/
25 | 
26 | bench_spmspv: bench_spmspv.cpp $(xcl2_SRCS)
27 | 	g++ $(CXXFLAGS) bench_spmspv.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS)
28 | 	mkdir -p $(BUILD_DIR)
29 | 	mv bench_spmspv $(BUILD_DIR)/
30 | 
31 | bench_bfs: bench_bfs.cpp $(xcl2_SRCS)
32 | 	g++ $(CXXFLAGS) bench_bfs.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS)
33 | 	mkdir -p $(BUILD_DIR)
34 | 	mv bench_bfs $(BUILD_DIR)/
35 | 
36 | bench_sssp: bench_sssp.cpp $(xcl2_SRCS)
37 | 	g++ $(CXXFLAGS) bench_sssp.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS)
38 | 	mkdir -p $(BUILD_DIR)
39 | 	mv bench_sssp $(BUILD_DIR)/
40 | 
41 | bench_pagerank: bench_pagerank.cpp $(xcl2_SRCS)
42 | 	g++ $(CXXFLAGS) bench_pagerank.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS)
43 | 	mkdir -p $(BUILD_DIR)
44 | 	mv bench_pagerank $(BUILD_DIR)/
45 | .PHONY: clean
46 | clean:
47 | 	rm -rf $(BUILD_DIR)
48 | 


--------------------------------------------------------------------------------
/benchmark/analyze_load_balance_spmv.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.sparse
 3 | 
 4 | num_channels = 16  # multiplied by pack_size to get the PE count in calculate_imbalance_factor
 5 | pack_size = 8  # second factor of the PE count (num_channels * pack_size)
 6 | 
 7 | def calculate_degree_standard_deviation(nnz_each_row):
 8 |     return nnz_each_row.std()
 9 | 
10 | def calculate_imbalance_factor(nnz_each_row):
11 |     num_PEs = num_channels * pack_size
12 |     nnz_each_PE = np.zeros(num_PEs)
13 |     step = num_PEs
14 |     for i in range(num_PEs):
15 |         nnz_each_PE[i] = nnz_each_row[i::step].sum()
16 |     return nnz_each_PE.max() / nnz_each_PE.mean()
17 | 
18 | path = "/work/shared/common/research/graphblas/data/sparse_matrix_graph/"
19 | datasets = ["gplus_108K_13M_csr_float32.npz",
20 |             "ogbl_ppa_576K_42M_csr_float32.npz",
21 |             "hollywood_1M_113M_csr_float32.npz",
22 |             "pokec_1633K_31M_csr_float32.npz",
23 |             "ogbn_products_2M_124M_csr_float32.npz",
24 |             "orkut_3M_213M_csr_float32.npz"]
25 | 
26 | if __name__ == "__main__":
27 |     for dataset in datasets:
28 |         csr_matrix = scipy.sparse.load_npz(path + dataset)
29 |         nnz_each_row = csr_matrix.indptr[1::] - csr_matrix.indptr[:-1:]
30 |         standard_deviation = calculate_degree_standard_deviation(nnz_each_row)
31 |         average_degree = csr_matrix.nnz / csr_matrix.shape[0]
32 |         normalized_standard_deviation = standard_deviation / average_degree
33 |         print(dataset)
34 |         print("standard_deviation: ", standard_deviation)
35 |         print("normalized_standard_deviation: ", normalized_standard_deviation)
36 |         print("imbalance_factor: ", calculate_imbalance_factor(nnz_each_row))
37 | 


--------------------------------------------------------------------------------
/benchmark/bench_bfs.cpp:
--------------------------------------------------------------------------------
  1 | #pragma GCC diagnostic push
  2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context"
  3 | #pragma GCC diagnostic ignored "-Wuninitialized"
  4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
  5 | 
  6 | #include <iostream>
  7 | #include <chrono>
  8 | 
  9 | #include "graphlily/app/bfs.h"
 10 | 
 11 | 
 12 | template<typename data_t>
 13 | void verify(std::vector<float, aligned_allocator<float>> &reference_results,
 14 |             std::vector<data_t, aligned_allocator<data_t>> &kernel_results) {
 15 |     if (!(reference_results.size() == kernel_results.size())) {
 16 |         std::cout << "Size mismatch!" << std::endl;
 17 |         exit(EXIT_FAILURE);
 18 |     }
 19 |     float epsilon = 0.0001;
 20 |     for (size_t i = 0; i < reference_results.size(); i++) {
 21 |         if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) {
 22 |             std::cout << "Error: Result mismatch"
 23 |                       << std::endl;
 24 |             std::cout << "i = " << i
 25 |                       << " Reference result = " << reference_results[i]
 26 |                       << " Kernel result = " << kernel_results[i]
 27 |                       << std::endl;
 28 |             exit(EXIT_FAILURE);
 29 |         }
 30 |     }
 31 | }
 32 | 
 33 | 
 34 | // Benchmark BFS on the "hw" target: time pull and pull-push runs, print average time and GTEPS.
 35 | void bench_bfs(uint32_t num_channels, uint32_t spmv_out_buf_len,
 36 |                uint32_t spmspv_out_buf_len, uint32_t vec_buf_len,
 37 |                std::string bitstream, std::string dataset, uint32_t num_iterations) {
 38 |     graphlily::app::BFS bfs(num_channels, spmv_out_buf_len, spmspv_out_buf_len, vec_buf_len);
 39 |     bfs.set_target("hw");
 40 |     bfs.set_up_runtime(bitstream);
 41 | 
 42 |     bool skip_empty_rows = true;
 43 |     bfs.load_and_format_matrix(dataset, skip_empty_rows);
 44 |     std::cout << "finished load_and_format_matrix" << std::endl;
 45 |     bfs.send_matrix_host_to_device();
 46 | 
 47 |     uint32_t source = 0;
 48 |     auto reference_results = bfs.compute_reference_results(source, num_iterations);
 49 |     // // Make sure the results make sense, e.g., the starting vertex connects to at least one vertex
 50 |     // for (int i = 0; i < 10; i++) {
 51 |     //     std::cout << reference_results[i] <<std::endl;
 52 |     // }
 53 | 
 54 |     // Pull
 55 |     auto kernel_results = bfs.pull(source, num_iterations);
 56 |     // verify<graphlily::val_t>(reference_results, kernel_results);
 57 |     // std::cout << "BFS pull passed" << std::endl;
 58 | 
 59 |     uint32_t num_runs = 1;
 60 |     auto t1 = std::chrono::high_resolution_clock::now();
 61 |     for (size_t i = 0; i < num_runs; i++) {
 62 |         kernel_results = bfs.pull(source, num_iterations);
 63 |     }
 64 |     auto t2 = std::chrono::high_resolution_clock::now();
 65 |     float average_time_in_sec = float(std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count())
 66 |         / 1000000 / num_runs;
 67 |     std::cout << "Pull average_time: " << average_time_in_sec * 1000 << " ms" << std::endl;
 68 |     uint32_t nnz = bfs.get_nnz();
 69 |     double op_count = double(nnz) * num_iterations;  // widen first: a uint32_t product can wrap
 70 |     double throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec;
 71 |     std::cout << "Pull Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl;
 72 | 
 73 |     // Pull-Push
 74 |     float threshold = 0.001;
 75 |     kernel_results = bfs.pull_push(source, num_iterations, threshold);
 76 |     // verify<graphlily::val_t>(reference_results, kernel_results);
 77 |     // std::cout << "BFS pull-push passed" << std::endl;
 78 | 
 79 |     num_runs = 1;
 80 |     t1 = std::chrono::high_resolution_clock::now();
 81 |     for (size_t i = 0; i < num_runs; i++) {
 82 |         kernel_results = bfs.pull_push(source, num_iterations, threshold);
 83 |     }
 84 |     t2 = std::chrono::high_resolution_clock::now();
 85 |     average_time_in_sec = float(std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count())
 86 |         / 1000000 / num_runs;
 87 |     std::cout << "Pull-Push average_time: " << average_time_in_sec * 1000 << " ms" << std::endl;
 88 |     throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec;
 89 |     std::cout << "Pull-Push Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl;
 90 | }
 91 | 
 92 | 
 93 | int main(int argc, char *argv[]) {
 94 |     if (argc < 8) {  // argv[1..7] are read below; a shorter command line would read past argc
 95 |         std::cerr << "Usage: bench_bfs <num_channels> <spmv_out_buf_len> <spmspv_out_buf_len>"
 96 |                   << " <vec_buf_len> <bitstream> <dataset> <num_iterations>" << std::endl;
 97 |         return EXIT_FAILURE;
 98 |     }
 99 |     bench_bfs(strtol(argv[1], NULL, 10), strtol(argv[2], NULL, 10), strtol(argv[3], NULL, 10),
100 |               strtol(argv[4], NULL, 10), argv[5], argv[6], strtol(argv[7], NULL, 10));
101 | }
102 | 
103 | #pragma GCC diagnostic pop
104 | 


--------------------------------------------------------------------------------
/benchmark/bench_pagerank.cpp:
--------------------------------------------------------------------------------
 1 | #pragma GCC diagnostic push
 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 3 | #pragma GCC diagnostic ignored "-Wuninitialized"
 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 5 | 
 6 | #include <iostream>
 7 | #include <chrono>
 8 | 
 9 | #include "graphlily/app/pagerank.h"
10 | 
11 | 
12 | template<typename data_t>
13 | void verify(std::vector<float, aligned_allocator<float>> &reference_results,
14 |             std::vector<data_t, aligned_allocator<data_t>> &kernel_results) {
15 |     if (!(reference_results.size() == kernel_results.size())) {
16 |         std::cout << "Size mismatch!" << std::endl;
17 |         exit(EXIT_FAILURE);
18 |     }
19 |     float epsilon = 0.0001;
20 |     for (size_t i = 0; i < reference_results.size(); i++) {
21 |         if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) {
22 |             std::cout << "Error: Result mismatch"
23 |                       << std::endl;
24 |             std::cout << "i = " << i
25 |                       << " Reference result = " << reference_results[i]
26 |                       << " Kernel result = " << kernel_results[i]
27 |                       << std::endl;
28 |             exit(EXIT_FAILURE);
29 |         }
30 |     }
31 | }
32 | 
33 | 
34 | // Benchmark PageRank on the "hw" target: time 10 pull iterations, print per-iteration time and GTEPS.
35 | void bench_pagerank(uint32_t num_channels, uint32_t spmv_out_buf_len,
36 |                     uint32_t vec_buf_len, std::string bitstream, std::string dataset) {
37 |     // NOTE(review): num_channels is unused; graphlily::num_hbm_channels is passed instead -- confirm.
38 |     graphlily::app::PageRank pagerank(graphlily::num_hbm_channels, spmv_out_buf_len, vec_buf_len);
39 |     pagerank.set_target("hw");
40 |     pagerank.set_up_runtime(bitstream);
41 | 
42 |     float damping_factor = 0.9;
43 |     bool drop_empty_rows = true;
44 |     pagerank.load_and_format_matrix(dataset, damping_factor, drop_empty_rows);
45 |     std::cout << "finished load_and_format_matrix" << std::endl;
46 |     pagerank.send_matrix_host_to_device();
47 | 
48 |     uint32_t num_iterations = 10;
49 |     auto reference_results = pagerank.compute_reference_results(damping_factor, num_iterations);
50 |     auto kernel_results = pagerank.pull(damping_factor, num_iterations);
51 |     // verify<graphlily::val_t>(reference_results, kernel_results);
52 |     // std::cout << "PageRank passed" << std::endl;
53 | 
54 |     const uint32_t runs = 1;
55 |     const auto start = std::chrono::high_resolution_clock::now();
56 |     for (uint32_t run = 0; run < runs; ++run) {
57 |         kernel_results = pagerank.pull(damping_factor, num_iterations);
58 |     }
59 |     const auto stop = std::chrono::high_resolution_clock::now();
60 |     const auto elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
61 |     float seconds_per_iteration = float(elapsed_us) / 1000000 / runs / num_iterations;
62 |     std::cout << "PageRank time for one iteration: " << seconds_per_iteration * 1000 << " ms" << std::endl;
63 |     uint32_t nnz = pagerank.get_nnz();
64 |     double throughput = double(nnz) / 1000 / 1000 / 1000 / seconds_per_iteration;
65 |     std::cout << "PageRank Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl;
66 | }
67 | 
68 | 
69 | int main(int argc, char *argv[]) {
70 |     if (argc < 6) {  // argv[1..5] are read below; a shorter command line would read past argc
71 |         std::cerr << "Usage: bench_pagerank NUM_CHANNELS OUT_BUF_LEN VEC_BUF_LEN BITSTREAM DATASET\n";
72 |         return EXIT_FAILURE;
73 |     }
74 |     bench_pagerank(strtol(argv[1], NULL, 10), strtol(argv[2], NULL, 10), strtol(argv[3], NULL, 10), argv[4], argv[5]);
75 | }
76 | 
77 | #pragma GCC diagnostic pop
78 | 


--------------------------------------------------------------------------------
/benchmark/bench_spmv.cpp:
--------------------------------------------------------------------------------
  1 | #pragma GCC diagnostic push
  2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context"
  3 | #pragma GCC diagnostic ignored "-Wuninitialized"
  4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
  5 | 
  6 | #include <iostream>
  7 | #include <chrono>
  8 | 
  9 | #include "xcl2.hpp"
 10 | 
 11 | #include "graphlily/io/data_loader.h"
 12 | #include "graphlily/module/spmv_module.h"
 13 | 
 14 | 
 15 | template<typename data_t>
 16 | void verify(std::vector<float, aligned_allocator<float>> &reference_results,
 17 |             std::vector<data_t, aligned_allocator<data_t>> &kernel_results) {
 18 |     if (!(reference_results.size() == kernel_results.size())) {
 19 |         std::cout << "Size mismatch!" << std::endl;
 20 |         exit(EXIT_FAILURE);
 21 |     }
 22 |     float epsilon = 0.0001;
 23 |     for (size_t i = 0; i < reference_results.size(); i++) {
 24 |         if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) {
 25 |             std::cout << "Error: Result mismatch"
 26 |                       << std::endl;
 27 |             std::cout << "i = " << i
 28 |                       << " Reference result = " << reference_results[i]
 29 |                       << " Kernel result = " << kernel_results[i]
 30 |                       << std::endl;
 31 |             exit(EXIT_FAILURE);
 32 |         }
 33 |     }
 34 | }
 35 | 
 36 | 
 37 | // Benchmark the SpMV module on the "hw" target: run it once, then time 100 runs and print the
 38 | // average time per run and the throughput in GTEPS.
 39 | void bench_spmv(uint32_t num_channels, uint32_t out_buf_len, uint32_t vec_buf_len,
 40 |                 std::string bitstream, std::string dataset) {
 41 |     using float_vec_t = std::vector<float, aligned_allocator<float>>;
 42 |     using val_vec_t = std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>>;
 43 | 
 44 |     // Configure the module: no mask, arithmetic semiring.
 45 |     graphlily::module::SpMVModule<graphlily::val_t, graphlily::val_t> spmv(num_channels, out_buf_len,
 46 |                                                                            vec_buf_len);
 47 |     spmv.set_target("hw");
 48 |     graphlily::MaskType mask_type = graphlily::kNoMask;
 49 |     spmv.set_mask_type(mask_type);
 50 |     spmv.set_semiring(graphlily::ArithmeticSemiring);
 51 |     spmv.set_up_runtime(bitstream);
 52 | 
 53 |     // Load the matrix, overwrite every stored value with 1/num_rows, then round its dimensions.
 54 |     std::string csr_float_npz_path = dataset;
 55 |     CSRMatrix<float> csr_matrix = graphlily::io::load_csr_matrix_from_float_npz(csr_float_npz_path);
 56 |     for (auto &value : csr_matrix.adj_data) value = 1.0 / csr_matrix.num_rows;
 57 |     graphlily::io::util_round_csr_matrix_dim(csr_matrix,
 58 |                                              num_channels * graphlily::pack_size,
 59 |                                              graphlily::pack_size);
 60 | 
 61 |     // Random 0/1 input vector and mask (the vector is drawn first).
 62 |     auto random_bit = [] { return float(rand() % 2); };
 63 |     float_vec_t vector_float(csr_matrix.num_cols);
 64 |     std::generate(vector_float.begin(), vector_float.end(), random_bit);
 65 |     val_vec_t vector(vector_float.begin(), vector_float.end());
 66 |     float_vec_t mask_float(csr_matrix.num_cols);
 67 |     std::generate(mask_float.begin(), mask_float.end(), random_bit);
 68 |     val_vec_t mask(mask_float.begin(), mask_float.end());
 69 | 
 70 |     bool skip_empty_rows = true;
 71 |     spmv.load_and_format_matrix(csr_matrix, skip_empty_rows);
 72 |     std::cout << "finished load_and_format_matrix" << std::endl;
 73 | 
 74 |     // Move the operands to the device.
 75 |     spmv.send_matrix_host_to_device();
 76 |     spmv.send_vector_host_to_device(vector);
 77 |     spmv.send_mask_host_to_device(mask);  // sent even if the kernel does not use it
 78 | 
 79 |     std::cout << "start run" << std::endl;
 80 | 
 81 |     spmv.run();
 82 |     auto kernel_results = spmv.send_results_device_to_host();
 83 | 
 84 |     // Host-side reference for the same inputs.
 85 |     float_vec_t reference_results;
 86 |     if (mask_type != graphlily::kNoMask) {
 87 |         reference_results = spmv.compute_reference_results(vector_float, mask_float);
 88 |     } else {
 89 |         reference_results = spmv.compute_reference_results(vector_float);
 90 |     }
 91 | 
 92 |     // for (int i = 0; i < 10; i++) {
 93 |     //     std::cout << reference_results[i] << " " << kernel_results[i] <<std::endl;
 94 |     // }
 95 | 
 96 |     // verify<graphlily::val_t>(reference_results, kernel_results);
 97 |     // std::cout << "SpMV passed" << std::endl;
 98 | 
 99 |     // Timed section.
100 |     const uint32_t runs = 100;
101 |     const auto start = std::chrono::high_resolution_clock::now();
102 |     for (uint32_t run = 0; run < runs; ++run) {
103 |         spmv.run();
104 |     }
105 |     const auto stop = std::chrono::high_resolution_clock::now();
106 |     const auto elapsed_us = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
107 |     float average_time_in_sec = float(elapsed_us) / 1000000 / runs;
108 |     std::cout << "average_time: " << average_time_in_sec * 1000 << " ms" << std::endl;
109 | 
110 |     uint32_t nnz = spmv.get_nnz();
111 |     double throughput = double(nnz) / 1000 / 1000 / 1000 / average_time_in_sec;
112 |     std::cout << "Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl;
113 | }
114 | 
115 | 
116 | int main(int argc, char *argv[]) {
117 |     if (argc < 6) {  // argv[1..5] are read below; a shorter command line would read past argc
118 |         std::cerr << "Usage: bench_spmv NUM_CHANNELS OUT_BUF_LEN VEC_BUF_LEN BITSTREAM DATASET\n";
119 |         return EXIT_FAILURE;
120 |     }
121 |     bench_spmv(strtol(argv[1], NULL, 10), strtol(argv[2], NULL, 10), strtol(argv[3], NULL, 10), argv[4], argv[5]);
122 | }
123 | 
124 | #pragma GCC diagnostic pop
125 | 


--------------------------------------------------------------------------------
/benchmark/bench_sssp.cpp:
--------------------------------------------------------------------------------
 1 | #pragma GCC diagnostic push
 2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context"
 3 | #pragma GCC diagnostic ignored "-Wuninitialized"
 4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 5 | 
 6 | #include <iostream>
 7 | #include <chrono>
 8 | 
 9 | #include "graphlily/app/sssp.h"
10 | 
11 | 
12 | template<typename data_t>
13 | void verify(std::vector<float, aligned_allocator<float>> &reference_results,
14 |             std::vector<data_t, aligned_allocator<data_t>> &kernel_results) {
15 |     if (!(reference_results.size() == kernel_results.size())) {
16 |         std::cout << "Size mismatch!" << std::endl;
17 |         exit(EXIT_FAILURE);
18 |     }
19 |     float epsilon = 0.0001;
20 |     for (size_t i = 0; i < reference_results.size(); i++) {
21 |         if (abs(float(kernel_results[i]) - reference_results[i]) > epsilon) {
22 |             std::cout << "Error: Result mismatch"
23 |                       << std::endl;
24 |             std::cout << "i = " << i
25 |                       << " Reference result = " << reference_results[i]
26 |                       << " Kernel result = " << kernel_results[i]
27 |                       << std::endl;
28 |             exit(EXIT_FAILURE);
29 |         }
30 |     }
31 | }
32 | 
33 | 
34 | // Benchmark SSSP on the "hw" target: time pull and pull-push runs, print average time and GTEPS.
35 | void bench_sssp(uint32_t num_channels, uint32_t spmv_out_buf_len,
36 |                 uint32_t spmspv_out_buf_len, uint32_t vec_buf_len,
37 |                 std::string bitstream, std::string dataset, uint32_t num_iterations) {
38 |     graphlily::app::SSSP sssp(num_channels, spmv_out_buf_len, spmspv_out_buf_len, vec_buf_len);
39 |     sssp.set_target("hw");
40 |     sssp.set_up_runtime(bitstream);
41 | 
42 |     bool skip_empty_rows = true;
43 |     sssp.load_and_format_matrix(dataset, skip_empty_rows);
44 |     std::cout << "finished load_and_format_matrix" << std::endl;
45 |     sssp.send_matrix_host_to_device();
46 |     uint32_t source = 0;
47 |     auto reference_results = sssp.compute_reference_results(source, num_iterations);
48 | 
49 |     // Pull
50 |     auto kernel_results = sssp.pull(source, num_iterations);
51 |     // verify<graphlily::val_t>(reference_results, kernel_results);
52 |     // std::cout << "SSSP pull passed" << std::endl;
53 | 
54 |     uint32_t num_runs = 1;
55 |     auto t1 = std::chrono::high_resolution_clock::now();
56 |     for (size_t i = 0; i < num_runs; i++) {
57 |         kernel_results = sssp.pull(source, num_iterations);
58 |     }
59 |     auto t2 = std::chrono::high_resolution_clock::now();
60 |     float average_time_in_sec = float(std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count())
61 |         / 1000000 / num_runs;
62 |     std::cout << "Pull average_time: " << average_time_in_sec * 1000 << " ms" << std::endl;
63 |     uint32_t nnz = sssp.get_nnz();
64 |     double op_count = double(nnz) * num_iterations;  // widen first: a uint32_t product can wrap
65 |     double throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec;
66 |     std::cout << "Pull Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl;
67 | 
68 |     // Pull-Push
69 |     float threshold = 0.001;
70 |     kernel_results = sssp.pull_push(source, num_iterations, threshold);
71 |     // verify<graphlily::val_t>(reference_results, kernel_results);
72 |     // std::cout << "SSSP pull-push passed" << std::endl;
73 | 
74 |     num_runs = 1;
75 |     t1 = std::chrono::high_resolution_clock::now();
76 |     for (size_t i = 0; i < num_runs; i++) {
77 |         kernel_results = sssp.pull_push(source, num_iterations, threshold);
78 |     }
79 |     t2 = std::chrono::high_resolution_clock::now();
80 |     average_time_in_sec = float(std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1).count())
81 |         / 1000000 / num_runs;
82 |     std::cout << "Pull-Push average_time: " << average_time_in_sec * 1000 << " ms" << std::endl;
83 |     throughput = op_count / 1000 / 1000 / 1000 / average_time_in_sec;
84 |     std::cout << "Pull-Push Compute THROUGHPUT = " << throughput << " GTEPS" << std::endl;
85 | }
86 | 
87 | 
88 | int main(int argc, char *argv[]) {
89 |     if (argc < 8) {  // argv[1..7] are read below; a shorter command line would read past argc
90 |         std::cerr << "Usage: bench_sssp <num_channels> <spmv_out_buf_len> <spmspv_out_buf_len>"
91 |                   << " <vec_buf_len> <bitstream> <dataset> <num_iterations>" << std::endl;
92 |         return EXIT_FAILURE;
93 |     }
94 |     bench_sssp(strtol(argv[1], NULL, 10), strtol(argv[2], NULL, 10), strtol(argv[3], NULL, 10),
95 |                strtol(argv[4], NULL, 10), argv[5], argv[6], strtol(argv[7], NULL, 10));
96 | }
97 | 
98 | #pragma GCC diagnostic pop
99 | 


--------------------------------------------------------------------------------
/benchmark/run_bfs.sh:
--------------------------------------------------------------------------------
 1 | make bench_bfs || exit 1  # stop if the build fails instead of running a missing or stale binary
 2 | # Settings below are forwarded to bench_bfs as positional arguments, once per dataset.
 3 | num_channels=16
 4 | spmv_out_buf_len=1024000
 5 | spmspv_out_buf_len=256000
 6 | vec_buf_len=30720
 7 | 
 8 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/
 9 | bitstream+=open_source_166MHz/overlay.xclbin
10 | 
11 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph
12 | 
13 | DATASETS=(gplus_108K_13M_csr_float32.npz
14 |           ogbl_ppa_576K_42M_csr_float32.npz
15 |           hollywood_1M_113M_csr_float32.npz
16 |           pokec_1633K_31M_csr_float32.npz
17 |           ogbn_products_2M_124M_csr_float32.npz
18 |           orkut_3M_213M_csr_float32.npz)
19 | # Iteration count for each dataset, in the same order as DATASETS.
20 | NUM_ITER=(7 11 10 11 23 6)
21 | 
22 | BUILD_DIR=./build
23 | 
24 | for ((i = 0; i < ${#DATASETS[@]}; i++)) do
25 |     echo "${BUILD_DIR}/bench_bfs" "${DATASETS[i]}"
26 |     "${BUILD_DIR}/bench_bfs" "$num_channels" "$spmv_out_buf_len" "$spmspv_out_buf_len" "$vec_buf_len" \
27 |         "$bitstream" "$DATASET_PATH/${DATASETS[i]}" "${NUM_ITER[i]}"
28 | done
29 | 


--------------------------------------------------------------------------------
/benchmark/run_pagerank.sh:
--------------------------------------------------------------------------------
 1 | make bench_pagerank || exit 1  # stop if the build fails instead of running a missing or stale binary
 2 | # Settings below are forwarded to bench_pagerank as positional arguments, once per dataset.
 3 | num_channels=16
 4 | spmv_out_buf_len=1024000
 5 | vec_buf_len=30720
 6 | 
 7 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/
 8 | bitstream+=open_source_166MHz/overlay.xclbin
 9 | 
10 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph
11 | 
12 | DATASETS=(gplus_108K_13M_csr_float32.npz
13 |           ogbl_ppa_576K_42M_csr_float32.npz
14 |           hollywood_1M_113M_csr_float32.npz
15 |           pokec_1633K_31M_csr_float32.npz
16 |           ogbn_products_2M_124M_csr_float32.npz
17 |           orkut_3M_213M_csr_float32.npz)
18 | 
19 | BUILD_DIR=./build
20 | 
21 | for ((i = 0; i < ${#DATASETS[@]}; i++)) do
22 |     echo "${BUILD_DIR}/bench_pagerank" "${DATASETS[i]}"
23 |     "${BUILD_DIR}/bench_pagerank" "$num_channels" "$spmv_out_buf_len" "$vec_buf_len" "$bitstream" "$DATASET_PATH/${DATASETS[i]}"
24 | done
25 | 


--------------------------------------------------------------------------------
/benchmark/run_spmv.sh:
--------------------------------------------------------------------------------
 1 | make bench_spmv
 2 | 
 3 | num_channels=16
 4 | spmv_out_buf_len=1024000
 5 | vec_buf_len=30720
 6 | 
 7 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/
 8 | bitstream+=open_source_166MHz/overlay.xclbin
 9 | 
10 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph
11 | 
12 | DATSETS=(gplus_108K_13M_csr_float32.npz
13 |          ogbl_ppa_576K_42M_csr_float32.npz
14 |          hollywood_1M_113M_csr_float32.npz
15 |          pokec_1633K_31M_csr_float32.npz
16 |          ogbn_products_2M_124M_csr_float32.npz
17 |          orkut_3M_213M_csr_float32.npz)
18 | 
19 | BUILD_DIR=./build
20 | 
21 | for ((i = 0; i < ${#DATSETS[@]}; i++)) do
22 |     echo ${BUILD_DIR}/bench_spmv ${DATSETS[i]}
23 |     ${BUILD_DIR}/bench_spmv $num_channels $spmv_out_buf_len $vec_buf_len $bitstream $DATASET_PATH/${DATSETS[i]}
24 | done
25 | 


--------------------------------------------------------------------------------
/benchmark/run_sssp.sh:
--------------------------------------------------------------------------------
 1 | make bench_sssp
 2 | 
 3 | num_channels=16
 4 | spmv_out_buf_len=1024000
 5 | spmspv_out_buf_len=256000
 6 | vec_buf_len=30720
 7 | 
 8 | bitstream=/work/shared/common/project_build/graphblas/bitstreams/
 9 | bitstream+=open_source_166MHz/overlay.xclbin
10 | 
11 | DATASET_PATH=/work/shared/common/project_build/graphblas/data/sparse_matrix_graph
12 | 
13 | DATSETS=(gplus_108K_13M_csr_float32.npz
14 |          ogbl_ppa_576K_42M_csr_float32.npz
15 |          hollywood_1M_113M_csr_float32.npz
16 |          pokec_1633K_31M_csr_float32.npz
17 |          ogbn_products_2M_124M_csr_float32.npz
18 |          orkut_3M_213M_csr_float32.npz)
19 | 
20 | NUM_ITER=(7 11 10 11 23 6)
21 | 
22 | BUILD_DIR=./build
23 | 
24 | for ((i = 0; i < ${#DATSETS[@]}; i++)) do
25 |     echo ${BUILD_DIR}/bench_sssp ${DATSETS[i]}
26 |     ${BUILD_DIR}/bench_sssp $num_channels $spmv_out_buf_len $spmspv_out_buf_len $vec_buf_len \
27 |         $bitstream $DATASET_PATH/${DATSETS[i]} ${NUM_ITER[i]}
28 | done
29 | 


--------------------------------------------------------------------------------
/generate_bitstream/Makefile:
--------------------------------------------------------------------------------
 1 | HOST_ARCH = x86
 2 | 
 3 | CXXFLAGS += -Wall -O3 -g -std=c++11
 4 | CXXFLAGS += -I$(GRAPHLILY_ROOT_PATH)
 5 | 
 6 | LDFLAGS += -lrt -lstdc++
 7 | 
 8 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/xcl2/xcl2.mk
 9 | CXXFLAGS += $(xcl2_CXXFLAGS)
10 | LDFLAGS += $(xcl2_LDFLAGS)
11 | 
12 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/opencl/opencl.mk
13 | CXXFLAGS += $(opencl_CXXFLAGS)
14 | LDFLAGS += $(opencl_LDFLAGS)
15 | 
16 | BUILD_DIR = ./build
17 | synthesize: synthesize.cpp $(xcl2_SRCS)
18 | 	g++ $(CXXFLAGS) synthesize.cpp $(xcl2_SRCS) -o $@ $(LDFLAGS)
19 | 	mkdir -p $(BUILD_DIR)
20 | 	mv synthesize $(BUILD_DIR)/
21 | 	cd $(BUILD_DIR); ./synthesize
22 | 


--------------------------------------------------------------------------------
/generate_bitstream/synthesize.cpp:
--------------------------------------------------------------------------------
 1 | #include "graphlily/synthesizer/overlay_synthesizer.h"
 2 | 
 3 | 
 4 | int main(int argc, char *argv[]) {
 5 |     uint32_t spmv_out_buf_len = 1000 * 1024;
 6 |     uint32_t spmspv_out_buf_len = 250 * 1024;
 7 |     uint32_t vec_buf_len = 30 * 1024;
 8 |     uint32_t num_hbm_channels = 16;
 9 | 
10 |     graphlily::synthesizer::OverlaySynthesizer synthesizer(num_hbm_channels,
11 |                                                            spmv_out_buf_len,
12 |                                                            spmspv_out_buf_len,
13 |                                                            vec_buf_len);
14 |     synthesizer.set_target("hw");
15 |     synthesizer.synthesize();
16 | }
17 | 


--------------------------------------------------------------------------------
/graphlily/app/module_collection.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_MODULE_COLLECTION_H_
  2 | #define GRAPHLILY_MODULE_COLLECTION_H_
  3 | 
  4 | #include "graphlily/global.h"
  5 | #include "graphlily/module/base_module.h"
  6 | 
  7 | 
  8 | namespace graphlily {
  9 | namespace app {
 10 | 
 11 | using namespace module;
 12 | 
 13 | class ModuleCollection {
 14 | protected:
 15 |     /*! \brief The modules */
 16 |     std::vector<BaseModule*> modules_;
 17 |     /*! \brief The number of modules */
 18 |     uint32_t num_modules_ = 0;
 19 |     /*! \brief The kernel names */
 20 |     std::vector<std::string> kernel_names_;
 21 |     /*! \brief The target; can be sw_emu, hw_emu, hw */
 22 |     std::string target_;
 23 | 
 24 |     // OpenCL runtime
 25 |     cl::Device device_;
 26 |     cl::Context context_;
 27 |     std::vector<cl::Kernel> kernels_;
 28 |     std::vector<cl::CommandQueue> command_queues_;
 29 | 
 30 | public:
 31 |     ModuleCollection() {};
 32 | 
 33 |     /*!
 34 |      * \brief Free up resources in the destructor.
 35 |      */
 36 |     ~ModuleCollection() {
 37 |         for (size_t i = 0; i < this->num_modules_; i++) {
 38 |             delete this->modules_[i];
 39 |         }
 40 |     }
 41 | 
 42 |     /*!
 43 |      * \brief Add a module.
 44 |      * \param module The module to be added.
 45 |      */
 46 |     void add_module(BaseModule *module) {
 47 |         this->modules_.push_back(module);
 48 |         std::string kernel_name = module->get_kernel_name();
 49 |         this->kernel_names_.push_back(kernel_name);
 50 |         this->num_modules_++;
 51 |     }
 52 | 
 53 |     /*!
 54 |      * \brief Set the target.
 55 |      */
 56 |     void set_target(std::string target) {
 57 |        assert(target == "sw_emu" || target == "hw_emu" || target == "hw");
 58 |        this->target_ = target;
 59 |     }
 60 | 
 61 |     /*!
 62 |      * \brief Load the xclbin file and set up runtime.
 63 |      * \param xclbin_file_path The xclbin file path.
 64 |      */
 65 |     void set_up_runtime(std::string xclbin_file_path);
 66 | };
 67 | 
 68 | 
 69 | void ModuleCollection::set_up_runtime(std::string xclbin_file_path) {
 70 |     this->kernels_.resize(this->num_modules_);
 71 |     this->command_queues_.resize(this->num_modules_);
 72 |     cl_int err;
 73 |     // Set this->device_ and this->context_
 74 |     if (this->target_ == "sw_emu" || this->target_ == "hw_emu") {
 75 |         setenv("XCL_EMULATION_MODE", this->target_.c_str(), true);
 76 |     }
 77 |     this->device_ = graphlily::find_device();
 78 |     this->context_ = cl::Context(this->device_, NULL, NULL, NULL);
 79 |     // Set this->kernels_
 80 |     auto file_buf = xcl::read_binary_file(xclbin_file_path);
 81 |     cl::Program::Binaries binaries{{file_buf.data(), file_buf.size()}};
 82 |     cl::Program program(this->context_, {this->device_}, binaries, NULL, &err);
 83 |     if (err != CL_SUCCESS) {
 84 |         std::cout << "Failed to program device with xclbin file\n";
 85 |     } else {
 86 |         std::cout << "Successfully programmed device with xclbin file\n";
 87 |     }
 88 |     for (size_t i = 0; i < this->num_modules_; i++) {
 89 |         OCL_CHECK(err, this->kernels_[i] = cl::Kernel(program, this->kernel_names_[i].c_str(), &err));
 90 |     }
 91 |     // Set this->command_queues_
 92 |     for (size_t i = 0; i < this->num_modules_; i++) {
 93 |         OCL_CHECK(err, this->command_queues_[i] = cl::CommandQueue(this->context_,
 94 |                                                                    this->device_,
 95 |                                                                    CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE |
 96 |                                                                    CL_QUEUE_PROFILING_ENABLE,
 97 |                                                                    &err));
 98 |     }
 99 |     // Set up runtime for each module
100 |     for (size_t i = 0; i < this->num_modules_; i++) {
101 |         this->modules_[i]->set_device(this->device_);
102 |         this->modules_[i]->set_context(this->context_);
103 |         this->modules_[i]->set_kernel(this->kernels_[i]);
104 |         this->modules_[i]->set_command_queue(this->command_queues_[i]);
105 |     }
106 |     // Set unused arguments for each module
107 |     for (size_t i = 0; i < this->num_modules_; i++) {
108 |         this->modules_[i]->set_unused_args();
109 |     }
110 |     // Set the mode for each module
111 |     for (size_t i = 0; i < this->num_modules_; i++) {
112 |         this->modules_[i]->set_mode();
113 |     }
114 | }
115 | 
116 | }  // namespace app
117 | }  // namespace graphlily
118 | 
119 | #endif  // GRAPHLILY_MODULE_COLLECTION_H_
120 | 


--------------------------------------------------------------------------------
/graphlily/app/pagerank.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_APP_PAGERANK_H_
  2 | #define GRAPHLILY_APP_PAGERANK_H_
  3 | 
  4 | #include "graphlily/app/module_collection.h"
  5 | #include "graphlily/module/spmv_module.h"
  6 | #include "graphlily/module/add_scalar_vector_dense_module.h"
  7 | #include "graphlily/io/data_loader.h"
  8 | #include "graphlily/io/data_formatter.h"
  9 | 
 10 | #include <iostream>
 11 | #include <chrono>
 12 | 
 13 | 
 14 | namespace graphlily {
 15 | namespace app {
 16 | 
 17 | class PageRank : public app::ModuleCollection {
 18 | private:
 19 |     // modules
 20 |     graphlily::module::SpMVModule<graphlily::val_t, graphlily::val_t> *SpMV_;
 21 |     graphlily::module::eWiseAddModule<graphlily::val_t> *eWiseAdd_;
 22 |     // Sparse matrix size
 23 |     uint32_t matrix_num_rows_;
 24 |     uint32_t matrix_num_cols_;
 25 |     // SpMV kernel configuration
 26 |     uint32_t num_channels_;
 27 |     uint32_t spmv_out_buf_len_;
 28 |     uint32_t vec_buf_len_;
 29 |     // Semiring
 30 |     graphlily::SemiringType semiring_ = graphlily::ArithmeticSemiring;
 31 |     // Data types
 32 |     using aligned_dense_vec_t = graphlily::aligned_dense_vec_t;
 33 |     using aligned_sparse_vec_t = graphlily::aligned_sparse_vec_t;
 34 |     using aligned_dense_float_vec_t = graphlily::aligned_dense_float_vec_t;
 35 | 
 36 | public:
 37 |     PageRank(uint32_t num_channels, uint32_t spmv_out_buf_len, uint32_t vec_buf_len) {
 38 |         this->num_channels_ = num_channels;
 39 |         this->spmv_out_buf_len_ = spmv_out_buf_len;
 40 |         this->vec_buf_len_ = vec_buf_len;
 41 | 
 42 |         this->SpMV_ = new graphlily::module::SpMVModule<graphlily::val_t, graphlily::val_t>(
 43 |             this->num_channels_,
 44 |             this->spmv_out_buf_len_,
 45 |             this->vec_buf_len_);
 46 |         this->SpMV_->set_semiring(semiring_);
 47 |         this->SpMV_->set_mask_type(graphlily::kNoMask);
 48 |         this->add_module(this->SpMV_);
 49 | 
 50 |         this->eWiseAdd_ = new graphlily::module::eWiseAddModule<graphlily::val_t>();
 51 |         this->add_module(this->eWiseAdd_);
 52 |     }
 53 | 
 54 | 
 55 |     uint32_t get_nnz() {
 56 |         return this->SpMV_->get_nnz();
 57 |     }
 58 | 
 59 | 
 60 |     void load_and_format_matrix(std::string csr_float_npz_path, float damping, bool skip_empty_rows) {
 61 |         CSRMatrix<float> csr_matrix = graphlily::io::load_csr_matrix_from_float_npz(csr_float_npz_path);
 62 |         graphlily::io::util_round_csr_matrix_dim(
 63 |             csr_matrix,
 64 |             this->num_channels_ * graphlily::pack_size,
 65 |             this->num_channels_ * graphlily::pack_size);
 66 |         graphlily::io::util_normalize_csr_matrix_by_outdegree(csr_matrix);
 67 |         for (auto &x : csr_matrix.adj_data) x = x * damping;
 68 |         this->SpMV_->load_and_format_matrix(csr_matrix, skip_empty_rows);
 69 |         this->matrix_num_rows_ = this->SpMV_->get_num_rows();
 70 |         this->matrix_num_cols_ = this->SpMV_->get_num_cols();
 71 |         assert(this->matrix_num_rows_ == this->matrix_num_cols_);
 72 |     }
 73 | 
 74 | 
 75 |     void send_matrix_host_to_device() {
 76 |         this->SpMV_->send_matrix_host_to_device();
 77 |     }
 78 | 
 79 | 
 80 |     aligned_dense_vec_t pull(graphlily::val_t damping, uint32_t num_iterations) {
 81 |         aligned_dense_vec_t rank(this->matrix_num_rows_, 1.0 / this->matrix_num_rows_);
 82 |         this->SpMV_->send_vector_host_to_device(rank);
 83 |         this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf);
 84 |         this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf);
 85 |         for (size_t iter = 1; iter <= num_iterations; iter++) {
 86 |             this->SpMV_->run();
 87 |             this->eWiseAdd_->run(this->matrix_num_rows_, (1 - damping) / this->matrix_num_rows_);
 88 |         }
 89 |         return this->SpMV_->send_vector_device_to_host();
 90 |     }
 91 | 
 92 | 
 93 |     aligned_dense_vec_t pull_time_breakdown(graphlily::val_t damping, uint32_t num_iterations) {
 94 |         float total_time_ms = 0.0;
 95 |         float spmv_time_ms = 0.0;
 96 |         float ewise_time_ms = 0.0;
 97 |         float data_transfer_time_ms = 0.0;
 98 |         // Initialize
 99 |         auto total_time_start = std::chrono::high_resolution_clock::now();
100 |         auto spmv_time_start = std::chrono::high_resolution_clock::now();
101 |         auto ewise_time_start = std::chrono::high_resolution_clock::now();
102 |         auto data_transfer_time_start = std::chrono::high_resolution_clock::now();
103 |         auto total_time_end = std::chrono::high_resolution_clock::now();
104 |         auto spmv_time_end = std::chrono::high_resolution_clock::now();
105 |         auto ewise_time_end = std::chrono::high_resolution_clock::now();
106 |         auto data_transfer_time_end = std::chrono::high_resolution_clock::now();
107 | 
108 |         data_transfer_time_start = std::chrono::high_resolution_clock::now();
109 |         aligned_dense_vec_t rank(this->matrix_num_rows_, 1.0 / this->matrix_num_rows_);
110 |         this->SpMV_->send_vector_host_to_device(rank);
111 |         this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf);
112 |         this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf);
113 |         data_transfer_time_end = std::chrono::high_resolution_clock::now();
114 |         data_transfer_time_ms += float(std::chrono::duration_cast<std::chrono::microseconds>(
115 |             data_transfer_time_end - data_transfer_time_start).count()) / 1000;
116 | 
117 |         for (size_t iter = 1; iter <= num_iterations; iter++) {
118 |             spmv_time_start = std::chrono::high_resolution_clock::now();
119 |             this->SpMV_->run();
120 |             spmv_time_end = std::chrono::high_resolution_clock::now();
121 |             spmv_time_ms += float(std::chrono::duration_cast<std::chrono::microseconds>(
122 |                 spmv_time_end - spmv_time_start).count()) / 1000;
123 | 
124 |             ewise_time_start = std::chrono::high_resolution_clock::now();
125 |             this->eWiseAdd_->run(this->matrix_num_rows_, (1 - damping) / this->matrix_num_rows_);
126 |             ewise_time_end = std::chrono::high_resolution_clock::now();
127 |             ewise_time_ms += float(std::chrono::duration_cast<std::chrono::microseconds>(
128 |                 ewise_time_end - ewise_time_start).count()) / 1000;
129 |         }
130 | 
131 |         data_transfer_time_start = std::chrono::high_resolution_clock::now();
132 |         auto result = this->SpMV_->send_mask_device_to_host();  // the mask of SpMV on the host is not valid
133 |         data_transfer_time_end = std::chrono::high_resolution_clock::now();
134 |         data_transfer_time_ms += float(std::chrono::duration_cast<std::chrono::microseconds>(
135 |             data_transfer_time_end - data_transfer_time_start).count()) / 1000;
136 | 
137 |         total_time_end = std::chrono::high_resolution_clock::now();
138 |         total_time_ms = float(std::chrono::duration_cast<std::chrono::microseconds>(
139 |             total_time_end - total_time_start).count()) / 1000;
140 | 
141 |         std::cout << "total_time_ms per iteration: " << total_time_ms / num_iterations << std::endl;
142 |         std::cout << "spmv_time_ms per iteration: " << spmv_time_ms / num_iterations << std::endl;
143 |         std::cout << "ewise_time_ms per iteration: " << ewise_time_ms / num_iterations << std::endl;
144 |         std::cout << "data_transfer_time_ms per iteration: " << data_transfer_time_ms / num_iterations << std::endl;
145 | 
146 |         return result;
147 |     }
148 | 
149 | 
150 |     aligned_dense_float_vec_t compute_reference_results(float damping, uint32_t num_iterations) {
151 |         aligned_dense_float_vec_t rank(this->matrix_num_rows_, 1.0 / this->matrix_num_rows_);
152 |         for (size_t iter = 1; iter <= num_iterations; iter++) {
153 |             rank = this->SpMV_->compute_reference_results(rank);
154 |             rank = this->eWiseAdd_->compute_reference_results(rank,
155 |                                                               this->matrix_num_rows_,
156 |                                                               (1 - damping) / this->matrix_num_rows_);
157 |         }
158 |         return rank;
159 |     }
160 | };
161 | 
162 | }  // namespace app
163 | }  // namespace graphlily
164 | 
165 | #endif  // GRAPHLILY_APP_PAGERANK_H_
166 | 


--------------------------------------------------------------------------------
/graphlily/app/sssp.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_APP_SSSP_H_
  2 | #define GRAPHLILY_APP_SSSP_H_
  3 | 
  4 | #include "graphlily/app/module_collection.h"
  5 | #include "graphlily/module/spmv_module.h"
  6 | #include "graphlily/module/spmspv_module.h"
  7 | #include "graphlily/module/assign_vector_dense_module.h"
  8 | #include "graphlily/module/assign_vector_sparse_module.h"
  9 | #include "graphlily/module/add_scalar_vector_dense_module.h"
 10 | #include "graphlily/io/data_loader.h"
 11 | #include "graphlily/io/data_formatter.h"
 12 | 
 13 | 
 14 | namespace {
 15 | 
 16 | void _preprocess(CSRMatrix<float>& csr_matrix) {
 17 |     // randomly initialize edge weights
 18 |     for (uint32_t i = 0; i < csr_matrix.adj_data.size(); i++) {
 19 |         // csr_matrix.adj_data[i] = i % 10 + 1;
 20 |         csr_matrix.adj_data[i] = 1;  // When all edge weights are 1, SSSP becomes BFS
 21 |     }
 22 |     // add self edges and set their weights to zero
 23 |     uint32_t num_rows = csr_matrix.adj_indptr.size() - 1;
 24 |     std::vector<uint32_t> nnz_each_row(num_rows);
 25 |     for (size_t i = 0; i < num_rows; i++) {
 26 |         nnz_each_row[i] = csr_matrix.adj_indptr[i + 1] - csr_matrix.adj_indptr[i];
 27 |     }
 28 |     csr_matrix.adj_data.reserve(csr_matrix.adj_data.size() + num_rows);
 29 |     csr_matrix.adj_indices.reserve(csr_matrix.adj_indices.size() + num_rows);
 30 |     for (size_t row_idx = 0; row_idx < num_rows; row_idx++) {
 31 |         uint32_t start = csr_matrix.adj_indptr[row_idx];
 32 |         uint32_t end = csr_matrix.adj_indptr[row_idx + 1];
 33 |         if (start == end) {
 34 |             csr_matrix.adj_data.insert(csr_matrix.adj_data.begin() + start, float(0));
 35 |             csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin() + start, row_idx);
 36 |             nnz_each_row[row_idx]++;
 37 |         } else {
 38 |             bool add_self_edge = false;
 39 |             for (size_t i = start; i < end; i++) {
 40 |                 uint32_t col_idx = csr_matrix.adj_indices[i];
 41 |                 if (col_idx == row_idx) {
 42 |                     csr_matrix.adj_data[i] = float(0);
 43 |                     break;
 44 |                 } else if (col_idx > row_idx) {
 45 |                     add_self_edge = true;
 46 |                     csr_matrix.adj_data.insert(csr_matrix.adj_data.begin() + i, float(0));
 47 |                     csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin() + i, row_idx);
 48 |                     break;
 49 |                 } else if (i == (end - 1)) {
 50 |                     add_self_edge = true;
 51 |                     csr_matrix.adj_data.insert(csr_matrix.adj_data.begin() + i, float(0));
 52 |                     csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin() + i, row_idx);
 53 |                     break;
 54 |                 }
 55 |             }
 56 |             if (add_self_edge) {
 57 |                 nnz_each_row[row_idx]++;
 58 |             }
 59 |         }
 60 |         csr_matrix.adj_indptr[row_idx + 1] = csr_matrix.adj_indptr[row_idx] + nnz_each_row[row_idx];
 61 |     }
 62 | }
 63 | 
 64 | }  // namespace
 65 | 
 66 | 
 67 | namespace graphlily {
 68 | namespace app {
 69 | 
 70 | class SSSP : public app::ModuleCollection {
 71 | private:
 72 |     // modules
 73 |     module::SpMVModule<graphlily::val_t, graphlily::val_t> *SpMV_;
 74 |     module::AssignVectorDenseModule<graphlily::val_t> *DenseAssign_;
 75 |     module::SpMSpVModule<graphlily::val_t, graphlily::val_t, graphlily::idx_val_t> *SpMSpV_;
 76 |     module::AssignVectorSparseModule<graphlily::val_t, graphlily::idx_val_t> *SparseAssign_;
 77 |     module::eWiseAddModule<graphlily::val_t> *eWiseAdd_;  // for on-device data transfer
 78 |     // Sparse matrix size
 79 |     uint32_t matrix_num_rows_;
 80 |     uint32_t matrix_num_cols_;
 81 |     // SpMV kernel configuration
 82 |     uint32_t num_channels_;
 83 |     uint32_t spmv_out_buf_len_;
 84 |     uint32_t spmspv_out_buf_len_;
 85 |     uint32_t vec_buf_len_;
 86 |     // Semiring
 87 |     graphlily::SemiringType semiring_ = graphlily::TropicalSemiring;
 88 |     // Data types
 89 |     using aligned_dense_vec_t = graphlily::aligned_dense_vec_t;
 90 |     using aligned_sparse_vec_t = graphlily::aligned_sparse_vec_t;
 91 |     using aligned_dense_float_vec_t = graphlily::aligned_dense_float_vec_t;
 92 | 
 93 | public:
 94 |     SSSP(uint32_t num_channels, uint32_t spmv_out_buf_len,
 95 |             uint32_t spmspv_out_buf_len, uint32_t vec_buf_len) {
 96 |         this->num_channels_ = num_channels;
 97 |         this->spmv_out_buf_len_ = spmv_out_buf_len;
 98 |         this->spmspv_out_buf_len_ = spmspv_out_buf_len;
 99 |         this->vec_buf_len_ = vec_buf_len;
100 | 
101 |         this->SpMV_ = new module::SpMVModule<graphlily::val_t, graphlily::val_t>(
102 |             this->num_channels_,
103 |             this->spmv_out_buf_len_,
104 |             this->vec_buf_len_);
105 |         this->SpMV_->set_semiring(semiring_);
106 |         this->SpMV_->set_mask_type(graphlily::kNoMask);
107 |         this->add_module(this->SpMV_);
108 | 
109 |         this->SpMSpV_ = new module::SpMSpVModule<graphlily::val_t, graphlily::val_t, graphlily::idx_val_t>(
110 |             spmspv_out_buf_len);
111 |         this->SpMSpV_->set_semiring(semiring_);
112 |         this->SpMSpV_->set_mask_type(graphlily::kNoMask);
113 |         this->add_module(this->SpMSpV_);
114 | 
115 |         bool generate_new_frontier = true;
116 |         this->SparseAssign_ = new module::AssignVectorSparseModule<graphlily::val_t, graphlily::idx_val_t>(
117 |             generate_new_frontier);
118 |         this->add_module(this->SparseAssign_);
119 | 
120 |         this->eWiseAdd_ = new module::eWiseAddModule<graphlily::val_t>();
121 |         this->add_module(this->eWiseAdd_);
122 |     }
123 | 
124 | 
125 |     uint32_t get_nnz() {
126 |         return this->SpMV_->get_nnz();
127 |     }
128 | 
129 | 
130 |     void load_and_format_matrix(std::string csr_float_npz_path, bool skip_empty_rows) {
131 |         CSRMatrix<float> csr_matrix = graphlily::io::load_csr_matrix_from_float_npz(csr_float_npz_path);
132 |         _preprocess(csr_matrix);
133 |         graphlily::io::util_round_csr_matrix_dim(
134 |             csr_matrix,
135 |             this->num_channels_ * graphlily::pack_size,
136 |             this->num_channels_ * graphlily::pack_size);
137 |         CSCMatrix<float> csc_matrix = graphlily::io::csr2csc(csr_matrix);
138 |         this->SpMV_->load_and_format_matrix(csr_matrix, skip_empty_rows);
139 |         this->SpMSpV_->load_and_format_matrix(csc_matrix);
140 |         this->matrix_num_rows_ = this->SpMV_->get_num_rows();
141 |         this->matrix_num_cols_ = this->SpMV_->get_num_cols();
142 |         assert(this->matrix_num_rows_ == this->matrix_num_cols_);
143 |     }
144 | 
145 | 
146 |     void send_matrix_host_to_device() {
147 |         this->SpMV_->send_matrix_host_to_device();
148 |         this->SpMSpV_->send_matrix_host_to_device();
149 |     }
150 | 
151 | 
152 |     aligned_dense_vec_t pull(uint32_t source, uint32_t num_iterations) {
153 |         aligned_dense_vec_t input(this->matrix_num_rows_, semiring_.zero);
154 |         input[source] = 0;
155 |         this->SpMV_->send_vector_host_to_device(input);
156 |         this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf);
157 |         this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf);
158 |         for (size_t iter = 1; iter <= num_iterations; iter++) {
159 |             this->SpMV_->run();
160 |             // this->SpMV_->copy_buffer_device_to_device(this->SpMV_->results_buf,
161 |             //                                           this->SpMV_->vector_buf,
162 |             //                                           sizeof(graphlily::val_t) * this->matrix_num_rows_);
163 |             this->eWiseAdd_->run(this->matrix_num_rows_, 0);
164 |         }
165 |         return this->SpMV_->send_vector_device_to_host();
166 |     }
167 | 
168 | 
169 |     aligned_dense_vec_t push(uint32_t source, uint32_t num_iterations) {
170 |         // The sparse input vector
171 |         aligned_sparse_vec_t spmspv_input(2);
172 |         idx_val_t head;
173 |         graphlily::idx_t nnz = 1;  // one source vertex
174 |         head.index = nnz;
175 |         spmspv_input[0] = head;
176 |         spmspv_input[1] = {source, 0};
177 | 
178 |         // The dense distance vector
179 |         aligned_dense_vec_t distance(this->matrix_num_rows_, semiring_.zero);
180 |         distance[source] = 0;
181 | 
182 |         // Push
183 |         this->SpMSpV_->send_vector_host_to_device(spmspv_input);
184 |         this->SpMSpV_->send_mask_host_to_device(distance);
185 |         this->SparseAssign_->bind_mask_buf(this->SpMSpV_->results_buf);
186 |         this->SparseAssign_->bind_inout_buf(this->SpMSpV_->mask_buf);
187 |         this->SparseAssign_->bind_new_frontier_buf(this->SpMSpV_->vector_buf);
188 |         for (size_t iter = 1; iter <= num_iterations; iter++) {
189 |             this->SpMSpV_->run();
190 |             this->SparseAssign_->run();
191 |         }
192 | 
193 |         return this->SpMSpV_->send_mask_device_to_host();
194 |     }
195 | 
196 | 
197 |     aligned_dense_vec_t pull_push(uint32_t source, uint32_t num_iterations, float threshold = 0.05) {
198 |         // The sparse input vector
199 |         aligned_sparse_vec_t spmspv_input(2);
200 |         idx_val_t head;
201 |         graphlily::idx_t nnz = 1;  // one source vertex
202 |         head.index = nnz;
203 |         spmspv_input[0] = head;
204 |         spmspv_input[1] = {source, 0};
205 | 
206 |         // The dense distance vector
207 |         aligned_dense_vec_t distance(this->matrix_num_rows_, semiring_.zero);
208 |         distance[source] = 0;
209 | 
210 |         // Push
211 |         this->SpMSpV_->send_vector_host_to_device(spmspv_input);
212 |         this->SpMSpV_->send_mask_host_to_device(distance);
213 |         this->SparseAssign_->bind_mask_buf(this->SpMSpV_->results_buf);
214 |         this->SparseAssign_->bind_inout_buf(this->SpMSpV_->mask_buf);
215 |         this->SparseAssign_->bind_new_frontier_buf(this->SpMSpV_->vector_buf);
216 |         uint32_t iter = 1;
217 |         uint32_t vector_nnz;
218 |         do {
219 |             this->SpMSpV_->run();
220 |             this->SparseAssign_->run();
221 |             vector_nnz = this->SpMSpV_->get_results_nnz();
222 |             iter++;
223 |         } while (iter < num_iterations && (float(vector_nnz) / this->matrix_num_rows_ < threshold));
224 | 
225 |         std::cout << "SpMSpV runs for " << (iter - 1) << " iterations" << std::endl;
226 | 
227 |         // Switch from push to pull
228 |         aligned_dense_vec_t spmv_input = this->SpMSpV_->send_mask_device_to_host();
229 |         this->SpMV_->send_vector_host_to_device(spmv_input);
230 |         this->eWiseAdd_->bind_in_buf(this->SpMV_->results_buf);
231 |         this->eWiseAdd_->bind_out_buf(this->SpMV_->vector_buf);
232 | 
233 |         // Pull
234 |         for ( ; iter <= num_iterations; iter++) {
235 |             this->SpMV_->run();
236 |             // this->SpMV_->copy_buffer_device_to_device(this->SpMV_->results_buf,
237 |             //                                           this->SpMV_->vector_buf,
238 |             //                                           sizeof(graphlily::val_t) * this->matrix_num_rows_);
239 |             this->eWiseAdd_->run(this->matrix_num_rows_, 0);
240 |         }
241 | 
242 |         return this->SpMV_->send_vector_device_to_host();
243 |     }
244 | 
245 | 
246 |     aligned_dense_float_vec_t compute_reference_results(uint32_t source, uint32_t num_iterations) {
247 |         aligned_dense_float_vec_t input(this->matrix_num_rows_, semiring_.zero);
248 |         input[source] = 0;
249 |         for (size_t iter = 1; iter <= num_iterations; iter++) {
250 |             input = this->SpMV_->compute_reference_results(input);
251 |         }
252 |         return input;
253 |     }
254 | };
255 | 
256 | }  // namespace app
257 | }  // namespace graphlily
258 | 
259 | #endif  // GRAPHLILY_APP_SSSP_H_
260 | 


--------------------------------------------------------------------------------
/graphlily/global.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_GLOBAL_H_
  2 | #define GRAPHLILY_GLOBAL_H_
  3 | 
  4 | #include <string>
  5 | #include <cstdlib>
  6 | 
  7 | #include "ap_fixed.h"
  8 | #include "xcl2.hpp"
  9 | 
 10 | 
 11 | namespace {
 12 | 
 13 | std::string get_root_path() {
 14 |     char* root_path = getenv("GRAPHLILY_ROOT_PATH");
 15 |     return root_path == NULL ? std::string("") : std::string(root_path);
 16 | }
 17 | 
 18 | }  // namespace
 19 | 
 20 | 
 21 | namespace graphlily {
 22 | 
 23 | // The root path
 24 | const std::string root_path = get_root_path();
 25 | 
 26 | // The device
 27 | const std::string device_name = "xilinx_u280_xdma_201920_3";
 28 | 
 29 | // Find the device
 30 | cl::Device find_device() {
 31 |     auto devices = xcl::get_xil_devices();
 32 |     for (size_t i = 0; i < devices.size(); i++) {
 33 |         cl::Device device = devices[i];
 34 |         if (device.getInfo<CL_DEVICE_NAME>() == device_name) {
 35 |             return device;
 36 |         }
 37 |     }
 38 |     std::cout << "Failed to find " << device_name << ", exit!\n";
 39 |     exit(EXIT_FAILURE);
 40 | }
 41 | 
 42 | // HBM channels
 43 | #define MAX_HBM_CHANNEL_COUNT 32
 44 | #define CHANNEL_NAME(n) n | XCL_MEM_TOPOLOGY
 45 | const int HBM[MAX_HBM_CHANNEL_COUNT] = {
 46 |     CHANNEL_NAME(0),  CHANNEL_NAME(1),  CHANNEL_NAME(2),  CHANNEL_NAME(3),  CHANNEL_NAME(4),
 47 |     CHANNEL_NAME(5),  CHANNEL_NAME(6),  CHANNEL_NAME(7),  CHANNEL_NAME(8),  CHANNEL_NAME(9),
 48 |     CHANNEL_NAME(10), CHANNEL_NAME(11), CHANNEL_NAME(12), CHANNEL_NAME(13), CHANNEL_NAME(14),
 49 |     CHANNEL_NAME(15), CHANNEL_NAME(16), CHANNEL_NAME(17), CHANNEL_NAME(18), CHANNEL_NAME(19),
 50 |     CHANNEL_NAME(20), CHANNEL_NAME(21), CHANNEL_NAME(22), CHANNEL_NAME(23), CHANNEL_NAME(24),
 51 |     CHANNEL_NAME(25), CHANNEL_NAME(26), CHANNEL_NAME(27), CHANNEL_NAME(28), CHANNEL_NAME(29),
 52 |     CHANNEL_NAME(30), CHANNEL_NAME(31)};
 53 | 
 54 | const int DDR[2] = {CHANNEL_NAME(32), CHANNEL_NAME(33)};
 55 | 
 56 | // Kernel configurations
      // NOTE(review): pack_size and spmv_row_interleave_factor carry the same values as
      // PACK_SIZE and SPMV_ROW_INTERLEAVE_FACTOR in hw/overlay.h; presumably the host
      // and kernel sides must be kept in sync by hand - confirm.
 57 | const uint32_t pack_size = 8;  // number of elements in one packed_idx_t
 58 | const uint32_t spmv_row_interleave_factor = 1;
 59 | const uint32_t num_hbm_channels = 16;
 60 | 
 61 | // Data types (please change this according to the kernel!)
      // Exactly one val_t alias is active; the other two are kept as commented-out
      // alternatives.
 62 | // using val_t = unsigned;
 63 | using val_t = ap_ufixed<32, 8, AP_RND, AP_SAT>;
 64 | // using val_t = float;
 65 | typedef uint32_t idx_t;
 66 | const uint32_t idx_marker = 0xffffffff;  // all 32 bits set
 67 | typedef struct {idx_t data[pack_size];} packed_idx_t;
 68 | 
      // (index, value) pairs: idx_val_t stores the value as val_t, idx_float_t as float.
 69 | typedef struct {idx_t index; val_t val;} idx_val_t;
 70 | typedef struct {idx_t index; float val;} idx_float_t;
 71 | 
      // Dense and sparse vectors of val_t, allocated through aligned_allocator.
 72 | using aligned_dense_vec_t = std::vector<val_t, aligned_allocator<val_t>>;
 73 | using aligned_sparse_vec_t = std::vector<idx_val_t, aligned_allocator<idx_val_t>>;
 74 | 
      // The same two containers with float values.
 75 | using aligned_dense_float_vec_t = std::vector<float, aligned_allocator<float>>;
 76 | using aligned_sparse_float_vec_t = std::vector<idx_float_t, aligned_allocator<idx_float_t>>;
 77 | 
      // "Infinity" stand-ins, one per candidate val_t. All three are stored as the
      // currently active val_t.
      // NOTE(review): with val_t = ap_ufixed<32, 8, AP_RND, AP_SAT>, the UINT_INF and
      // FLOAT_INF initializers do not fit in 8 integer bits and presumably saturate -
      // confirm that only the constant matching the active val_t is meant to be used.
 78 | const val_t UINT_INF = 0xffffffff;
 79 | const val_t UFIXED_INF = 255;
 80 | const val_t FLOAT_INF = 999999999;
 81 | 
 82 | // Operation type, named as k<opx><op+>
      // The numeric values equal the MULADD / ANDOR / ADDMIN macros in hw/overlay.h.
 83 | enum OperationType {
 84 |     kMulAdd = 0,
 85 |     kLogicalAndOr = 1,
 86 |     kAddMin = 2,
 87 | };
 88 | 
 89 | // Semiring definition
      // Bundles the operation pair with the identity element of each operator.
 90 | struct SemiringType {
 91 |     OperationType op;
 92 |     val_t one;  // identity element for operator <x> (a <x> one = a)
 93 |     val_t zero;  // identity element for operator <+> (a <+> zero = a)
 94 | };
 95 | 
      // Predefined semirings, written as {op, one, zero}.
 96 | const SemiringType ArithmeticSemiring = {kMulAdd, 1, 0};
 97 | const SemiringType LogicalSemiring = {kLogicalAndOr, 1, 0};
      // The active TropicalSemiring uses UFIXED_INF as zero; the commented-out variants
      // use the other INF constants and correspond to the other val_t choices.
 98 | // const SemiringType TropicalSemiring = {kAddMin, 0, UINT_INF};
 99 | const SemiringType TropicalSemiring = {kAddMin, 0, UFIXED_INF};
100 | // const SemiringType TropicalSemiring = {kAddMin, 0, FLOAT_INF};
101 | 
102 | // Mask type
      // The numeric values equal the NOMASK / WRITETOZERO / WRITETOONE macros in
      // hw/overlay.h.
103 | enum MaskType {
104 |     kNoMask = 0,
105 |     kMaskWriteToZero = 1,
106 |     kMaskWriteToOne = 2,
107 | };
108 | 
109 | // Makefile for synthesizing xclbin
      // makefile_prologue defines the variables (DEVICE, TEMP_DIR, BUILD_DIR, VPP,
      // CLFLAGS, FUSED_KERNEL) and the emconfig.json / build targets.
      // makefile_epilogue holds the rule that links $(KERNEL_OBJS) into
      // $(FUSED_KERNEL). The per-kernel text between the two is produced by
      // add_kernel_to_makefile(), which appends to LDCLFLAGS and KERNEL_OBJS.
110 | const std::string makefile_prologue =
111 |     "DEVICE = /opt/xilinx/platforms/" + device_name + "/" + device_name + ".xpfm\n"
112 |     "\n"
113 |     "TEMP_DIR := ./_x.$(TARGET)\n"
114 |     "BUILD_DIR := ./build_dir.$(TARGET)\n"
115 |     "\n"
116 |     "VPP := v++\n"
117 |     "\n"
118 |     "CLFLAGS += -t $(TARGET) --platform $(DEVICE) --kernel_frequency 200 --save-temps\n"
119 |     "\n"
120 |     "FUSED_KERNEL = $(BUILD_DIR)/fused.xclbin\n"
121 |     "\n"
122 |     "emconfig.json:\n"
123 |     "\temconfigutil --platform $(DEVICE)\n"
124 |     "\n"
125 |     "build: $(FUSED_KERNEL) emconfig.json\n"
126 |     "\n";
127 | 
      // TARGET is referenced but not defined in either string, so it has to be
      // supplied from outside (e.g. on the make command line).
128 | const std::string makefile_epilogue =
129 |     "$(FUSED_KERNEL): $(KERNEL_OBJS)\n"
130 |     "\tmkdir -p $(BUILD_DIR)\n"
131 |     "\t$(VPP) $(CLFLAGS) --temp_dir $(BUILD_DIR) -l $(LDCLFLAGS) -o'$@' $(+)\n";
132 | 
// Builds the Makefile fragment for one kernel.
// The fragment
//   - appends "--config <kernel_name>.ini" to LDCLFLAGS,
//   - appends "$(TEMP_DIR)/<kernel_name>.xo" to KERNEL_OBJS, and
//   - adds the rule that compiles <kernel_name>.cpp into that .xo with $(VPP).
// Returns the fragment as a string, ending with a blank line.
// Declared inline: the definition lives in a header, and without inline every
// translation unit including it would emit its own external definition
// (multiple-definition link error).
inline std::string add_kernel_to_makefile(std::string kernel_name) {
    const std::string xo_target = "$(TEMP_DIR)/" + kernel_name + ".xo";

    std::string makefile_body;
    makefile_body += "LDCLFLAGS += --config " + kernel_name + ".ini\n";
    makefile_body += "KERNEL_OBJS += " + xo_target + "\n";
    makefile_body += "\n";
    makefile_body += xo_target + ": " + kernel_name + ".cpp\n";
    makefile_body += "\tmkdir -p $(TEMP_DIR)\n";
    makefile_body += "\t$(VPP) $(CLFLAGS) --temp_dir $(TEMP_DIR) -c -k " + kernel_name
                     + " -I'$(<D)' -o'$@' '$<'\n";
    makefile_body += "\n";
    return makefile_body;
}
144 | 
145 | // Project folder name
      // NOTE(review): the code that uses this name is not in this header; presumably
      // it is the directory the synthesizer generates its project into - confirm.
146 | const std::string proj_folder_name = "proj";
147 | 
148 | //------------------------------------------
149 | // Utilities
150 | //------------------------------------------
151 | 
// Convert a sparse vector to a dense vector.
//   sparse_vector: element 0 is a header whose `index` field holds the number of
//                  non-zero entries (nnz); elements 1..nnz are (index, val) pairs.
//   range:         length of the returned dense vector.
//   zero:          value stored at every position that has no sparse entry.
// Returns a dense vector of `range` elements. An empty sparse_vector (no header
// element) is treated as nnz == 0 instead of reading element 0.
// Entry indices are not range-checked; they must be smaller than `range`.
template<typename sparse_vec_t, typename dense_vec_t, typename val_t>
dense_vec_t convert_sparse_vec_to_dense_vec(const sparse_vec_t &sparse_vector,
                                            uint32_t range,
                                            val_t zero) {
    dense_vec_t dense_vector(range);
    std::fill(dense_vector.begin(), dense_vector.end(), zero);
    if (sparse_vector.empty()) {
        return dense_vector;
    }
    // Keep the count unsigned and wide: narrowing it to int would turn a large
    // header value negative and silently skip every entry.
    const size_t nnz = sparse_vector[0].index;
    for (size_t i = 1; i <= nnz; i++) {
        dense_vector[sparse_vector[i].index] = sparse_vector[i].val;
    }
    return dense_vector;
}
165 | 
// used to calculate BANK_ID_NBITS
// Returns the base-2 logarithm of x when x is a power of two (1 -> 0, 2 -> 1,
// 4 -> 2, ...). Any other input, including 0, returns 0, which keeps the fallback
// value the previous lookup table used for unsupported inputs.
// Powers of two above 16 are now handled as well instead of silently mapping to 0.
// Declared inline: the definition lives in a header, and without inline every
// translation unit including it would emit its own external definition.
inline unsigned log2(unsigned x) {
    const bool is_power_of_two = (x != 0) && ((x & (x - 1)) == 0);
    if (!is_power_of_two) {
        return 0;
    }
    unsigned nbits = 0;
    while (x > 1) {
        x >>= 1;
        nbits++;
    }
    return nbits;
}
177 | 
178 | }  // namespace graphlily
179 | 
180 | #endif  // GRAPHLILY_GLOBAL_H_
181 | 


--------------------------------------------------------------------------------
/graphlily/hw/kernel_add_scalar_vector_dense_impl.h:
--------------------------------------------------------------------------------
 1 | #include "./overlay.h"
 2 | 
 3 | #include <assert.h>
 4 | 
 5 | 
 6 | void kernel_add_scalar_vector_dense(
 7 |     const PACKED_VAL_T *in,  // The input vector
 8 |     PACKED_VAL_T *out,       // The output vector
 9 |     unsigned length,         // The length of the in/out vector
10 |     VAL_T val                // The value to be added
11 | ) {
12 |     assert(length % PACK_SIZE == 0);
13 |     unsigned size = length / PACK_SIZE;
14 |     PACKED_VAL_T tmp_in;
15 |     PACKED_VAL_T tmp_out;
16 | 
17 |     loop_kernel_add_scalar_vector_dense:
18 |     for (int i = 0; i < size; i++) {
19 |         #pragma HLS PIPELINE II=1
20 |         tmp_in = in[i];
21 |         for (int k = 0; k < PACK_SIZE; k++) {
22 |             #pragma HLS UNROLL
23 |             tmp_out.data[k] = tmp_in.data[k] + val;
24 |         }
25 |         out[i] = tmp_out;
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/graphlily/hw/kernel_assign_vector_dense_impl.h:
--------------------------------------------------------------------------------
 1 | #include "./overlay.h"
 2 | 
 3 | #include <assert.h>
 4 | #include <iostream>
 5 | #include <cstdlib>
 6 | 
 7 | 
 8 | void kernel_assign_vector_dense(
 9 |     const PACKED_VAL_T *mask,  // The mask vector
10 |     PACKED_VAL_T *in,          // The input vector
11 |     PACKED_VAL_T *out,         // The output vector
12 |     unsigned length,           // The length of the mask/inout vector
13 |     VAL_T val,                 // The value to be assigned to the inout vector
14 |     MASK_T mask_type           // The mask type
15 | ) {
16 |     assert(length % PACK_SIZE == 0);
17 |     unsigned size = length / PACK_SIZE;
18 |     PACKED_VAL_T tmp_mask;
19 |     PACKED_VAL_T tmp_inout;
20 | 
21 |     loop_kernel_assign_vector_dense:
22 |     for (int i = 0; i < size; i++) {
23 |         #pragma HLS PIPELINE II=1
24 | 
25 |         tmp_mask = mask[i];
26 |         tmp_inout = in[i];
27 |         for (int k = 0; k < PACK_SIZE; k++) {
28 |             #pragma HLS UNROLL
29 |             if (mask_type == WRITETOZERO) {
30 |                 if (tmp_mask.data[k] == 0) {
31 |                     tmp_inout.data[k] = val;
32 |                 }
33 |             } else if (mask_type == WRITETOONE) {
34 |                 if (tmp_mask.data[k] != 0) {
35 |                     tmp_inout.data[k] = val;
36 |                 }
37 |             } else {
38 |                 tmp_inout.data[k] = 0;
39 |                 #ifndef __SYNTHESIS__
40 |                 std::cout << "Invalid mask type" << std::endl;
41 |                 exit(EXIT_FAILURE);
42 |                 #endif
43 |             }
44 |         }
45 |         out[i] = tmp_inout;
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/graphlily/hw/kernel_assign_vector_sparse_new_frontier_impl.h:
--------------------------------------------------------------------------------
 1 | #include "./overlay.h"
 2 | 
 3 | 
 4 | void kernel_assign_vector_sparse_new_frontier(
 5 |     const IDX_VAL_T *mask,    // The sparse mask vector. The index field of the first element is the length.
 6 |     VAL_T *inout,             // The inout vector.
 7 |     IDX_VAL_T *new_frontier   // The new frontier. The index field of the first element is the length.
 8 | ) {
 9 |     // local buffer
10 |     VAL_T local_inout_buf[BATCH_SIZE];
11 |     IDX_VAL_T local_mask_buf[BATCH_SIZE];
12 |     IDX_VAL_T local_new_frontier_buf[BATCH_SIZE];
13 |     #pragma HLS DATA_PACK variable=local_mask_buf
14 |     #pragma HLS DATA_PACK variable=local_new_frontier_buf
15 | 
16 |     IDX_T length = mask[0].index;
17 |     unsigned num_batches = (length + BATCH_SIZE - 1) / BATCH_SIZE;
18 |     unsigned remain = length;
19 | 
20 |     unsigned new_frontier_length = 0;
21 | 
22 |     loop_over_batches:
23 |     for (unsigned batch_cnt = 0; batch_cnt < num_batches; batch_cnt++) {
24 |         #pragma HLS pipeline off
25 |         unsigned batch_new_frontier_length = 0;
26 | 
27 |         // read stage
28 |         loop_read_inout_val:
29 |         for (unsigned i = 0; i < BATCH_SIZE; i++) {
30 |             #pragma HLS pipeline II=1
31 |             if (i < remain) {
32 |                 IDX_VAL_T tmp_mask = mask[i + 1 + batch_cnt * BATCH_SIZE];
33 |                 local_mask_buf[i].index = tmp_mask.index;
34 |                 local_mask_buf[i].val = tmp_mask.val;
35 |                 local_inout_buf[i] = inout[tmp_mask.index];
36 |             }
37 |         }
38 | 
39 |         // process stage
40 |         loop_process:
41 |         for (unsigned i = 0; i < BATCH_SIZE; i++) {
42 |             #pragma HLS pipeline II=1
43 |             if (i < remain) {
44 |                 if (local_inout_buf[i] > local_mask_buf[i].val) {
45 |                     local_inout_buf[i] = local_mask_buf[i].val;
46 |                     local_new_frontier_buf[batch_new_frontier_length] = local_mask_buf[i];
47 |                     batch_new_frontier_length++;
48 |                 }
49 |             }
50 |         }
51 | 
52 |         // write inout
53 |         loop_write_inout_val:
54 |         for (unsigned i = 0; i < BATCH_SIZE; i++) {
55 |             #pragma HLS pipeline II=1
56 |             if (i < remain) {
57 |                 inout[local_mask_buf[i].index] = local_inout_buf[i];
58 |             }
59 |         }
60 | 
61 |         // write new_frontier
62 |         loop_write_new_frontier:
63 |         for (unsigned i = 0; i < batch_new_frontier_length; i++) {
64 |             #pragma HLS pipeline II=1
65 |             new_frontier[i + 1 + new_frontier_length] = local_new_frontier_buf[i];
66 |         }
67 |         new_frontier_length += batch_new_frontier_length;
68 | 
69 |         // update progress
70 |         remain -= BATCH_SIZE;
71 |     }
72 | 
73 |     // attach head to new_frontier
74 |     IDX_VAL_T new_frontier_head;
75 |     new_frontier_head.index = new_frontier_length;
76 |     new_frontier_head.val = 0;
77 |     new_frontier[0] = new_frontier_head;
78 | }
79 | 


--------------------------------------------------------------------------------
/graphlily/hw/kernel_assign_vector_sparse_no_new_frontier_impl.h:
--------------------------------------------------------------------------------
 1 | #include "./overlay.h"
 2 | 
 3 | 
 4 | void kernel_assign_vector_sparse_no_new_frontier(
 5 |     const IDX_VAL_T *mask,  // The sparse mask vector. The index field of the first element is the length.
 6 |     VAL_T *inout,           // The inout vector.
 7 |     VAL_T val               // The value to be assigned to the inout vector.
 8 | ) {
 9 |     // local buffer
10 |     VAL_T local_inout_buf[BATCH_SIZE];
11 |     IDX_VAL_T local_mask_buf[BATCH_SIZE];
12 |     #pragma HLS DATA_PACK variable=local_mask_buf
13 | 
14 |     IDX_T length = mask[0].index;
15 |     unsigned num_batches = (length + BATCH_SIZE - 1) / BATCH_SIZE;
16 |     unsigned remain = length;
17 | 
18 |     loop_over_batches:
19 |     for (unsigned batch_cnt = 0; batch_cnt < num_batches; batch_cnt++) {
20 |         #pragma HLS pipeline off
21 | 
22 |         // read stage
23 |         loop_read_inout_val:
24 |         for (unsigned i = 0; i < BATCH_SIZE; i++) {
25 |             #pragma HLS pipeline II=1
26 |             if (i < remain) {
27 |                 IDX_VAL_T tmp_mask = mask[i + 1 + batch_cnt * BATCH_SIZE];
28 |                 local_mask_buf[i].index = tmp_mask.index;
29 |                 local_mask_buf[i].val = tmp_mask.val;
30 |                 local_inout_buf[i] = inout[tmp_mask.index];
31 |             }
32 |         }
33 | 
34 |         // process stage
35 |         loop_process:
36 |         for (unsigned i = 0; i < BATCH_SIZE; i++) {
37 |             #pragma HLS pipeline II=1
38 |             if (i < remain) {
39 |                 local_inout_buf[i] = val;
40 |             }
41 |         }
42 | 
43 |         // write inout
44 |         loop_write_inout_val:
45 |         for (unsigned i = 0; i < BATCH_SIZE; i++) {
46 |             #pragma HLS pipeline II=1
47 |             if (i < remain) {
48 |                 inout[local_mask_buf[i].index] = local_inout_buf[i];
49 |             }
50 |         }
51 | 
52 |         // update progress
53 |         remain -= BATCH_SIZE;
54 |     }
55 | }
56 | 


--------------------------------------------------------------------------------
/graphlily/hw/math_constants.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRAPHLILY_HW_MATH_CONSTANTS_H_
 2 | #define GRAPHLILY_HW_MATH_CONSTANTS_H_
 3 | 
 4 | #include "ap_fixed.h"
 5 | 
 6 | const unsigned UINT_INF = 0xffffffff;  // "infinity" for unsigned values: all 32 bits set
     // "infinity" for the fixed-point value type; overlay.h uses it as AddMinZero.
     // NOTE(review): 255 is presumably chosen as the largest integer that fits the
     // 8 integer bits of ap_ufixed<32, 8> - confirm.
 7 | const ap_ufixed<32, 8, AP_RND, AP_SAT> UFIXED_INF = 255;
 8 | const float FLOAT_INF = 999999999;  // "infinity" for float values: a large finite number
 9 | 
10 | #endif  // GRAPHLILY_HW_MATH_CONSTANTS_H_
11 | 


--------------------------------------------------------------------------------
/graphlily/hw/overlay.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRAPHLILY_HW_OVERLAY_H_
 2 | #define GRAPHLILY_HW_OVERLAY_H_
 3 | 
 4 | #include "ap_fixed.h"
 5 | #include "./math_constants.h"
 6 | 
 7 | #define IDX_MARKER 0xffffffff  // all 32 bits set; same value as idx_marker in global.h
 8 | 
     // Packing and banking parameters.
 9 | const unsigned PACK_SIZE = 8;  // number of elements in PACKED_IDX_T / PACKED_VAL_T
10 | const unsigned NUM_PORT_PER_BANK = 1;
11 | const unsigned NUM_BANK_PER_HBM_CHANNEL = PACK_SIZE / NUM_PORT_PER_BANK;
     // NOTE(review): 3 and 7 equal log2(8) and 8 - 1 for the 8 banks per channel
     // computed above, but they are hard-coded - presumably they must be updated by
     // hand when PACK_SIZE or NUM_PORT_PER_BANK changes.
12 | const unsigned BANK_ID_NBITS = 3;
13 | const unsigned BANK_ID_MASK = 7;
14 | 
15 | const unsigned SPMV_ROW_INTERLEAVE_FACTOR = 1;
16 | 
17 | // data types
18 | typedef unsigned IDX_T;
19 | typedef struct {IDX_T data[PACK_SIZE];} PACKED_IDX_T;
20 | 
     // Exactly one VAL_T typedef is active; the other two are kept as commented-out
     // alternatives.
21 | // typedef unsigned VAL_T;
22 | typedef ap_ufixed<32, 8, AP_RND, AP_SAT> VAL_T;
23 | // typedef float VAL_T;
24 | typedef struct {VAL_T data[PACK_SIZE];} PACKED_VAL_T;
25 | 
     // One matrix packet: PACK_SIZE indices together with PACK_SIZE values.
26 | typedef struct {
27 |    PACKED_IDX_T indices;
28 |    PACKED_VAL_T vals;
29 | } SPMV_MAT_PKT_T;
30 | 
     // SpMSpV uses the same packet layout as SpMV.
31 | typedef SPMV_MAT_PKT_T SPMSPV_MAT_PKT_T;
32 | 
     // One (index, value) pair, the element type of the sparse vectors.
33 | typedef struct {IDX_T index; VAL_T val;} IDX_VAL_T;
34 | 
35 | // semiring
     // Operation codes; the values equal the OperationType enum in global.h.
36 | typedef char OP_T;
37 | #define MULADD 0
38 | #define ANDOR  1
39 | #define ADDMIN 2
40 | 
     // Per-semiring "zero" values. The active AddMinZero uses UFIXED_INF; the
     // commented-out variants use the other INF constants from math_constants.h.
41 | const VAL_T MulAddZero = 0;
42 | const VAL_T AndOrZero  = 0;
43 | // const VAL_T AddMinZero = UINT_INF;
44 | const VAL_T AddMinZero = UFIXED_INF;
45 | // const VAL_T AddMinZero = FLOAT_INF;
46 | 
     // Per-semiring "one" values.
47 | const VAL_T MulAddOne = 1;
48 | const VAL_T AndOrOne  = 1;
49 | const VAL_T AddMinOne = 0;
50 | 
51 | // mask type
     // Mask codes; the values equal the MaskType enum in global.h.
52 | typedef char MASK_T;
53 | #define NOMASK      0
54 | #define WRITETOZERO 1
55 | #define WRITETOONE  2
56 | 
57 | // Kernel configurations
58 | const unsigned FIFO_DEPTH = 64;
59 | const unsigned BATCH_SIZE = 128;  // entries per batch in the sparse assign kernels
60 | 
61 | // Below kernel configurations will be overwritten by the compiler
62 | // const unsigned SPMV_OUT_BUF_LEN =;
63 | // const unsigned SPMSPV_OUT_BUF_LEN =;
64 | // const unsigned VEC_BUF_LEN =;
65 | // #define NUM_HBM_CHANNEL
66 | // #define SPMV_NUM_PE_TOTAL
67 | 
     // NOTE(review): the closing #endif of the include guard is commented out here, so
     // this header is incomplete as written; presumably the tool that fills in the
     // values above also appends the real #endif - confirm.
68 | // #endif  // GRAPHLILY_HW_OVERLAY_H_
69 | 


--------------------------------------------------------------------------------
/graphlily/hw/shuffle.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_HW_SHUFFLE_H_
  2 | #define GRAPHLILY_HW_SHUFFLE_H_
  3 | 
  4 | #include <iostream>
  5 | #include <iomanip>
  6 | 
  7 | #include "hls_stream.h"
  8 | 
  9 | #include "./util.h"
 10 | 
 11 | 
 12 | #ifndef __SYNTHESIS__
      // Debug switch that exists only outside HLS synthesis.
      // NOTE(review): nothing in this header reads it; presumably a testbench sets it
      // to enable line tracing - confirm.
 13 | bool line_tracing_shuffle_1p = false;
 14 | #endif
 15 | 
 16 | // Pipeline register: returns its input unchanged.
 17 | // The latency pragma below fixes its latency at exactly 5 (min=5, max=5), the same
      // constraint that arbiter_1p carries.
 18 | // Only use this function to wrap signals that travel along stage A!
 19 | template<typename PayloadT>
 20 | PayloadT pipereg_stage_A (PayloadT in) {
 21 |     #pragma HLS pipeline II=1
 22 |     #pragma HLS latency min=5 max=5
 23 |     return in;
 24 | }
 25 | 
 26 | //------------------------------------------------------------
 27 | // arbiters
 28 | //------------------------------------------------------------
 29 | 
 30 | // TODO: Do we really need to expose addr_mask as an argument? Can it be inferred?
 31 | // TODO: Is num_in_lane always equal to num_out_lane?
      //
      // arbiter_1p: decides, for one cycle, which input lane each output lane accepts.
      // An input lane requests output lane (in_addr & addr_mask). For every output
      // lane at most one valid requester is selected; the search order starts at input
      // lane `rotate_priority` and wraps around.
      // Outputs:
      //   xbar_sel[o]  - selected input lane for output lane o (0 when none)
      //   out_valid[o] - true when output lane o has a selected input lane
      //   in_resend[i] - true when input lane i was valid but not selected
      // Returns the number of input lanes that were granted.
 32 | template<unsigned num_in_lane, unsigned num_out_lane, unsigned addr_mask>
 33 | unsigned arbiter_1p(
 34 |     unsigned in_addr[num_in_lane],     // address presented by each input lane
 35 |     bool     in_valid[num_in_lane],    // whether each input lane holds a payload
 36 |     // bool     in_granted[num_in_lane],
 37 |     bool     in_resend[num_in_lane],   // out: valid lanes that lost arbitration
 38 |     unsigned xbar_sel[num_out_lane],   // out: selected input lane per output lane
 39 |     bool     out_valid[num_out_lane],  // out: whether xbar_sel[o] is meaningful
 40 |     unsigned rotate_priority           // input lane that is searched first
 41 | ) {
 42 |     #pragma HLS pipeline II=1
 43 |     #pragma HLS latency min=5 max=5
 44 |     // #pragma HLS inline
 45 | 
 46 |     // bool in_granted[num_in_lane];
 47 |     // #pragma HLS array_partition variable=in_granted complete
 48 | 
 49 |     // static unsigned in_addr[num_in_lane];
 50 |     // #pragma HLS array_partition variable=in_addr complete
 51 |     // loop_A_extract_addr:
 52 |     // for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
 53 |     //     in_addr[ILid] = in_payload[ILid].index;
 54 |     // }
 55 | 
 56 |     // prioritized valid and addr
          // (rotated copies of the inputs: position 0 holds input lane rotate_priority)
 57 |     bool arb_p_in_valid[num_in_lane];
 58 |     #pragma HLS array_partition variable=arb_p_in_valid complete
 59 |     unsigned arb_p_in_addr[num_in_lane];
 60 |     #pragma HLS array_partition variable=arb_p_in_addr complete
 61 | 
 62 |     array_shift_left<unsigned, num_in_lane>(in_addr, arb_p_in_addr, rotate_priority);
 63 |     array_shift_left<bool, num_in_lane>(in_valid, arb_p_in_valid, rotate_priority);
 64 | 
          // For each output lane, find the requester with the lowest rotated position.
 65 |     loop_A_arbsearch:
 66 |     for (unsigned OLid = 0; OLid < num_out_lane; OLid++) {
 67 |         #pragma HLS unroll
 68 |         bool found = false;
 69 |         unsigned chosen_port = 0;
 70 | 
              // Scans from the highest position down to 0; later matches overwrite
              // earlier ones, so the lowest matching position wins.
 71 |         loop_ab_logic_encoder_unroll:
 72 |         for (unsigned ILid_plus_1 = num_in_lane; ILid_plus_1 > 0; ILid_plus_1--) {
 73 |             #pragma HLS unroll
 74 |             if (arb_p_in_valid[ILid_plus_1 - 1] && ((arb_p_in_addr[ILid_plus_1 - 1] & addr_mask) == OLid)) {
 75 |                 chosen_port = ILid_plus_1 - 1;
 76 |                 found = true;
 77 |             }
 78 |         }
 79 |         if (!found) {
 80 |             out_valid[OLid] = false;
 81 |             xbar_sel[OLid] = 0;
 82 |         } else {
 83 |             out_valid[OLid] = true;
 84 |             xbar_sel[OLid] = chosen_port;
 85 |         }
 86 |     }
 87 | 
          // Undo the rotation: turn rotated positions back into real input lane ids
          // (only for output lanes that found a requester).
 88 |     array_cyclic_add<unsigned, num_out_lane, num_in_lane>(xbar_sel, out_valid, rotate_priority);
 89 | 
          // An input lane is granted when it is valid and the output lane it asked for
          // selected exactly this lane; every other valid lane must resend.
 90 |     unsigned grant_count = 0;
 91 |     loop_A_grant:
 92 |     for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
 93 |         #pragma HLS unroll
 94 |         unsigned requested_olid = in_addr[ILid] & addr_mask;
 95 |         bool in_granted = (in_valid[ILid]
 96 |                            && out_valid[requested_olid]
 97 |                            && (xbar_sel[requested_olid] == ILid));
 98 |         in_resend[ILid] = in_valid[ILid] && !in_granted;
 99 |         if (in_granted) grant_count++;
100 |     }
101 | 
102 |     // loop_A_resend:
103 |     // for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
104 |     //     #pragma HLS unroll
105 |     //     // resend path
106 |     //     in_resend[ILid] = in_valid[ILid] && !in_granted[ILid];
107 |     // }
108 | 
109 |     // loop_A_pass:
110 |     // for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
111 |     //     out_payload[ILid] = in_payload[ILid];
112 |     // }
113 | 
114 |     return grant_count;
115 | }
116 | 
117 | 
118 | /* shuffler-1p
119 |  * 1 downstream entitiy could process 1 payload per cycle
120 |  * type PayloadT must be a struct of 2 fields:
121 |  * unsigned index
122 |  * <PayloadValT> data
123 | */
124 | template<typename PayloadT, typename PayloadValT,
125 |          unsigned num_in_lane, unsigned num_out_lane, unsigned addr_mask>
126 | void shuffler_1p(
127 |     // fifos
128 |     hls::stream<PayloadT> input_lanes[num_in_lane],
129 |     hls::stream<PayloadT> output_lanes[num_out_lane],
130 |     // total number of payloads (avaliable after all payloads are put into the input lanes)
131 |     hls::stream<unsigned> &num_payloads_in,
132 |     // all outputs are in the output lanes
133 |     hls::stream<unsigned> &num_payloads_out
134 | ) {
135 |     // pipeline control variables
136 |     bool prev_finish = false;
137 |     unsigned payload_cnt = 0;
138 |     unsigned num_granted_A = 0;
139 |     unsigned num_granted_C = 0;
140 |     unsigned process_cnt = 0;
141 | 
142 |     // pipeline data registers before arbiter
143 |     PayloadT payload_F[num_in_lane];
144 |     PayloadT payload_A[num_in_lane];
145 |     unsigned addr_A[num_in_lane];
146 |     #pragma HLS array_partition variable=payload_F complete
147 |     #pragma HLS array_partition variable=payload_A complete
148 |     #pragma HLS array_partition variable=addr_A complete
149 | 
150 |     // pipeline data registers after arbiter
151 |     PayloadT payload_C[num_in_lane];
152 |     #pragma HLS array_partition variable=payload_C complete
153 | 
154 |     // pipeline valid registers before arbiter
155 |     bool valid_F[num_in_lane];
156 |     bool valid_A[num_in_lane];
157 |     #pragma HLS array_partition variable=valid_F complete
158 |     #pragma HLS array_partition variable=valid_A complete
159 | 
160 |     // pipeline valid registers after arbiter
161 |     unsigned xbar_sel_C[num_out_lane];
162 |     bool xbar_valid_C[num_out_lane];
163 |     bool valid_C[num_in_lane];
164 |     #pragma HLS array_partition variable=xbar_sel_C complete
165 |     #pragma HLS array_partition variable=xbar_valid_C complete
166 |     #pragma HLS array_partition variable=valid_C complete
167 | 
168 |     // resend control
169 |     PayloadT payload_resend[num_in_lane];
170 |     bool resend[num_in_lane];
171 |     #pragma HLS data_pack variable=payload_resend
172 |     #pragma HLS array_partition variable=payload_resend complete
173 |     #pragma HLS array_partition variable=resend complete
174 | 
175 |     // loop control
176 |     bool loop_exit = false;
177 | 
178 |     // arbiter inputs
179 |     unsigned arbiter_in_addr[num_in_lane];
180 |     bool arbiter_in_valid[num_in_lane];
181 |     #pragma HLS array_partition variable=arbiter_in_addr complete
182 |     #pragma HLS array_partition variable=arbiter_in_valid complete
183 | 
184 |     // arbiter outputs
185 |     // bool arbiter_in_granted[num_in_lane];
186 |     unsigned xbar_sel_A[num_out_lane];
187 |     bool xbar_valid_A[num_out_lane];
188 |     // #pragma HLS array_partition variable=arbiter_in_granted complete
189 |     #pragma HLS array_partition variable=xbar_sel_A complete
190 |     #pragma HLS array_partition variable=xbar_valid_A complete
191 | 
192 |     // arbiter priority rotation
193 |     unsigned rotate_priority = 0;
194 |     unsigned next_rotate_priority = 0;
195 | 
196 |     // reset
197 |     loop_rst_IL:
198 |     for (unsigned i = 0; i < num_in_lane; i++) {
199 |         #pragma HLS unroll
200 | 
201 |         payload_F[i].index = 0;
202 |         payload_A[i].index = 0;
203 |         addr_A[i] = 0;
204 |         payload_C[i].index = 0;
205 |         payload_F[i].data = (PayloadValT){0, 0};
206 |         payload_A[i].data = (PayloadValT){0, 0};
207 |         payload_C[i].data = (PayloadValT){0, 0};
208 |         payload_resend[i].index = 0;
209 |         payload_resend[i].data = (PayloadValT){0, 0};
210 | 
211 |         valid_A[i] = false;
212 |         valid_F[i] = false;
213 |         valid_C[i] = false;
214 | 
215 |         resend[i] = false;
216 |     }
217 | 
218 |     loop_reset_OL:
219 |     for (unsigned i = 0; i < num_out_lane; i++) {
220 |         xbar_sel_A[i] = 0;
221 |         xbar_valid_A[i] = false;
222 |         xbar_sel_C[i] = 0;
223 |         xbar_valid_C[i] = false;
224 |     }
225 | 
226 |     #ifndef __SYNTHESIS__
227 |     int cnt = 0;
228 |     #endif
229 | 
230 |     loop_shuffle_pipeline:
231 |     while (!loop_exit) {
232 |         #pragma HLS pipeline II=1
233 |         #pragma HLS latency min=7 max=7
234 |         #pragma HLS dependence variable=resend inter distance=6 RAW True
235 |         #pragma HLS dependence variable=payload_resend inter distance=6 RAW True
236 |         #pragma HLS dependence variable=loop_exit inter distance=8 RAW True
237 | 
238 |         // Fetch stage (F)
239 |         loop_F:
240 |         for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
241 |             #pragma HLS unroll
242 |             PayloadT payload;
243 |             if (!resend[ILid]) {
244 |                 valid_F[ILid] = input_lanes[ILid].read_nb(payload);
245 |             } else {
246 |                 payload.data = (PayloadValT){0, 0};
247 |                 payload.index = 0;
248 |                 valid_F[ILid] = true;
249 |             }
250 |             payload_F[ILid] = resend[ILid] ? payload_resend[ILid] : payload;
251 |         }
252 |         // ------- end of F stage
253 | 
254 |         // Arbiter stage (A)
255 |         loop_A_pass:
256 |         for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
257 |             #pragma HLS unroll
258 |             // prepare arbiter inputs
259 |             payload_A[ILid] = payload_F[ILid];
260 |             valid_A[ILid] = valid_F[ILid];
261 |             addr_A[ILid] = payload_F[ILid].index;
262 |             arbiter_in_valid[ILid] = valid_F[ILid];
263 |         }
264 |         rotate_priority = next_rotate_priority;
265 |         // pipeline arbiter, depth = 6
266 |         num_granted_A = arbiter_1p<num_in_lane, num_out_lane, addr_mask>(
267 |             addr_A,
268 |             arbiter_in_valid,
269 |             // arbiter_in_granted,
270 |             resend,
271 |             xbar_sel_A,
272 |             xbar_valid_A,
273 |             rotate_priority
274 |         );
275 |         next_rotate_priority = (rotate_priority + 1) % num_in_lane;
276 |         loop_A_fwd:
277 |         for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
278 |             #pragma HLS unroll
279 |             payload_resend[ILid] = pipereg_stage_A<PayloadT>(payload_A[ILid]);
280 |             // payload_resend[ILid].index = pipereg_stage_A<unsigned>(payload_A[ILid].index);
281 |             // payload_resend[ILid].data = pipereg_stage_A<PayloadValT>(payload_A[ILid].data);
282 |         }
283 |         // ------- end of A stage
284 | 
285 |         // crossbar stage (C)
286 |         loop_C_pass_il:
287 |         for (unsigned ILid = 0; ILid < num_in_lane; ILid++) {
288 |             #pragma HLS unroll
289 |             payload_C[ILid] = payload_A[ILid];
290 |             valid_C[ILid] = valid_A[ILid];
291 |         }
292 |         num_granted_C = num_granted_A;
293 |         loop_C_pass_ol:
294 |         for (unsigned OLid = 0; OLid < num_in_lane; OLid++) {
295 |             #pragma HLS unroll
296 |             xbar_sel_C[OLid] = xbar_sel_A[OLid];
297 |             xbar_valid_C[OLid] = xbar_valid_A[OLid];
298 |         }
299 |         loop_C_xbar:
300 |         for (unsigned OLid = 0; OLid < num_out_lane; OLid++) {
301 |             #pragma HLS unroll
302 |             if (xbar_valid_C[OLid]) {
303 |                 if (valid_C[xbar_sel_C[OLid]]) {
304 |                     output_lanes[OLid].write(payload_C[xbar_sel_C[OLid]]);
305 |                 }
306 |             }
307 |         }
308 |         // ------- end of C stage
309 | 
310 |         if (!prev_finish) { prev_finish = num_payloads_in.read_nb(payload_cnt); }
311 |         process_cnt += num_granted_C;
312 |         bool all_processed = (process_cnt == payload_cnt);
313 |         loop_exit = all_processed && prev_finish;
314 |     }
315 | 
316 |     num_payloads_out.write(payload_cnt);
317 | }
318 | 
319 | #endif  // GRAPHLILY_HW_SHUFFLE_H_
320 | 


--------------------------------------------------------------------------------
/graphlily/hw/util.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_HW_UTIL_H_
  2 | #define GRAPHLILY_HW_UTIL_H_
  3 | 
  4 | 
  5 | template<typename T, unsigned len>
  6 | void array_shift_left(T array[len], T array_dest[len], unsigned rotate) {
  7 |     #pragma HLS inline
  8 |     // #pragma HLS latency min=0 max=0
  9 |     #pragma HLS array_partition variable=array complete
 10 |     #pragma HLS array_partition variable=array_dest complete
 11 |     for (unsigned i = 0; i < len; i++) {
 12 |         #pragma HLS unroll
 13 |         array_dest[i] = array[(i + rotate) % len];
 14 |     }
 15 | }
 16 | 
 17 | 
 18 | template<typename T, unsigned len, unsigned maximum>
 19 | void array_cyclic_add(T array[len], bool array_valid[len], unsigned inc) {
 20 |     #pragma HLS inline
 21 |     // #pragma HLS latency min=0 max=0
 22 |     #pragma HLS array_partition variable=array complete
 23 |     for (unsigned i = 0; i < len; i++) {
 24 |         #pragma HLS unroll
 25 |         if (array_valid[i]) {
 26 |             array[i] = (array[i] + inc) % maximum;
 27 |         }
 28 |     }
 29 | }
 30 | 
 31 | 
 32 | template<unsigned len>
 33 | bool array_and_reduction(bool array[len]) {
 34 |     #pragma HLS inline
 35 |     #pragma HLS expression_balance
 36 |     bool result = true;
 37 |     for (unsigned i = 0; i < len; i++) {
 38 |         #pragma HLS unroll
 39 |         result = result && array[i];
 40 |     }
 41 |     return result;
 42 | }
 43 | 
 44 | 
 45 | template<unsigned len>
 46 | bool array_or_reduction(bool array[len]) {
 47 |     #pragma HLS inline
 48 |     #pragma HLS expression_balance
 49 |     bool result = false;
 50 |     for (unsigned i = 0; i < len; i++) {
 51 |         #pragma HLS unroll
 52 |         result = result || array[i];
 53 |     }
 54 |     return result;
 55 | }
 56 | 
 57 | 
 58 | template<unsigned len>
 59 | unsigned array_popcount(bool array[len]) {
 60 |     #pragma HLS pipeline II = 1
 61 |     #pragma HLS latency min=1 max=1
 62 |     #pragma HLS array_partition variable=array complete
 63 |     #pragma HLS expression_balance
 64 |     unsigned cnt = 0;
 65 |     for (unsigned i = 0; i < len; i++) {
 66 |         #pragma HLS unroll
 67 |         if (array[i]) {
 68 |             cnt++;
 69 |         }
 70 |     }
 71 |     return cnt;
 72 | }
 73 | 
 74 | 
 75 | template<typename T, unsigned len>
 76 | T array_sum(T array[len]) {
 77 |     #pragma HLS inline
 78 |     #pragma HLS expression_balance
 79 |     T result = 0;
 80 |     for (unsigned i = 0; i < len; i++) {
 81 |         #pragma HLS unroll
 82 |         result += array[i];
 83 |     }
 84 |     return result;
 85 | }
 86 | 
 87 | 
 88 | template<typename T, unsigned len>
 89 | T array_max(T array[len]) {
 90 |     #pragma HLS inline
 91 |     #pragma HLS expression_balance
 92 |     T result = 0;
 93 |     for (unsigned i = 0; i < len; i++) {
 94 |         #pragma HLS unroll
 95 |         result = (array[i] > result)? array[i] : result;
 96 |     }
 97 |     return result;
 98 | }
 99 | 
100 | 
// Identity function used to force a register: it is kept out-of-line
// (inline off), pipelined, and its return port is declared as registered.
// The value is passed through unchanged.
template<typename T>
T HLS_REG(T in) {
    #pragma HLS pipeline
    #pragma HLS inline off
    #pragma HLS interface port=return register
    T latched = in;
    return latched;
}
109 | 
110 | 
111 | // // Cyclic partitioning
112 | // unsigned get_bank_idx(unsigned full_addr) {
113 | //     return full_addr & BANK_ID_MASK;
114 | // }
115 | 
116 | 
117 | // // Cyclic partitioning
118 | // unsigned get_bank_address(unsigned full_addr) {
119 | //     return full_addr >> BANK_ID_NBITS;
120 | // }
121 | 
122 | #endif  // GRAPHLILY_HW_UTIL_H_
123 | 


--------------------------------------------------------------------------------
/graphlily/io/data_loader.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_IO_DATA_LOADER_H_
  2 | #define GRAPHLILY_IO_DATA_LOADER_H_
  3 | 
  4 | #include <cstdint>
  5 | #include <vector>
  6 | 
  7 | #include "cnpy.h"
  8 | 
  9 | 
 10 | namespace graphlily {
 11 | namespace io {
 12 | 
 13 | //--------------------------------------------------
 14 | // Compressed Sparse Row (CSR) format support
 15 | //--------------------------------------------------
 16 | 
// Data structure for csr matrix.
// Plain aggregate: the three vectors hold the usual CSR arrays and are filled
// in by the loader/creator functions in this header. No invariants are
// enforced by the struct itself.
template<typename data_type>
struct CSRMatrix {
    /*! \brief The number of rows of the sparse matrix */
    uint32_t num_rows;
    /*! \brief The number of columns of the sparse matrix */
    uint32_t num_cols;
    /*! \brief The non-zero data of the sparse matrix (one entry per non-zero) */
    std::vector<data_type> adj_data;
    /*! \brief The column indices of the sparse matrix (parallel to adj_data) */
    std::vector<uint32_t> adj_indices;
    /*!
     * \brief The index pointers of the sparse matrix.
     *        Row r owns entries [adj_indptr[r], adj_indptr[r + 1]) of adj_data/adj_indices;
     *        adj_indptr[num_rows] is read as the total number of non-zeros.
     */
    std::vector<uint32_t> adj_indptr;
};
 31 | 
 32 | 
 33 | // Create a csr matrix from raw input.
 34 | template<typename data_type>
 35 | CSRMatrix<data_type> create_csr_matrix(uint32_t num_rows,
 36 |                                        uint32_t num_cols,
 37 |                                        std::vector<data_type> const &adj_data,
 38 |                                        std::vector<uint32_t> const &adj_indices,
 39 |                                        std::vector<uint32_t> const &adj_indptr) {
 40 |     CSRMatrix<data_type> csr_matrix;
 41 |     csr_matrix.num_rows = num_rows;
 42 |     csr_matrix.num_cols = num_cols;
 43 |     csr_matrix.adj_data = adj_data;
 44 |     csr_matrix.adj_indices = adj_indices;
 45 |     csr_matrix.adj_indptr = adj_indptr;
 46 |     return csr_matrix;
 47 | }
 48 | 
 49 | 
 50 | // Load a csr matrix from a scipy sparse npz file. The sparse matrix should have float data type.
 51 | CSRMatrix<float> load_csr_matrix_from_float_npz(std::string csr_float_npz_path) {
 52 |     CSRMatrix<float> csr_matrix;
 53 |     cnpy::npz_t npz = cnpy::npz_load(csr_float_npz_path);
 54 |     cnpy::NpyArray npy_shape = npz["shape"];
 55 |     uint32_t num_rows = npy_shape.data<uint32_t>()[0];
 56 |     uint32_t num_cols = npy_shape.data<uint32_t>()[2];
 57 |     csr_matrix.num_rows = num_rows;
 58 |     csr_matrix.num_cols = num_cols;
 59 |     cnpy::NpyArray npy_data = npz["data"];
 60 |     uint32_t nnz = npy_data.shape[0];
 61 |     cnpy::NpyArray npy_indices = npz["indices"];
 62 |     cnpy::NpyArray npy_indptr = npz["indptr"];
 63 |     csr_matrix.adj_data.insert(csr_matrix.adj_data.begin(), &npy_data.data<float>()[0],
 64 |         &npy_data.data<float>()[nnz]);
 65 |     csr_matrix.adj_indices.insert(csr_matrix.adj_indices.begin(), &npy_indices.data<uint32_t>()[0],
 66 |         &npy_indices.data<uint32_t>()[nnz]);
 67 |     csr_matrix.adj_indptr.insert(csr_matrix.adj_indptr.begin(), &npy_indptr.data<uint32_t>()[0],
 68 |         &npy_indptr.data<uint32_t>()[num_rows + 1]);
 69 |     return csr_matrix;
 70 | }
 71 | 
 72 | 
 73 | // Convert a float csr matrix to another data type.
 74 | // TODO: does ap_int make formatting slower than float?
 75 | template<typename data_type>
 76 | CSRMatrix<data_type> csr_matrix_convert_from_float(CSRMatrix<float> const &in) {
 77 |     CSRMatrix<data_type> out;
 78 |     out.num_rows = in.num_rows;
 79 |     out.num_cols = in.num_cols;
 80 |     std::copy(in.adj_data.begin(), in.adj_data.end(), std::back_inserter(out.adj_data));
 81 |     out.adj_indices = in.adj_indices;
 82 |     out.adj_indptr = in.adj_indptr;
 83 |     return out;
 84 | }
 85 | 
 86 | 
 87 | //--------------------------------------------------
 88 | // Compressed Sparse Column (CSC) format support
 89 | //--------------------------------------------------
 90 | 
// Data structure for csc matrix.
// Column-major counterpart of CSRMatrix: same field names, but the index
// array holds row indices and the pointer array is indexed by column.
template<typename data_type>
struct CSCMatrix {
    /*! \brief The number of rows of the sparse matrix */
    uint32_t num_rows;
    /*! \brief The number of columns of the sparse matrix */
    uint32_t num_cols;
    /*! \brief The non-zero data of the sparse matrix (one entry per non-zero) */
    std::vector<data_type> adj_data;
    /*! \brief The row indices of the sparse matrix (parallel to adj_data) */
    std::vector<uint32_t> adj_indices;
    /*!
     * \brief The index pointers of the sparse matrix.
     *        Column c owns entries [adj_indptr[c], adj_indptr[c + 1]) of adj_data/adj_indices.
     */
    std::vector<uint32_t> adj_indptr;
};
105 | 
106 | 
107 | // Convert csr to csc.
108 | template<typename data_type>
109 | CSCMatrix<data_type> csr2csc(CSRMatrix<data_type> const &csr_matrix) {
110 |     CSCMatrix<data_type> csc_matrix;
111 |     csc_matrix.num_rows = csr_matrix.num_rows;
112 |     csc_matrix.num_cols = csr_matrix.num_cols;
113 |     csc_matrix.adj_data = std::vector<data_type>(csr_matrix.adj_data.size());
114 |     csc_matrix.adj_indices = std::vector<uint32_t>(csr_matrix.adj_indices.size());
115 |     csc_matrix.adj_indptr = std::vector<uint32_t>(csc_matrix.num_cols + 1);
116 |     // Convert adj_indptr
117 |     uint32_t nnz = csr_matrix.adj_indptr[csr_matrix.num_rows];
118 |     std::vector<uint32_t> nnz_each_col(csc_matrix.num_cols);
119 |     std::fill(nnz_each_col.begin(), nnz_each_col.end(), 0);
120 |     for (size_t n = 0; n < nnz; n++) {
121 |         nnz_each_col[csr_matrix.adj_indices[n]]++;
122 |     }
123 |     csc_matrix.adj_indptr[0] = 0;
124 |     for (size_t col_idx = 0; col_idx < csc_matrix.num_cols; col_idx++) {
125 |         csc_matrix.adj_indptr[col_idx + 1] = csc_matrix.adj_indptr[col_idx] + nnz_each_col[col_idx];
126 |     }
127 |     assert(csc_matrix.adj_indptr[csc_matrix.num_cols] == nnz);
128 |     // Convert adj_data and adj_indices
129 |     std::vector<uint32_t> nnz_consumed_each_col(csc_matrix.num_cols);
130 |     std::fill(nnz_consumed_each_col.begin(), nnz_consumed_each_col.end(), 0);
131 |     for (size_t row_idx = 0; row_idx < csr_matrix.num_rows; row_idx++){
132 |         for (size_t i = csr_matrix.adj_indptr[row_idx]; i < csr_matrix.adj_indptr[row_idx + 1]; i++){
133 |             uint32_t col_idx = csr_matrix.adj_indices[i];
134 |             uint32_t dest = csc_matrix.adj_indptr[col_idx] + nnz_consumed_each_col[col_idx];
135 |             csc_matrix.adj_indices[dest] = row_idx;
136 |             csc_matrix.adj_data[dest] = csr_matrix.adj_data[i];
137 |             nnz_consumed_each_col[col_idx]++;
138 |         }
139 |     }
140 |     for (size_t col_idx = 0; col_idx < csc_matrix.num_cols; col_idx++) {
141 |         assert(nnz_consumed_each_col[col_idx] == nnz_each_col[col_idx]);
142 |     }
143 |     return csc_matrix;
144 | }
145 | 
146 | 
147 | // Convert a float csc matrix to another data type.
148 | template<typename data_type>
149 | CSCMatrix<data_type> csc_matrix_convert_from_float(CSCMatrix<float> const &in) {
150 |     CSCMatrix<data_type> out;
151 |     out.num_rows = in.num_rows;
152 |     out.num_cols = in.num_cols;
153 |     std::copy(in.adj_data.begin(), in.adj_data.end(), std::back_inserter(out.adj_data));
154 |     out.adj_indices = in.adj_indices;
155 |     out.adj_indptr = in.adj_indptr;
156 |     return out;
157 | }
158 | 
159 | }  // namespace io
160 | }  // namespace graphlily
161 | 
162 | #endif  // GRAPHLILY_IO_DATA_LOADER_H_
163 | 


--------------------------------------------------------------------------------
/graphlily/module/add_scalar_vector_dense_module.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_EWISE_ADD_MODULE_H_
  2 | #define GRAPHLILY_EWISE_ADD_MODULE_H_
  3 | 
  4 | #include <cstdint>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <chrono>
  8 | 
  9 | #include "xcl2.hpp"
 10 | 
 11 | #include "graphlily/global.h"
 12 | #include "graphlily/module/base_module.h"
 13 | 
 14 | 
 15 | namespace graphlily {
 16 | namespace module {
 17 | 
 18 | template<typename vector_data_t>
 19 | class eWiseAddModule : public BaseModule {
 20 | private:
 21 |     using packed_val_t = struct {vector_data_t data[graphlily::pack_size];};
 22 |     using aligned_dense_vec_t = std::vector<vector_data_t, aligned_allocator<vector_data_t>>;
 23 | 
 24 |     /*! \brief Internal copy of the input vector */
 25 |     aligned_dense_vec_t in_;
 26 |     /*! \brief Internal copy of the output vector */
 27 |     aligned_dense_vec_t out_;
 28 | 
 29 | public:
 30 |     // Device buffers
 31 |     cl::Buffer in_buf;
 32 |     cl::Buffer out_buf;
 33 | 
 34 | public:
 35 |     eWiseAddModule() : BaseModule("overlay") {}
 36 | 
 37 |     /*Overlay argument list:
 38 |     * (H = num_hbm_channels)
 39 |     * Index       Argument                     used in this module?
 40 |     * 0 ~ H-1     matrix for spmv              n
 41 |     * H+0         vector for spmv              y
 42 |     * H+1         mask for spmv (read port)    n
 43 |     * H+2         mask for spmv (write port)   n
 44 |     * H+3         output for spmv              y
 45 |     *
 46 |     * H+4 ~ +6    matrix for spmspv            n
 47 |     * H+7         vector for spmspv            n
 48 |     * H+8         mask for spmspv              n
 49 |     * H+9         output for spmspv            n
 50 |     *
 51 |     * H+10        # of rows                    n
 52 |     * H+11        # of columns                 n
 53 |     *
 54 |     * H+12        operation type               n
 55 |     * H+13        mask type                    n
 56 |     *
 57 |     * H+14        overlay mode select          y
 58 |     *
 59 |     * H+15        apply vector length          y
 60 |     * H+16        input value for assign       y
 61 |     */
 62 |     void set_unused_args() override {
 63 |         // Set unused arguments for SpMV
 64 |         for (uint32_t i = 0; i < graphlily::num_hbm_channels; i++) {
 65 |             this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4));
 66 |         }
 67 |         this->kernel_.setArg(graphlily::num_hbm_channels + 1, cl::Buffer(this->context_, 0, 4));
 68 |         this->kernel_.setArg(graphlily::num_hbm_channels + 2, cl::Buffer(this->context_, 0, 4));
 69 |         // Set unused arguments for SpMSpV
 70 |         for (uint32_t i = graphlily::num_hbm_channels + 4; i <= graphlily::num_hbm_channels + 9; i++) {
 71 |             this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4));
 72 |         }
 73 |         // Set unused scalar arguments
 74 |         this->kernel_.setArg(graphlily::num_hbm_channels + 10, (unsigned)NULL);
 75 |         this->kernel_.setArg(graphlily::num_hbm_channels + 11, (unsigned)NULL);
 76 |         this->kernel_.setArg(graphlily::num_hbm_channels + 12, (char)NULL);
 77 |         this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)NULL);
 78 |     }
 79 | 
 80 |     void set_mode() override {
 81 |         this->kernel_.setArg(graphlily::num_hbm_channels + 14, 3);  // 3 is kernel_add_scalar_vector_dense
 82 |     }
 83 | 
 84 |     /*!
 85 |      * \brief Send the input vector from host to device.
 86 |      */
 87 |     void send_in_host_to_device(aligned_dense_vec_t &in);
 88 | 
 89 |     /*!
 90 |      * \brief Allocate the output buffer.
 91 |      */
 92 |     void allocate_out_buf(uint32_t len);
 93 | 
 94 |     /*!
 95 |      * \brief Bind the input buffer to an existing buffer.
 96 |      */
 97 |     void bind_in_buf(cl::Buffer src_buf) {
 98 |         this->in_buf = src_buf;
 99 |         this->kernel_.setArg(graphlily::num_hbm_channels + 3, this->in_buf);
100 |     }
101 | 
102 |     /*!
103 |      * \brief Bind the output buffer to an existing buffer.
104 |      */
105 |     void bind_out_buf(cl::Buffer src_buf) {
106 |         this->out_buf = src_buf;
107 |         this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->out_buf);
108 |     }
109 | 
110 |     /*!
111 |      * \brief Run the module.
112 |      * \param len The length of the in/out vector.
113 |      * \param val The value to be added.
114 |      */
115 |     void run(uint32_t len, vector_data_t val);
116 | 
117 |     /*!
118 |      * \brief Send the output vector from device to host.
119 |      * \return The output vector.
120 |      */
121 |     aligned_dense_vec_t send_out_device_to_host() {
122 |         this->command_queue_.enqueueMigrateMemObjects({this->out_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
123 |         this->command_queue_.finish();
124 |         return this->out_;
125 |     }
126 | 
127 |     /*!
128 |      * \brief Compute reference results.
129 |      * \param in The inout vector.
130 |      * \param len The length of the mask/inout vector.
131 |      * \param val The value to be assigned to the inout vector.
132 |      * \return The output vector.
133 |      */
134 |     graphlily::aligned_dense_float_vec_t
135 |     compute_reference_results(graphlily::aligned_dense_float_vec_t const &in,
136 |                               uint32_t len,
137 |                               float val);
138 | };
139 | 
140 | 
// Copies `in` into the module-owned vector, wraps that storage in a device
// buffer, binds it to kernel argument H+3 and migrates it to the device.
// Blocks until the command queue has drained.
template<typename vector_data_t>
void eWiseAddModule<vector_data_t>::send_in_host_to_device(aligned_dense_vec_t &in) {
    // The buffer below is created with CL_MEM_USE_HOST_PTR over in_'s storage,
    // so the copy must stay alive (and unresized) for as long as in_buf is used.
    this->in_.assign(in.begin(), in.end());
    cl_mem_ext_ptr_t in_ext;
    in_ext.obj = this->in_.data();
    in_ext.param = 0;
    // presumably selects the HBM bank the buffer is placed in -- confirm against graphlily/global.h
    in_ext.flags = graphlily::HBM[22];
    cl_int err;
    OCL_CHECK(err, this->in_buf = cl::Buffer(this->context_,
                CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
                sizeof(vector_data_t) * this->in_.size(),
                &in_ext,
                &err));
    OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 3, this->in_buf));
    // Migration flags of 0 (no CL_MIGRATE_MEM_OBJECT_HOST): move the data towards the device.
    OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->in_buf}, 0));
    this->command_queue_.finish();
}
158 | 
159 | 
// Resizes the module-owned output vector to `len` elements, wraps that
// storage in a device buffer, binds it to kernel argument H+0 and migrates it
// to the device. Blocks until the command queue has drained.
template<typename vector_data_t>
void eWiseAddModule<vector_data_t>::allocate_out_buf(uint32_t len) {
    this->out_.resize(len);
    cl_mem_ext_ptr_t out_ext;
    // The buffer uses out_'s storage directly (CL_MEM_USE_HOST_PTR below).
    out_ext.obj = this->out_.data();
    out_ext.param = 0;
    // presumably selects the HBM bank the buffer is placed in -- confirm against graphlily/global.h
    out_ext.flags = graphlily::HBM[20];
    cl_int err;
    OCL_CHECK(err, this->out_buf = cl::Buffer(this->context_,
                CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
                sizeof(vector_data_t) * this->out_.size(),
                &out_ext,
                &err));
    OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->out_buf));
    // Migration flags of 0 (no CL_MIGRATE_MEM_OBJECT_HOST): move the data towards the device.
    OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->out_buf}, 0));
    this->command_queue_.finish();
}
177 | 
178 | 
179 | template<typename vector_data_t>
180 | void eWiseAddModule<vector_data_t>::run(uint32_t len, vector_data_t val) {
181 |     cl_int err;
182 |     // TODO: is the overhead of setArg and enqueueTask large at run time?
183 |     OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 15, len));
184 |     // To avoid runtime error of invalid scalar argument size
185 |     if (!(std::is_same<vector_data_t, unsigned>::value || std::is_same<vector_data_t, float>::value)) {
186 |         OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, 8, (void*)&val));
187 |     } else {
188 |         OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, val));
189 |     }
190 |     OCL_CHECK(err, err = this->command_queue_.enqueueTask(this->kernel_));
191 |     this->command_queue_.finish();
192 | }
193 | 
194 | 
195 | template<typename vector_data_t> graphlily::aligned_dense_float_vec_t
196 | eWiseAddModule<vector_data_t>::compute_reference_results(graphlily::aligned_dense_float_vec_t const &in,
197 |                                                          uint32_t len,
198 |                                                          float val) {
199 |     graphlily::aligned_dense_float_vec_t out(len);
200 |     for (uint32_t i = 0; i < len; i++) {
201 |         out[i] = in[i] + val;
202 |     }
203 |     return out;
204 | }
205 | 
206 | }  // namespace module
207 | }  // namespace graphlily
208 | 
209 | #endif  // GRAPHLILY_EWISE_ADD_MODULE_H_
210 | 


--------------------------------------------------------------------------------
/graphlily/module/assign_vector_dense_module.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_ASSIGN_VECTOR_DENSE_MODULE_H_
  2 | #define GRAPHLILY_ASSIGN_VECTOR_DENSE_MODULE_H_
  3 | 
  4 | #include <cstdint>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <chrono>
  8 | 
  9 | #include "xcl2.hpp"
 10 | 
 11 | #include "graphlily/global.h"
 12 | #include "graphlily/module/base_module.h"
 13 | 
 14 | 
 15 | namespace graphlily {
 16 | namespace module {
 17 | 
 18 | template<typename vector_data_t>
 19 | class AssignVectorDenseModule : public BaseModule {
 20 | private:
 21 |     using packed_val_t = struct {vector_data_t data[graphlily::pack_size];};
 22 |     using aligned_dense_vec_t = std::vector<vector_data_t, aligned_allocator<vector_data_t>>;
 23 | 
 24 |     /*! \brief The mask type */
 25 |     graphlily::MaskType mask_type_;
 26 |     /*! \brief Internal copy of mask */
 27 |     aligned_dense_vec_t mask_;
 28 |     /*! \brief Internal copy of inout */
 29 |     aligned_dense_vec_t inout_;
 30 | 
 31 | public:
 32 |     // Device buffers
 33 |     cl::Buffer mask_buf;
 34 |     cl::Buffer inout_buf;
 35 | 
 36 | public:
 37 |     AssignVectorDenseModule() : BaseModule("overlay") {}
 38 | 
 39 |     /*Overlay argument list:
 40 |     * (H = num_hbm_channels)
 41 |     * Index       Argument                     used in this module?
 42 |     * 0 ~ H-1     matrix for spmv              n
 43 |     * H+0         vector for spmv              y
 44 |     * H+1         mask for spmv (read port)    y
 45 |     * H+2         mask for spmv (write port)   y
 46 |     * H+3         output for spmv              n
 47 |     *
 48 |     * H+4 ~ +6    matrix for spmspv            n
 49 |     * H+7         vector for spmspv            n
 50 |     * H+8         mask for spmspv              n
 51 |     * H+9         output for spmspv            n
 52 |     *
 53 |     * H+10        # of rows                    n
 54 |     * H+11        # of columns                 n
 55 |     *
 56 |     * H+12        operation type               n
 57 |     * H+13        mask type                    y
 58 |     *
 59 |     * H+14        overlay mode select          y
 60 |     *
 61 |     * H+15        apply vector length          y
 62 |     * H+16        input value for assign       y
 63 |     */
 64 |     void set_unused_args() override {
 65 |         // Set unused arguments for SpMV
 66 |         for (uint32_t i = 0; i < graphlily::num_hbm_channels; i++) {
 67 |             this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4));
 68 |         }
 69 |         this->kernel_.setArg(graphlily::num_hbm_channels + 3, cl::Buffer(this->context_, 0, 4));
 70 |         // Set unused arguments for SpMSpV
 71 |         for (uint32_t i = graphlily::num_hbm_channels + 4; i <= graphlily::num_hbm_channels + 9; i++) {
 72 |             this->kernel_.setArg(i, cl::Buffer(this->context_, 0, 4));
 73 |         }
 74 |         // Set unused scalar arguments
 75 |         this->kernel_.setArg(graphlily::num_hbm_channels + 10, (unsigned)NULL);
 76 |         this->kernel_.setArg(graphlily::num_hbm_channels + 11, (unsigned)NULL);
 77 |         this->kernel_.setArg(graphlily::num_hbm_channels + 12, (char)NULL);
 78 |     }
 79 | 
 80 |     void set_mode() override {
 81 |         this->kernel_.setArg(graphlily::num_hbm_channels + 14, 4);;  // 4 is kernel_assign_vector_dense
 82 |     }
 83 | 
 84 |     /*!
 85 |      * \brief Set the mask type.
 86 |      * \param mask_type The mask type.
 87 |      */
 88 |     void set_mask_type(graphlily::MaskType mask_type) {
 89 |         if (mask_type == graphlily::kNoMask) {
 90 |             std::cerr << "Please set the mask type" << std::endl;
 91 |             exit(EXIT_FAILURE);
 92 |         } else {
 93 |             this->mask_type_ = mask_type;
 94 |         }
 95 |     }
 96 | 
 97 |     /*!
 98 |      * \brief Send the mask from host to device.
 99 |      */
100 |     void send_mask_host_to_device(aligned_dense_vec_t &mask);
101 | 
102 |     /*!
103 |      * \brief Send the inout from host to device.
104 |      */
105 |     void send_inout_host_to_device(aligned_dense_vec_t &inout);
106 | 
107 |     /*!
108 |      * \brief Bind the mask buffer to an existing buffer.
109 |      */
110 |     void bind_mask_buf(cl::Buffer src_buf) {
111 |         this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)this->mask_type_);
112 |         this->mask_buf = src_buf;
113 |         this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->mask_buf);
114 |     }
115 | 
116 |     /*!
117 |      * \brief Bind the inout buffer to an existing buffer.
118 |      */
119 |     void bind_inout_buf(cl::Buffer src_buf) {
120 |         this->inout_buf = src_buf;
121 |         // set both read and write ports
122 |         this->kernel_.setArg(graphlily::num_hbm_channels + 1, this->inout_buf);
123 |         this->kernel_.setArg(graphlily::num_hbm_channels + 2, this->inout_buf);
124 |     }
125 | 
126 |     /*!
127 |      * \brief Run the module.
128 |      * \param len The length of the mask/inout vector.
129 |      * \param val The value to be assigned to the inout vector.
130 |      */
131 |     void run(uint32_t len, vector_data_t val);
132 | 
133 |     /*!
134 |      * \brief Send the mask from device to host.
135 |      * \return The mask.
136 |      */
137 |     aligned_dense_vec_t send_mask_device_to_host() {
138 |         this->command_queue_.enqueueMigrateMemObjects({this->mask_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
139 |         this->command_queue_.finish();
140 |         return this->mask_;
141 |     }
142 | 
143 |     /*!
144 |      * \brief Send the inout from device to host.
145 |      * \return The inout.
146 |      */
147 |     aligned_dense_vec_t send_inout_device_to_host() {
148 |         this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
149 |         this->command_queue_.finish();
150 |         return this->inout_;
151 |     }
152 | 
153 |     /*!
154 |      * \brief Compute reference results.
155 |      * \param mask The mask vector.
156 |      * \param inout The inout vector.
157 |      * \param len The length of the mask/inout vector.
158 |      * \param val The value to be assigned to the inout vector.
159 |      */
160 |     void compute_reference_results(graphlily::aligned_dense_float_vec_t &mask,
161 |                                    graphlily::aligned_dense_float_vec_t &inout,
162 |                                    uint32_t len,
163 |                                    float val);
164 | };
165 | 
166 | 
// Forwards the current mask type to the kernel (arg H+13), copies `mask` into
// the module-owned vector, wraps that storage in a device buffer, binds it to
// kernel argument H+0 and migrates it to the device. Blocks until the command
// queue has drained.
template<typename vector_data_t>
void AssignVectorDenseModule<vector_data_t>::send_mask_host_to_device(aligned_dense_vec_t &mask) {
    // mask_type_ is whatever set_mask_type() stored; it is not validated here.
    this->kernel_.setArg(graphlily::num_hbm_channels + 13, (char)this->mask_type_);
    // The buffer below is created with CL_MEM_USE_HOST_PTR over mask_'s storage.
    this->mask_.assign(mask.begin(), mask.end());
    cl_mem_ext_ptr_t mask_ext;
    mask_ext.obj = this->mask_.data();
    mask_ext.param = 0;
    // presumably selects the HBM bank the buffer is placed in -- confirm against graphlily/global.h
    mask_ext.flags = graphlily::HBM[20];
    cl_int err;
    OCL_CHECK(err, this->mask_buf = cl::Buffer(this->context_,
                CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
                sizeof(vector_data_t) * this->mask_.size(),
                &mask_ext,
                &err));
    OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 0, this->mask_buf));
    // Migration flags of 0 (no CL_MIGRATE_MEM_OBJECT_HOST): move the data towards the device.
    OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->mask_buf}, 0));
    this->command_queue_.finish();
}
185 | 
186 | 
// Copies `inout` into the module-owned vector, wraps that storage in a device
// buffer, binds it to both kernel arguments H+1 and H+2 and migrates it to the
// device. Blocks until the command queue has drained.
template<typename vector_data_t>
void AssignVectorDenseModule<vector_data_t>::send_inout_host_to_device(aligned_dense_vec_t &inout) {
    // The buffer below is created with CL_MEM_USE_HOST_PTR over inout_'s storage.
    this->inout_.assign(inout.begin(), inout.end());
    cl_mem_ext_ptr_t inout_ext;
    inout_ext.obj = this->inout_.data();
    inout_ext.param = 0;
    // presumably selects the HBM bank the buffer is placed in -- confirm against graphlily/global.h
    inout_ext.flags = graphlily::HBM[21];
    cl_int err;
    OCL_CHECK(err, this->inout_buf = cl::Buffer(this->context_,
                CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
                sizeof(vector_data_t) * this->inout_.size(),
                &inout_ext,
                &err));
    // set both read and write ports
    OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 1, this->inout_buf));
    OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 2, this->inout_buf));
    // Migration flags of 0 (no CL_MIGRATE_MEM_OBJECT_HOST): move the data towards the device.
    OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, 0));
    this->command_queue_.finish();
}
206 | 
207 | 
208 | template<typename vector_data_t>
209 | void AssignVectorDenseModule<vector_data_t>::run(uint32_t len, vector_data_t val) {
210 |     cl_int err;
211 |     OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 15, len));
212 |     // To avoid runtime error of invalid scalar argument size
213 |     if (!(std::is_same<vector_data_t, unsigned>::value || std::is_same<vector_data_t, float>::value)) {
214 |         OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, 8, (void*)&val));
215 |     } else {
216 |         OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 16, val));
217 |     }
218 |     OCL_CHECK(err, err = this->command_queue_.enqueueTask(this->kernel_));
219 |     this->command_queue_.finish();
220 | }
221 | 
222 | 
223 | template<typename vector_data_t>
224 | void AssignVectorDenseModule<vector_data_t>::compute_reference_results(
225 |     graphlily::aligned_dense_float_vec_t &mask,
226 |     graphlily::aligned_dense_float_vec_t &inout,
227 |     uint32_t len,
228 |     float val
229 | ) {
230 |     if (this->mask_type_ == graphlily::kMaskWriteToZero) {
231 |         for (size_t i = 0; i < len; i++) {
232 |             if (mask[i] == 0) {
233 |                 inout[i] = val;
234 |             }
235 |         }
236 |     } else if (this->mask_type_ == graphlily::kMaskWriteToOne) {
237 |         for (size_t i = 0; i < len; i++) {
238 |             if (mask[i] != 0) {
239 |                 inout[i] = val;
240 |             }
241 |         }
242 |     } else {
243 |         std::cout << "Invalid mask type" << std::endl;
244 |         exit(EXIT_FAILURE);
245 |     }
246 | }
247 | 
248 | }  // namespace module
249 | }  // namespace graphlily
250 | 
251 | #endif  // GRAPHLILY_ASSIGN_VECTOR_DENSE_MODULE_H_
252 | 


--------------------------------------------------------------------------------
/graphlily/module/assign_vector_sparse_module.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_ASSIGN_VECTOR_SPARSE_MODULE_H_
  2 | #define GRAPHLILY_ASSIGN_VECTOR_SPARSE_MODULE_H_
  3 | 
  4 | #include <cstdint>
  5 | #include <vector>
  6 | #include <fstream>
  7 | #include <chrono>
  8 | 
  9 | #include "xcl2.hpp"
 10 | 
 11 | #include "graphlily/global.h"
 12 | #include "graphlily/module/base_module.h"
 13 | 
 14 | 
 15 | namespace graphlily {
 16 | namespace module {
 17 | 
 18 | template<typename vector_data_t, typename sparse_vector_data_t>
 19 | class AssignVectorSparseModule : public BaseModule {
 20 | private:
 21 |     using aligned_mask_t = std::vector<sparse_vector_data_t, aligned_allocator<sparse_vector_data_t>>;
 22 |     using aligned_dense_vec_t = std::vector<vector_data_t, aligned_allocator<vector_data_t>>;
 23 | 
 24 |     /*! \brief Generate new frontier (used in SSSP) or not (used in BFS) */
 25 |     bool generate_new_frontier_;
 26 |     /*! \brief Internal copy of mask */
 27 |     aligned_mask_t mask_;
 28 |     /*! \brief Internal copy of inout */
 29 |     aligned_dense_vec_t inout_;
 30 |     /*! \brief Internal copy of new_frontier */
 31 |     aligned_mask_t new_frontier_;
 32 | 
 33 | public:
 34 |     // Device buffers
 35 |     cl::Buffer mask_buf;
 36 |     cl::Buffer inout_buf;
 37 |     cl::Buffer new_frontier_buf;
 38 | 
 39 | public:
 40 |     AssignVectorSparseModule(bool generate_new_frontier) : BaseModule("overlay") {
 41 |         this->generate_new_frontier_ = generate_new_frontier;
 42 |     }
 43 | 
 44 |     /*Overlay argument list:
 45 |     * (H = num_hbm_channels)
 46 |     * Index       Argument                     used in this module?
 47 |     * 0 ~ H-1     matrix for spmv              n
 48 |     * H+0         vector for spmv              n
 49 |     * H+1         mask for spmv (read port)    n
 50 |     * H+2         mask for spmv (write port)   n
 51 |     * H+3         output for spmv              n
 52 |     *
 53 |     * H+4 ~ +6    matrix for spmspv            n
 54 |     * H+7         vector for spmspv            y
 55 |     * H+8         mask for spmspv              y
 56 |     * H+9         output for spmspv            n
 57 |     *
 58 |     * H+10        # of rows                    n
 59 |     * H+11        # of columns                 n
 60 |     *
 61 |     * H+12        operation type               n
 62 |     * H+13        mask type                    n
 63 |     *
 64 |     * H+14        overlay mode select          y
 65 |     *
 66 |     * H+15        apply vector length          n
 67 |     * H+16        input value for assign       y
 68 |     */
 69 |     /*! \brief Bind placeholder values to the overlay arguments this module does not drive. */
 70 |     void set_unused_args() override {
 71 |         const auto num_channels = graphlily::num_hbm_channels;
 72 |         // Buffer arguments: SpMV (0 ~ H+3) followed by the SpMSpV matrix (H+4 ~ H+6)
 73 |         for (uint32_t arg = 0; arg < num_channels + 7; arg++) {
 74 |             this->kernel_.setArg(arg, cl::Buffer(this->context_, 0, 4));
 75 |         }
 76 |         // Scalar arguments
 77 |         this->kernel_.setArg(num_channels + 15, static_cast<unsigned>(0));
 78 |         this->kernel_.setArg(num_channels + 10, static_cast<unsigned>(0));
 79 |         this->kernel_.setArg(num_channels + 11, static_cast<unsigned>(0));
 80 |         this->kernel_.setArg(num_channels + 12, static_cast<char>(0));
 81 |         this->kernel_.setArg(num_channels + 13, static_cast<char>(0));
 82 |         if (this->generate_new_frontier_) {
 83 |             // No assign value (H+16) is passed in this mode; the placeholder type follows the data type
 84 |             if (std::is_same<vector_data_t, unsigned>::value || std::is_same<vector_data_t, float>::value) {
 85 |                 this->kernel_.setArg(num_channels + 16, static_cast<unsigned>(0));
 86 |             } else {
 87 |                 this->kernel_.setArg(num_channels + 16, static_cast<long long>(0));
 88 |             }
 89 |         } else {
 90 |             // Argument H+9 is only bound to a real buffer when a new frontier is generated
 91 |             this->kernel_.setArg(num_channels + 9, cl::Buffer(this->context_, 0, 4));
 92 |         }
 93 |     }
 95 | 
 96 |     /*! \brief Select the overlay mode: 6 when a new frontier is generated, 5 otherwise. */
 97 |     void set_mode() override {
 98 |         const int mode = this->generate_new_frontier_ ? 6 : 5;
 99 |         this->kernel_.setArg(graphlily::num_hbm_channels + 14, mode);
100 |     }
103 | 
104 |     /*!
105 |      * \brief Send the mask from host to device.
106 |      */
107 |     void send_mask_host_to_device(aligned_mask_t &mask);
108 | 
109 |     /*!
110 |      * \brief Send the inout from host to device.
111 |      */
112 |     void send_inout_host_to_device(aligned_dense_vec_t &inout);
113 | 
114 |     /*!
115 |      * \brief Adopt an existing buffer as the mask and pass it to the kernel.
116 |      * \param src_buf The buffer that holds the mask.
117 |      */
118 |     void bind_mask_buf(cl::Buffer src_buf) {
119 |         this->mask_buf = src_buf;
120 |         // The mask is argument H+9 when a new frontier is generated and H+7 otherwise
121 |         const auto mask_arg = graphlily::num_hbm_channels + (this->generate_new_frontier_ ? 9 : 7);
122 |         this->kernel_.setArg(mask_arg, this->mask_buf);
123 |     }
125 | 
126 |     /*!
127 |      * \brief Adopt an existing buffer as the inout vector and pass it to the kernel (argument H+8).
128 |      * \param src_buf The buffer that holds the inout vector.
129 |      */
130 |     void bind_inout_buf(cl::Buffer src_buf) {
131 |         const auto inout_arg = graphlily::num_hbm_channels + 8;
132 |         this->inout_buf = src_buf;
133 |         this->kernel_.setArg(inout_arg, this->inout_buf);
134 |     }
133 | 
134 |     /*!
135 |      * \brief Adopt an existing buffer as the new frontier and pass it to the kernel (argument H+7).
136 |      *        Prints an error and exits unless the module was created to generate a new frontier.
137 |      * \param src_buf The buffer that receives the new frontier.
138 |      */
139 |     void bind_new_frontier_buf(cl::Buffer src_buf) {
140 |         if (this->generate_new_frontier_) {
141 |             const auto new_frontier_arg = graphlily::num_hbm_channels + 7;
142 |             this->new_frontier_buf = src_buf;
143 |             this->kernel_.setArg(new_frontier_arg, this->new_frontier_buf);
144 |             return;
145 |         }
146 |         std::cout << "[ERROR]: this->generate_new_frontier_ should be true" << std::endl;
147 |         exit(EXIT_FAILURE);
148 |     }
145 | 
146 |     /*!
147 |      * \brief Run the module when this->generate_new_frontier_ is false (BFS mode).
148 |      * \param val The value to be assigned to the inout vector.
149 |      */
150 |     void run(vector_data_t val);
151 | 
152 |     /*!
153 |      * \brief Run the module when this->generate_new_frontier_ is true (SSSP mode).
154 |      */
155 |     void run();
156 | 
157 |     /*!
158 |      * \brief Migrate the mask buffer to the host and wait for the transfer to finish.
159 |      * \return A copy of the host-side mask.
160 |      */
161 |     aligned_mask_t send_mask_device_to_host() {
162 |         auto &queue = this->command_queue_;
163 |         queue.enqueueMigrateMemObjects({this->mask_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
164 |         queue.finish();
165 |         return this->mask_;
166 |     }
166 | 
167 |     /*!
168 |      * \brief Migrate the inout buffer to the host and wait for the transfer to finish.
169 |      * \return A copy of the host-side inout vector.
170 |      */
171 |     aligned_dense_vec_t send_inout_device_to_host() {
172 |         auto &queue = this->command_queue_;
173 |         queue.enqueueMigrateMemObjects({this->inout_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
174 |         queue.finish();
175 |         return this->inout_;
176 |     }
176 | 
177 |     /*!
178 |      * \brief Migrate the new_frontier buffer to the host and wait for the transfer to finish.
179 |      *        Prints an error and exits unless the module was created to generate a new frontier.
180 |      * \return A copy of the host-side new frontier.
181 |      */
182 |     aligned_mask_t send_new_frontier_device_to_host() {
183 |         if (this->generate_new_frontier_) {
184 |             auto &queue = this->command_queue_;
185 |             queue.enqueueMigrateMemObjects({this->new_frontier_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
186 |             queue.finish();
187 |             return this->new_frontier_;
188 |         }
189 |         std::cout << "[ERROR]: this->generate_new_frontier_ should be true" << std::endl;
190 |         exit(EXIT_FAILURE);
191 |     }
190 | 
191 |     /*!
192 |      * \brief Compute reference results when this->generate_new_frontier_ is false (BFS mode).
193 |      * \param mask The mask vector.
194 |      * \param inout The inout vector.
195 |      * \param val The value to be assigned to the inout vector.
196 |      */
197 |     void compute_reference_results(graphlily::aligned_sparse_float_vec_t &mask,
198 |                                    graphlily::aligned_dense_float_vec_t &inout,
199 |                                    float val);
200 | 
201 |     /*!
202 |      * \brief Compute reference results when this->generate_new_frontier_ is true (SSSP mode).
203 |      * \param mask The mask vector.
204 |      * \param inout The inout vector.
205 |      * \param new_frontier The new frontier.
206 |      */
207 |     void compute_reference_results(graphlily::aligned_sparse_float_vec_t &mask,
208 |                                    graphlily::aligned_dense_float_vec_t &inout,
209 |                                    graphlily::aligned_sparse_float_vec_t &new_frontier);
210 | };
211 | 
212 | 
213 | /*!
214 |  * \brief Copy the mask into host-side storage, create its device buffer, bind it to the
215 |  *        kernel, and migrate it to the device. When a new frontier is generated, the
216 |  *        new_frontier buffer is allocated (same length as the mask), bound, and migrated too.
217 |  * \param mask The mask to send.
218 |  */
219 | template<typename vector_data_t, typename sparse_vector_data_t>
220 | void AssignVectorSparseModule<vector_data_t, sparse_vector_data_t>::send_mask_host_to_device(
221 |     aligned_mask_t &mask
222 | ) {
223 |     cl_int err;
224 |     const bool new_frontier_mode = this->generate_new_frontier_;
225 |     // handle mask
226 |     this->mask_.assign(mask.begin(), mask.end());
227 |     cl_mem_ext_ptr_t mask_ext;
228 |     mask_ext.obj = this->mask_.data();
229 |     mask_ext.param = 0;
230 |     // Both the memory bank and the kernel argument of the mask depend on the mode
231 |     mask_ext.flags = new_frontier_mode ? graphlily::HBM[22] : graphlily::HBM[20];
232 |     OCL_CHECK(err, this->mask_buf = cl::Buffer(this->context_,
233 |                 CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
234 |                 sizeof(sparse_vector_data_t) * this->mask_.size(),
235 |                 &mask_ext,
236 |                 &err));
237 |     const auto mask_arg = graphlily::num_hbm_channels + (new_frontier_mode ? 9 : 7);
238 |     OCL_CHECK(err, err = this->kernel_.setArg(mask_arg, this->mask_buf));
239 |     // Migrate the mask explicitly, in the same way as the inout and new_frontier buffers
240 |     OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->mask_buf}, 0));
241 |     this->command_queue_.finish();
242 |     if (new_frontier_mode) {
243 |         // allocate memory for new_frontier
244 |         this->new_frontier_.resize(this->mask_.size());
245 |         cl_mem_ext_ptr_t new_frontier_ext;
246 |         new_frontier_ext.obj = this->new_frontier_.data();
247 |         new_frontier_ext.param = 0;
248 |         new_frontier_ext.flags = graphlily::HBM[20];
249 |         OCL_CHECK(err, this->new_frontier_buf = cl::Buffer(this->context_,
250 |                     CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
251 |                     sizeof(sparse_vector_data_t) * this->new_frontier_.size(),
252 |                     &new_frontier_ext,
253 |                     &err));
254 |         OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 7, this->new_frontier_buf));
255 |         OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->new_frontier_buf}, 0));
256 |         this->command_queue_.finish();
257 |     }
258 | }
255 | 
256 | 
257 | /*!
258 |  * \brief Copy the inout vector into host-side storage, create its device buffer in HBM[21],
259 |  *        bind it to kernel argument H+8, and migrate it to the device.
260 |  * \param inout The inout vector to send.
261 |  */
262 | template<typename vector_data_t, typename sparse_vector_data_t>
263 | void AssignVectorSparseModule<vector_data_t, sparse_vector_data_t>::send_inout_host_to_device(
264 |     aligned_dense_vec_t &inout
265 | ) {
266 |     cl_int err;
267 |     this->inout_.assign(inout.begin(), inout.end());
268 |     const size_t num_bytes = sizeof(vector_data_t) * this->inout_.size();
269 |     // Describe the host pointer and the target memory bank for the buffer
270 |     cl_mem_ext_ptr_t ext;
271 |     ext.flags = graphlily::HBM[21];
272 |     ext.obj = this->inout_.data();
273 |     ext.param = 0;
274 |     OCL_CHECK(err, this->inout_buf = cl::Buffer(this->context_,
275 |                 CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
276 |                 num_bytes,
277 |                 &ext,
278 |                 &err));
279 |     OCL_CHECK(err, err = this->kernel_.setArg(graphlily::num_hbm_channels + 8, this->inout_buf));
280 |     OCL_CHECK(err, err = this->command_queue_.enqueueMigrateMemObjects({this->inout_buf}, 0));
281 |     this->command_queue_.finish();
282 | }
276 | 
277 | 
278 | /*!
279 |  * \brief Run the kernel with \p val as the value to assign (argument H+16) and wait for it
280 |  *        to finish. Prints an error and exits if the module generates a new frontier.
281 |  * \param val The value to be assigned to the inout vector.
282 |  */
283 | template<typename vector_data_t, typename sparse_vector_data_t>
284 | void AssignVectorSparseModule<vector_data_t, sparse_vector_data_t>::run(vector_data_t val) {
285 |     if (this->generate_new_frontier_) {
286 |         std::cout << "[ERROR]: this->generate_new_frontier_ should be false" << std::endl;
287 |         exit(EXIT_FAILURE);
288 |     }
289 |     // To avoid runtime error of invalid scalar argument size
290 |     if (!(std::is_same<vector_data_t, unsigned>::value || std::is_same<vector_data_t, float>::value)) {
291 |         // The argument is passed as 8 bytes. Stage the value in a zero-filled 8-byte buffer
292 |         // so that nothing beyond `val` is read when sizeof(val) is smaller than 8.
293 |         unsigned char staged[8] = {0};
294 |         const unsigned char *raw = reinterpret_cast<const unsigned char *>(&val);
295 |         const size_t num_bytes = sizeof(val) < sizeof(staged) ? sizeof(val) : sizeof(staged);
296 |         for (size_t b = 0; b < num_bytes; b++) {
297 |             staged[b] = raw[b];
298 |         }
299 |         this->kernel_.setArg(graphlily::num_hbm_channels + 16, sizeof(staged), static_cast<void*>(staged));
300 |     } else {
301 |         this->kernel_.setArg(graphlily::num_hbm_channels + 16, val);
302 |     }
303 |     this->command_queue_.enqueueTask(this->kernel_);
304 |     this->command_queue_.finish();
305 | }
293 | 
294 | 
295 | /*!
296 |  * \brief Run the kernel and wait for it to finish. Prints an error and exits unless the
297 |  *        module was created to generate a new frontier.
298 |  */
299 | template<typename vector_data_t, typename sparse_vector_data_t>
300 | void AssignVectorSparseModule<vector_data_t, sparse_vector_data_t>::run() {
301 |     if (this->generate_new_frontier_) {
302 |         this->command_queue_.enqueueTask(this->kernel_);
303 |         this->command_queue_.finish();
304 |         return;
305 |     }
306 |     std::cout << "[ERROR]: this->generate_new_frontier_ should be true" << std::endl;
307 |     exit(EXIT_FAILURE);
308 | }
304 | 
305 | 
306 | /*!
307 |  * \brief Reference implementation without a new frontier: write \p val to every position
308 |  *        of \p inout that the mask lists. mask[0].index holds the number of entries and
309 |  *        the entries themselves start at mask[1].
310 |  */
311 | template<typename vector_data_t, typename sparse_vector_data_t>
312 | void AssignVectorSparseModule<vector_data_t, sparse_vector_data_t>::compute_reference_results(
313 |     graphlily::aligned_sparse_float_vec_t &mask,
314 |     graphlily::aligned_dense_float_vec_t &inout,
315 |     float val
316 | ) {
317 |     const size_t num_entries = mask[0].index;
318 |     for (size_t pos = 1; pos <= num_entries; pos++) {
319 |         inout[mask[pos].index] = val;
320 |     }
321 | }
316 | 
317 | 
318 | /*!
319 |  * \brief Reference implementation with a new frontier: for every mask entry whose value is
320 |  *        smaller than the current inout value, store the mask value in \p inout and append
321 |  *        the entry to \p new_frontier. The first element of \p new_frontier is a head whose
322 |  *        index is the number of appended entries and whose value is 0.
323 |  */
324 | template<typename vector_data_t, typename sparse_vector_data_t>
325 | void AssignVectorSparseModule<vector_data_t, sparse_vector_data_t>::compute_reference_results(
326 |     graphlily::aligned_sparse_float_vec_t &mask,
327 |     graphlily::aligned_dense_float_vec_t &inout,
328 |     graphlily::aligned_sparse_float_vec_t &new_frontier
329 | ) {
330 |     new_frontier.clear();
331 |     // Reserve the head slot first; its index is filled in once all entries are known
332 |     graphlily::idx_float_t head;
333 |     head.index = 0;
334 |     head.val = 0;
335 |     new_frontier.push_back(head);
336 |     const size_t num_entries = mask[0].index;
337 |     for (size_t pos = 1; pos <= num_entries; pos++) {
338 |         const graphlily::idx_float_t &entry = mask[pos];
339 |         if (inout[entry.index] > entry.val) {
340 |             inout[entry.index] = entry.val;
341 |             new_frontier.push_back(entry);
342 |         }
343 |     }
344 |     new_frontier[0].index = new_frontier.size() - 1;
345 | }
336 | 
337 | }  // namespace module
338 | }  // namespace graphlily
339 | 
340 | #endif  // GRAPHLILY_ASSIGN_VECTOR_SPARSE_MODULE_H_
341 | 


--------------------------------------------------------------------------------
/graphlily/module/base_module.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_BASE_MODULE_H_
  2 | #define GRAPHLILY_BASE_MODULE_H_
  3 | 
  4 | #include "graphlily/global.h"
  5 | 
  6 | 
  7 | namespace graphlily {
  8 | namespace module {
  9 | 
 10 | /*!
 11 |  * \brief Common base of the modules: stores the kernel name, the target and the OpenCL
 12 |  *        runtime objects, and declares the hooks that set_up_runtime() calls.
 13 |  */
 14 | class BaseModule {
 15 | protected:
 16 |     /*! \brief Name of the kernel run by this module */
 17 |     std::string kernel_name_;
 18 |     /*! \brief One of sw_emu, hw_emu, hw */
 19 |     std::string target_;
 20 | 
 21 |     // OpenCL runtime objects
 22 |     cl::Device device_;
 23 |     cl::Context context_;
 24 |     cl::Kernel kernel_;
 25 |     cl::CommandQueue command_queue_;
 26 | 
 27 | public:
 28 |     BaseModule(std::string kernel_name) : kernel_name_(kernel_name) {}
 29 | 
 30 |     virtual ~BaseModule() {
 31 |         // Reset the OpenCL wrappers
 32 |         this->device_ = nullptr;
 33 |         this->context_ = nullptr;
 34 |         this->kernel_ = nullptr;
 35 |         this->command_queue_ = nullptr;
 36 |     }
 37 | 
 38 |     /*!
 39 |      * \brief Name of the kernel given at construction.
 40 |      * \return The kernel name.
 41 |      */
 42 |     std::string get_kernel_name() {
 43 |         return this->kernel_name_;
 44 |     }
 45 | 
 46 |     /*!
 47 |      * \brief Replace the stored device.
 48 |      */
 49 |     void set_device(cl::Device device) {
 50 |         this->device_ = device;
 51 |     }
 52 | 
 53 |     /*!
 54 |      * \brief Replace the stored context.
 55 |      */
 56 |     void set_context(cl::Context context) {
 57 |         this->context_ = context;
 58 |     }
 59 | 
 60 |     /*!
 61 |      * \brief Replace the stored kernel.
 62 |      */
 63 |     void set_kernel(cl::Kernel kernel) {
 64 |         this->kernel_ = kernel;
 65 |     }
 66 | 
 67 |     /*!
 68 |      * \brief Replace the stored command queue.
 69 |      */
 70 |     void set_command_queue(cl::CommandQueue command_queue) {
 71 |         this->command_queue_ = command_queue;
 72 |     }
 73 | 
 74 |     /*!
 75 |      * \brief Store the target; asserts that it is sw_emu, hw_emu or hw.
 76 |      */
 77 |     void set_target(std::string target) {
 78 |         assert(target == "sw_emu" || target == "hw_emu" || target == "hw");
 79 |         this->target_ = target;
 80 |     }
 81 | 
 82 |     /*!
 83 |      * \brief Copy \p bytes bytes from the start of \p src to the start of \p dst on the
 84 |      *        command queue, without going through the host, and wait for completion.
 85 |      */
 86 |     void copy_buffer_device_to_device(cl::Buffer src, cl::Buffer dst, size_t bytes) {
 87 |         const size_t src_offset = 0;
 88 |         const size_t dst_offset = 0;
 89 |         this->command_queue_.enqueueCopyBuffer(src, dst, src_offset, dst_offset, bytes);
 90 |         this->command_queue_.finish();
 91 |     }
 92 | 
 93 |     /*!
 94 |      * \brief Set the kernel arguments that the module does not use.
 95 |      */
 96 |     virtual void set_unused_args() = 0;
 97 | 
 98 |     /*!
 99 |      * \brief Set the mode. SpMV and SpMSpV are merged into a single kernel, and so are all
100 |      *        apply functions; the mode selects which of the merged functions runs.
101 |      */
102 |     virtual void set_mode() = 0;
103 | 
104 |     /*!
105 |      * \brief Load the xclbin file and set up the runtime.
106 |      * \param xclbin_file_path The xclbin file path.
107 |      */
108 |     void set_up_runtime(std::string xclbin_file_path);
109 | };
104 | 
105 | 
106 | /*!
107 |  * Finds the device, creates the context, builds the program from the xclbin file, creates
108 |  * the kernel and the command queue, then calls set_unused_args() and set_mode().
109 |  * Declared inline because it is defined in a header: without it, including this header
110 |  * from more than one translation unit produces multiple definitions at link time.
111 |  */
112 | inline void BaseModule::set_up_runtime(std::string xclbin_file_path) {
113 |     cl_int err;
114 |     // Set this->device_ and this->context_
115 |     if (this->target_ == "sw_emu" || this->target_ == "hw_emu") {
116 |         // Emulation targets are selected through the XCL_EMULATION_MODE environment variable
117 |         setenv("XCL_EMULATION_MODE", this->target_.c_str(), true);
118 |     }
119 |     this->device_ = graphlily::find_device();
120 |     this->context_ = cl::Context(this->device_, NULL, NULL, NULL);
121 |     // Set this->kernel_
122 |     auto file_buf = xcl::read_binary_file(xclbin_file_path);
123 |     cl::Program::Binaries binaries{{file_buf.data(), file_buf.size()}};
124 |     cl::Program program(this->context_, {this->device_}, binaries, NULL, &err);
125 |     if (err != CL_SUCCESS) {
126 |         std::cout << "Failed to program device with xclbin file\n";
127 |     } else {
128 |         std::cout << "Successfully programmed device with xclbin file\n";
129 |     }
130 |     OCL_CHECK(err, this->kernel_ = cl::Kernel(program, this->kernel_name_.c_str(), &err));
131 |     // Set this->command_queue_
132 |     OCL_CHECK(err, this->command_queue_ = cl::CommandQueue(this->context_,
133 |                                                            this->device_,
134 |                                                            CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE,
135 |                                                            &err));
136 |     // Set unused arguments
137 |     this->set_unused_args();
138 |     // Set the mode
139 |     this->set_mode();
140 | }
134 | 
135 | }  // namespace module
136 | }  // namespace graphlily
137 | 
138 | #endif  // GRAPHLILY_BASE_MODULE_H_
139 | 


--------------------------------------------------------------------------------
/graphlily/synthesizer/base_synthesizer.h:
--------------------------------------------------------------------------------
  1 | #ifndef GRAPHLILY_BASE_SYNTHESIZER_H_
  2 | #define GRAPHLILY_BASE_SYNTHESIZER_H_
  3 | 
  4 | #include "graphlily/global.h"
  5 | 
  6 | 
  7 | namespace graphlily {
  8 | namespace synthesizer {
  9 | 
 10 | /*!
 11 |  * \brief Create the project folder and write its makefile: a TARGET line taken from
 12 |  *        t->target_, then the prologue, t->makefile_body_ and the epilogue.
 13 |  */
 14 | template<typename T>
 15 | void _generate_makefile_impl(T* t) {
 16 |     const std::string mkdir_cmd = "mkdir -p " + graphlily::proj_folder_name;
 17 |     std::cout << mkdir_cmd << std::endl;
 18 |     system(mkdir_cmd.c_str());
 19 |     std::ofstream out(graphlily::proj_folder_name + "/makefile");
 20 |     out << "TARGET := " << t->target_ << "\n" << std::endl
 21 |         << graphlily::makefile_prologue << t->makefile_body_ << graphlily::makefile_epilogue;
 22 |     out.close();
 23 | }
 20 | 
 21 | 
 22 | /*!
 23 |  * \brief Create the project folder, let \p t generate its kernel code, header, ini and
 24 |  *        makefile, run `make build` inside the folder, and for the emulation targets copy
 25 |  *        emconfig.json into the current directory. Every shell command is printed first.
 26 |  */
 27 | template<typename T>
 28 | void _synthesize_impl(T* t) {
 29 |     const std::string mkdir_cmd = "mkdir -p " + graphlily::proj_folder_name;
 30 |     std::cout << mkdir_cmd << std::endl;
 31 |     system(mkdir_cmd.c_str());
 32 | 
 33 |     t->link_kernel_code();
 34 |     t->generate_kernel_header();
 35 |     t->generate_kernel_ini();
 36 |     t->generate_makefile();
 37 | 
 38 |     const std::string build_cmd = "cd " + graphlily::proj_folder_name + "; make build";
 39 |     std::cout << build_cmd << std::endl;
 40 |     system(build_cmd.c_str());
 41 | 
 42 |     const bool is_emulation = (t->target_ == "sw_emu") || (t->target_ == "hw_emu");
 43 |     if (is_emulation) {
 44 |         const std::string copy_cmd = "cp " + graphlily::proj_folder_name + "/emconfig.json .";
 45 |         std::cout << copy_cmd << std::endl;
 46 |         system(copy_cmd.c_str());
 47 |     }
 48 | }
 40 | 
 41 | 
 42 | /*!
 43 |  * \brief Base class of the synthesizers: stores the kernel name, the makefile body and the
 44 |  *        target, and drives the generation of the project files and the build.
 45 |  */
 46 | class BaseSynthesizer {
 47 | protected:
 48 |     /*! \brief The kernel name */
 49 |     std::string kernel_name_;
 50 |     /*! \brief The makefile body */
 51 |     std::string makefile_body_;
 52 |     /*! \brief The target; can be sw_emu, hw_emu, hw */
 53 |     std::string target_;
 54 | 
 55 | private:
 56 |     // The helper templates read target_ and makefile_body_
 57 |     template<typename T> friend void _generate_makefile_impl(T* t);
 58 |     template<typename T> friend void _synthesize_impl(T* t);
 59 | 
 60 | public:
 61 |     BaseSynthesizer(std::string kernel_name) {
 62 |         this->kernel_name_ = kernel_name;
 63 |         this->makefile_body_ = graphlily::add_kernel_to_makefile(this->kernel_name_);
 64 |     }
 65 | 
 66 |     /*!
 67 |      * \brief Virtual destructor, so that a derived synthesizer can be destroyed safely
 68 |      *        through a pointer to this base class.
 69 |      */
 70 |     virtual ~BaseSynthesizer() = default;
 71 | 
 72 |     /*!
 73 |      * \brief Get the kernel name.
 74 |      * \return The kernel name.
 75 |      */
 76 |     std::string get_kernel_name() {
 77 |         return this->kernel_name_;
 78 |     }
 79 | 
 80 |     /*!
 81 |      * \brief Set the target; asserts that it is sw_emu, hw_emu or hw.
 82 |      */
 83 |     void set_target(std::string target) {
 84 |        assert(target == "sw_emu" || target == "hw_emu" || target == "hw");
 85 |        this->target_ = target;
 86 |     }
 87 | 
 88 |     /*!
 89 |      * \brief Generate the kernel header file.
 90 |      */
 91 |     virtual void generate_kernel_header() = 0;
 92 | 
 93 |     /*!
 94 |      * \brief Generate the kernel .ini configuration file.
 95 |      */
 96 |     virtual void generate_kernel_ini() = 0;
 97 | 
 98 |     /*!
 99 |      * \brief Link the kernel cpp file to the proj directory.
100 |      */
101 |     virtual void link_kernel_code();
102 | 
103 |     /*!
104 |      * \brief Generate the Makefile.
105 |      */
106 |     virtual void generate_makefile() {
107 |         _generate_makefile_impl<BaseSynthesizer>(this);
108 |     }
109 | 
110 |     /*!
111 |      * \brief Synthesize the kernel according to this->target_.
112 |      */
113 |     virtual void synthesize() {
114 |          _synthesize_impl<BaseSynthesizer>(this);
115 |     }
116 | };
106 | 
107 | 
108 | /*!
109 |  * Copies the hardware headers (*.h), the kernel .cpp file and the kernel .ini file from
110 |  * graphlily/hw/ under the root path into the project folder; each command is printed first.
111 |  * Declared inline because it is defined in a header: without it, including this header
112 |  * from more than one translation unit produces multiple definitions at link time.
113 |  */
114 | inline void BaseSynthesizer::link_kernel_code() {
115 |     std::string command = "cp " + graphlily::root_path + "/graphlily/hw/" + "*.h"
116 |                                 + " " + graphlily::proj_folder_name + "/";
117 |     std::cout << command << std::endl;
118 |     system(command.c_str());
119 | 
120 |     command = "cp " + graphlily::root_path + "/graphlily/hw/" + this->kernel_name_ + ".cpp"
121 |                     + " " + graphlily::proj_folder_name + "/" + this->kernel_name_ + ".cpp";
122 |     std::cout << command << std::endl;
123 |     system(command.c_str());
124 | 
125 |     command = "cp " + graphlily::root_path + "/graphlily/hw/" + this->kernel_name_ + ".ini"
126 |                     + " " + graphlily::proj_folder_name + "/" + this->kernel_name_ + ".ini";
127 |     std::cout << command << std::endl;
128 |     system(command.c_str());
129 | }
124 | 
125 | }  // namespace synthesizer
126 | }  // namespace graphlily
127 | 
128 | #endif  // GRAPHLILY_BASE_SYNTHESIZER_H_
129 | 


--------------------------------------------------------------------------------
/graphlily/synthesizer/overlay_synthesizer.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRAPHLILY_OVERLAY_SYNTHESIZER_H_
 2 | #define GRAPHLILY_OVERLAY_SYNTHESIZER_H_
 3 | 
 4 | #include "graphlily/synthesizer/base_synthesizer.h"
 5 | 
 6 | 
 7 | namespace graphlily {
 8 | namespace synthesizer {
 9 | 
10 | /*!
11 |  * \brief Synthesizer of the "overlay" kernel; keeps the channel count and buffer lengths
12 |  *        that are written into the generated kernel header and ini file.
13 |  */
14 | class OverlaySynthesizer : public BaseSynthesizer {
15 | private:
16 |     // Kernel configuration
17 |     uint32_t num_channels_;
18 |     uint32_t spmv_out_buf_len_;
19 |     uint32_t spmspv_out_buf_len_;
20 |     uint32_t vec_buf_len_;
21 | 
22 | public:
23 |     OverlaySynthesizer(uint32_t num_channels,
24 |                        uint32_t spmv_out_buf_len,
25 |                        uint32_t spmspv_out_buf_len,
26 |                        uint32_t vec_buf_len)
27 |         : BaseSynthesizer("overlay"),
28 |           num_channels_(num_channels),
29 |           spmv_out_buf_len_(spmv_out_buf_len),
30 |           spmspv_out_buf_len_(spmspv_out_buf_len),
31 |           vec_buf_len_(vec_buf_len) {}
32 | 
33 |     void generate_kernel_header() override;
34 | 
35 |     void generate_kernel_ini() override;
36 | };
33 | 
34 | 
35 | /*!
36 |  * Appends the buffer-length constants, the channel count, the total number of SpMV PEs and
37 |  * the closing include guard to <kernel name>.h in the project folder.
38 |  * Declared inline because it is defined in a header: without it, including this header
39 |  * from more than one translation unit produces multiple definitions at link time.
40 |  */
41 | inline void OverlaySynthesizer::generate_kernel_header() {
42 |     std::string command = "mkdir -p " + graphlily::proj_folder_name;
43 |     std::cout << command << std::endl;
44 |     system(command.c_str());
45 |     // Opened in append mode: the generated lines go after whatever the file already contains
46 |     std::ofstream header(graphlily::proj_folder_name + "/" + this->kernel_name_ + ".h", std::ios_base::app);
47 |     header << "const unsigned SPMV_OUT_BUF_LEN = " << this->spmv_out_buf_len_ << ";" << std::endl;
48 |     header << "const unsigned SPMSPV_OUT_BUF_LEN = " << this->spmspv_out_buf_len_ << ";" << std::endl;
49 |     header << "const unsigned VEC_BUF_LEN = " << this->vec_buf_len_ << ";" << std::endl;
50 |     header << "#define NUM_HBM_CHANNEL " << this->num_channels_ << std::endl;
51 |     header << "#define SPMV_NUM_PE_TOTAL " << this->num_channels_ * graphlily::pack_size << std::endl;
52 |     header << std::endl;
53 |     header << "#endif  // GRAPHLILY_HW_OVERLAY_H_" << std::endl;
54 |     header.close();
55 | }
49 | 
50 | 
51 | /*!
52 |  * Writes <kernel name>.ini into the project folder: the [connectivity] section that maps
53 |  * every kernel port to a memory bank, and a [vivado] section that enables retiming.
54 |  * Declared inline because it is defined in a header: without it, including this header
55 |  * from more than one translation unit produces multiple definitions at link time.
56 |  */
57 | inline void OverlaySynthesizer::generate_kernel_ini() {
58 |     std::string command = "mkdir -p " + graphlily::proj_folder_name;
59 |     std::cout << command << std::endl;
60 |     system(command.c_str());
61 |     std::ofstream ini(graphlily::proj_folder_name + "/" + this->kernel_name_ + ".ini");
62 |     ini << "[connectivity]" << std::endl;
63 |     // SpMV: matrix channel i is mapped to HBM[i]
64 |     for (size_t hbm_idx = 0; hbm_idx < this->num_channels_; hbm_idx++) {
65 |         ini << "sp=overlay_1.spmv_channel_" << hbm_idx << "_matrix:HBM["
66 |             << hbm_idx << "]" << std::endl;
67 |     }
68 |     ini << "sp=overlay_1.spmv_vector:HBM[20]" << std::endl;
69 |     ini << "sp=overlay_1.spmv_mask:HBM[21]" << std::endl;
70 |     ini << "sp=overlay_1.spmv_mask_w:HBM[21]" << std::endl;
71 |     ini << "sp=overlay_1.spmv_out:HBM[22]" << std::endl;
72 |     // SpMSpV
73 |     ini << "sp=overlay_1.spmspv_matrix:DDR[0]" << std::endl;
74 |     ini << "sp=overlay_1.spmspv_matrix_indptr:DDR[0]" << std::endl;
75 |     ini << "sp=overlay_1.spmspv_matrix_partptr:DDR[0]" << std::endl;
76 |     ini << "sp=overlay_1.spmspv_vector:HBM[20]" << std::endl;
77 |     ini << "sp=overlay_1.spmspv_mask:HBM[21]" << std::endl;
78 |     ini << "sp=overlay_1.spmspv_out:HBM[22]" << std::endl;
79 |     // enable retiming
80 |     ini << "[vivado]" << std::endl;
81 |     ini << "prop=run.__KERNEL__.{STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS}={-retiming}" << std::endl;
82 |     ini.close();
83 | }
78 | 
79 | 
80 | }  // namespace synthesizer
81 | }  // namespace graphlily
82 | 
83 | #endif  // GRAPHLILY_OVERLAY_SYNTHESIZER_H_
84 | 


--------------------------------------------------------------------------------
/tests/Makefile:
--------------------------------------------------------------------------------
 1 | BUILD_DIR = ./build
 2 | 
 3 | HOST_ARCH = x86
 4 | 
 5 | CXXFLAGS += -Wall -O3 -g -std=c++11
 6 | CXXFLAGS += -I$(GRAPHLILY_ROOT_PATH)
 7 | 
 8 | LDFLAGS += -lrt -lstdc++
 9 | 
10 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/xcl2/xcl2.mk
11 | CXXFLAGS += $(xcl2_CXXFLAGS)
12 | LDFLAGS += $(xcl2_LDFLAGS)
13 | 
14 | include $(GRAPHLILY_ROOT_PATH)/xrt/includes/opencl/opencl.mk
15 | CXXFLAGS += $(opencl_CXXFLAGS)
16 | LDFLAGS += $(opencl_LDFLAGS)
17 | 
18 | CXXFLAGS += -I/work/shared/common/project_build/graphblas/software/cnpy
19 | LDFLAGS += -L/work/shared/common/project_build/graphblas/software/cnpy/build -lcnpy
20 | 
21 | CXXFLAGS += -I/work/shared/common/project_build/graphblas/software/googletest/googletest/include
22 | LDFLAGS += -L/work/shared/common/project_build/graphblas/software/googletest/build/lib -lgtest
23 | 
24 | # Each rule builds one test, moves the binary into $(BUILD_DIR), and runs it from there.
25 | test_io: test_io.cpp $(xcl2_SRCS)
26 | 	g++ $(CXXFLAGS) $< $(xcl2_SRCS) -o $@ $(LDFLAGS)
27 | 	mkdir -p $(BUILD_DIR)
28 | 	mv $@ $(BUILD_DIR)/
29 | 	cd $(BUILD_DIR); ./$@
30 | 
31 | test_module_spmv_spmspv: test_module_spmv_spmspv.cpp $(xcl2_SRCS)
32 | 	g++ $(CXXFLAGS) $< $(xcl2_SRCS) -o $@ $(LDFLAGS)
33 | 	mkdir -p $(BUILD_DIR)
34 | 	mv $@ $(BUILD_DIR)/
35 | 	cd $(BUILD_DIR); ./$@
36 | 
37 | test_module_apply: test_module_apply.cpp $(xcl2_SRCS)
38 | 	g++ $(CXXFLAGS) $< $(xcl2_SRCS) -o $@ $(LDFLAGS)
39 | 	mkdir -p $(BUILD_DIR)
40 | 	mv $@ $(BUILD_DIR)/
41 | 	cd $(BUILD_DIR); ./$@
42 | 
43 | test_app: test_app.cpp $(xcl2_SRCS)
44 | 	g++ $(CXXFLAGS) $< $(xcl2_SRCS) -o $@ $(LDFLAGS)
45 | 	mkdir -p $(BUILD_DIR)
46 | 	mv $@ $(BUILD_DIR)/
47 | 	cd $(BUILD_DIR); ./$@
48 | 
49 | test_pe_cluster: test_pe_cluster.cpp $(xcl2_SRCS)
50 | 	g++ $(CXXFLAGS) $< $(xcl2_SRCS) -o $@ $(LDFLAGS)
51 | 	mkdir -p $(BUILD_DIR)
52 | 	mv $@ $(BUILD_DIR)/
53 | 	cd $(BUILD_DIR); ./$@
54 | 
55 | test_shuffle: test_shuffle.cpp $(xcl2_SRCS)
56 | 	g++ $(CXXFLAGS) $< $(xcl2_SRCS) -o $@ $(LDFLAGS)
57 | 	mkdir -p $(BUILD_DIR)
58 | 	mv $@ $(BUILD_DIR)/
59 | 	cd $(BUILD_DIR); ./$@
59 | 
60 | # None of these targets leaves a file with its own name in this directory, so declare them
61 | # phony; a stray file called `all`, `clean` or `test_*` can then never make them look up to date.
62 | .PHONY: all clean test_io test_module_spmv_spmspv test_module_apply test_app test_pe_cluster test_shuffle
63 | 
64 | # NOTE(review): test_shuffle is not part of `all` -- confirm whether that is intentional.
65 | all: test_io test_module_spmv_spmspv test_module_apply test_app test_pe_cluster
66 | 
67 | clean:
68 | 	rm -rf $(BUILD_DIR)
64 | 


--------------------------------------------------------------------------------
/tests/test_app.cpp:
--------------------------------------------------------------------------------
  1 | #pragma GCC diagnostic push
  2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context"
  3 | #pragma GCC diagnostic ignored "-Wuninitialized"
  4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
  5 | 
  6 | 
  7 | #include "graphlily/synthesizer/overlay_synthesizer.h"
  8 | #include "graphlily/app/bfs.h"
  9 | #include "graphlily/app/pagerank.h"
 10 | #include "graphlily/app/sssp.h"
 11 | 
 12 | #include <cmath>
 13 | #include <iostream>
 14 | #include <ap_fixed.h>
 15 | #include <gtest/gtest.h>
 16 | 
 17 | std::string target = "sw_emu";
 18 | uint32_t spmv_out_buf_len = 1024;
 19 | uint32_t spmspv_out_buf_len = 512;
 20 | uint32_t vec_buf_len = 256;
 21 | 
 22 | 
 23 | // Remove the generated project folder from the current directory (the command is printed first).
 24 | void clean_proj_folder() {
 25 |     const std::string rm_cmd = "rm -rf ./" + graphlily::proj_folder_name;
 26 |     std::cout << rm_cmd << std::endl;
 27 |     system(rm_cmd.c_str());
 28 | }
 28 | 
 29 | 
 30 | // Check that the kernel results have the same length as the reference results and that
 31 | // every element, converted to float, is within 1e-4 of the reference.
 32 | template<typename data_t>
 33 | void verify(std::vector<float, aligned_allocator<float>> &reference_results,
 34 |             std::vector<data_t, aligned_allocator<data_t>> &kernel_results) {
 35 |     ASSERT_EQ(reference_results.size(), kernel_results.size());
 36 |     const float epsilon = 0.0001f;
 37 |     for (size_t i = 0; i < reference_results.size(); i++) {
 38 |         // std::fabs keeps the comparison in floating point. An unqualified abs() can resolve
 39 |         // to the integer overload, which truncates every difference below 1 to 0.
 40 |         ASSERT_TRUE(std::fabs(float(kernel_results[i]) - reference_results[i]) < epsilon);
 41 |     }
 42 | }
 39 | 
 40 | 
 41 | // Build the overlay for the configured target; the tests below load the resulting xclbin.
 42 | TEST(Synthesize, NULL) {
 43 |     graphlily::synthesizer::OverlaySynthesizer overlay_synthesizer(
 44 |         graphlily::num_hbm_channels, spmv_out_buf_len, spmspv_out_buf_len, vec_buf_len);
 45 |     overlay_synthesizer.set_target(target);
 46 |     overlay_synthesizer.synthesize();
 47 | }
 49 | 
 50 | 
 51 | // Run BFS with the pull-push, pull-only and push-only strategies and compare each result
 52 | // with the reference results.
 53 | TEST(BFS, PullPush) {
 54 |     graphlily::app::BFS bfs(graphlily::num_hbm_channels, spmv_out_buf_len,
 55 |         spmspv_out_buf_len, vec_buf_len);
 56 |     bfs.set_target(target);
 57 |     bfs.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
 58 | 
 59 |     bool skip_empty_rows = true;
 60 |     std::string matrix_path = "/work/shared/common/project_build/graphblas/"
 61 |                               "data/sparse_matrix_graph/uniform_10K_10_csr_float32.npz";
 62 |     bfs.load_and_format_matrix(matrix_path, skip_empty_rows);
 63 |     bfs.send_matrix_host_to_device();
 64 | 
 65 |     uint32_t source_vertex = 0;
 66 |     uint32_t num_iters = 10;
 67 |     float pull_push_threshold = 0.1;
 68 | 
 69 |     auto reference_results = bfs.compute_reference_results(source_vertex, num_iters);
 70 | 
 71 |     // pull push
 72 |     auto kernel_results = bfs.pull_push(source_vertex, num_iters, pull_push_threshold);
 73 |     verify<graphlily::val_t>(reference_results, kernel_results);
 74 | 
 75 |     // pull
 76 |     kernel_results = bfs.pull(source_vertex, num_iters);
 77 |     verify<graphlily::val_t>(reference_results, kernel_results);
 78 | 
 79 |     // push
 80 |     kernel_results = bfs.push(source_vertex, num_iters);
 81 |     verify<graphlily::val_t>(reference_results, kernel_results);
 82 | }
 84 | 
 85 | 
 86 | // Run PageRank with the pull strategy and compare the result with the reference results.
 87 | TEST(PageRank, Pull) {
 88 |     graphlily::app::PageRank pagerank(graphlily::num_hbm_channels, spmv_out_buf_len, vec_buf_len);
 89 |     pagerank.set_target(target);
 90 |     pagerank.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
 91 | 
 92 |     float damping = 0.9;
 93 |     bool skip_empty_rows = true;
 94 |     std::string matrix_path = "/work/shared/common/project_build/graphblas/"
 95 |                               "data/sparse_matrix_graph/uniform_10K_10_csr_float32.npz";
 96 |     pagerank.load_and_format_matrix(matrix_path, damping, skip_empty_rows);
 97 |     pagerank.send_matrix_host_to_device();
 98 | 
 99 |     // The kernel runs before the reference computation, as in the original test
100 |     uint32_t num_iters = 10;
101 |     auto kernel_results = pagerank.pull(damping, num_iters);
102 |     auto reference_results = pagerank.compute_reference_results(damping, num_iters);
103 |     verify<graphlily::val_t>(reference_results, kernel_results);
104 | }
103 | 
104 | 
105 | TEST(SSSP, PullPush) {
106 |     graphlily::app::SSSP sssp(graphlily::num_hbm_channels, spmv_out_buf_len,
107 |         spmspv_out_buf_len, vec_buf_len);
108 |     sssp.set_target(target);
109 |     sssp.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
110 | 
111 |     std::string csr_float_npz_path = "/work/shared/common/project_build/graphblas/"
112 |                                      "data/sparse_matrix_graph/uniform_10K_10_csr_float32.npz";
113 | 
114 |     bool skip_empty_rows = true;
115 |     sssp.load_and_format_matrix(csr_float_npz_path, skip_empty_rows);
116 |     sssp.send_matrix_host_to_device();
117 | 
118 |     uint32_t source = 0;
119 |     uint32_t num_iterations = 10;
120 | 
121 |     auto reference_results = sssp.compute_reference_results(source, num_iterations);
122 | 
123 |     // pull push
124 |     float threshold = 0.1;
125 |     auto kernel_results = sssp.pull_push(source, num_iterations, threshold);
126 |     verify<graphlily::val_t>(reference_results, kernel_results);
127 | 
128 |     // pull
129 |     kernel_results = sssp.pull(source, num_iterations);
130 |     verify<graphlily::val_t>(reference_results, kernel_results);
131 | 
132 |     // push
133 |     kernel_results = sssp.push(source, num_iterations);
134 |     verify<graphlily::val_t>(reference_results, kernel_results);
135 | }
136 | 
137 | 
138 | TEST(CleanOverlay, NULL) {
139 |     clean_proj_folder();
140 | }
141 | 
142 | 
143 | int main(int argc, char ** argv) {
144 |     testing::InitGoogleTest(&argc, argv);
145 |     return RUN_ALL_TESTS();
146 | }
147 | 
148 | #pragma GCC diagnostic pop
149 | 


--------------------------------------------------------------------------------
/tests/test_data/create_csr.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import scipy.sparse
 3 | 
 4 | num_rows = 10
 5 | num_cols = 10
 6 | nnz_per_row = 1
 7 | nnz = nnz_per_row * num_rows
 8 | 
 9 | indptr = np.array([i * nnz_per_row for i in range(num_rows + 1)], dtype='uint32')
10 | # indices = np.array([i * num_cols / nnz_per_row % num_cols for i in range(nnz)], dtype='uint32')
11 | indices = np.array([i for i in range(nnz)], dtype='uint32')
12 | data = np.ones(nnz)
13 | 
14 | M = scipy.sparse.csr_matrix((data, indices, indptr), shape=(num_rows, num_cols), dtype=np.float32)
15 | # print(M.toarray())
16 | scipy.sparse.save_npz("eye_10_csr_float32.npz", M)
17 | 


--------------------------------------------------------------------------------
/tests/test_data/eye_10_csr_float32.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cornell-zhang/GraphLily/f3438f9d8de0e5fe47b4348a2b5b2bb6c91c76ce/tests/test_data/eye_10_csr_float32.npz


--------------------------------------------------------------------------------
/tests/test_data/line_8_csr_float32.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cornell-zhang/GraphLily/f3438f9d8de0e5fe47b4348a2b5b2bb6c91c76ce/tests/test_data/line_8_csr_float32.npz


--------------------------------------------------------------------------------
/tests/test_module_apply.cpp:
--------------------------------------------------------------------------------
  1 | #pragma GCC diagnostic push
  2 | #pragma GCC diagnostic ignored "-Wint-in-bool-context"
  3 | #pragma GCC diagnostic ignored "-Wuninitialized"
  4 | #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
  5 | 
  6 | #include "graphlily/synthesizer/overlay_synthesizer.h"
  7 | 
  8 | #include "graphlily/module/assign_vector_dense_module.h"
  9 | #include "graphlily/module/assign_vector_sparse_module.h"
 10 | #include "graphlily/module/add_scalar_vector_dense_module.h"
 11 | 
 12 | #include <ap_fixed.h>
 13 | #include <gtest/gtest.h>
 14 | 
 15 | #include "graphlily/global.h"
 16 | #include "graphlily/io/data_loader.h"
 17 | #include "graphlily/io/data_formatter.h"
 18 | 
 19 | 
// Shared configuration for every test in this file.
// `target` is handed to set_target() and is also spliced into the xclbin path
// ("build_dir.<target>/fused.xclbin").
std::string target = "sw_emu";
// Buffer lengths passed to the OverlaySynthesizer constructor in the
// Synthesize test below.
uint32_t spmv_out_buf_len = 1024;
uint32_t spmspv_out_buf_len = 512;
uint32_t vec_buf_len = 256;
 24 | 
 25 | 
 26 | void clean_proj_folder() {
 27 |     std::string command = "rm -rf ./" + graphlily::proj_folder_name;
 28 |     std::cout << command << std::endl;
 29 |     system(command.c_str());
 30 | }
 31 | 
 32 | 
 33 | template<typename data_t>
 34 | void verify(std::vector<float, aligned_allocator<float>> &reference_results,
 35 |             std::vector<data_t, aligned_allocator<data_t>> &kernel_results) {
 36 |     ASSERT_EQ(reference_results.size(), kernel_results.size());
 37 |     float epsilon = 0.0001;
 38 |     for (size_t i = 0; i < reference_results.size(); i++) {
 39 |         ASSERT_TRUE(abs(float(kernel_results[i]) - reference_results[i]) < epsilon);
 40 |     }
 41 | }
 42 | 
 43 | 
 44 | TEST(Synthesize, NULL) {
 45 |     graphlily::synthesizer::OverlaySynthesizer synthesizer(graphlily::num_hbm_channels,
 46 |                                                            spmv_out_buf_len,
 47 |                                                            spmspv_out_buf_len,
 48 |                                                            vec_buf_len);
 49 |     synthesizer.set_target(target);
 50 |     synthesizer.synthesize();
 51 | }
 52 | 
 53 | 
 54 | TEST(AddScalarVectorDense, Basic) {
 55 |     graphlily::module::eWiseAddModule<graphlily::val_t> module;
 56 |     module.set_target(target);
 57 |     module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
 58 | 
 59 |     uint32_t length = 128;
 60 |     graphlily::val_t val = 1;
 61 |     float val_float = float(val);
 62 |     std::vector<float, aligned_allocator<float>> in_float(length);
 63 |     std::generate(in_float.begin(), in_float.end(), [&](){return float(rand() % 10) / 100;});
 64 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> in(in_float.begin(), in_float.end());
 65 | 
 66 |     module.send_in_host_to_device(in);
 67 |     module.allocate_out_buf(length);
 68 |     module.run(length, val);
 69 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> kernel_out =
 70 |         module.send_out_device_to_host();
 71 |     std::vector<float, aligned_allocator<float>> reference_out =
 72 |         module.compute_reference_results(in_float, length, val_float);
 73 | 
 74 |     verify<graphlily::val_t>(reference_out, kernel_out);
 75 | }
 76 | 
 77 | 
 78 | TEST(AssignVectorDense, Basic) {
 79 |     graphlily::module::AssignVectorDenseModule<graphlily::val_t> module;
 80 |     module.set_target(target);
 81 |     module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
 82 | 
 83 |     uint32_t length = 128;
 84 |     graphlily::val_t val = 23;
 85 |     float val_float = float(val);
 86 |     std::vector<float, aligned_allocator<float>> mask_float(length);
 87 |     std::generate(mask_float.begin(), mask_float.end(), [&](){return float(rand() % 2);});
 88 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> mask(mask_float.begin(),
 89 |                                                                             mask_float.end());
 90 |     std::vector<float, aligned_allocator<float>> reference_inout(length);
 91 |     std::generate(reference_inout.begin(), reference_inout.end(), [&](){return float(rand() % 2);});
 92 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> kernel_inout(reference_inout.begin(),
 93 |                                                                                     reference_inout.end());
 94 | 
 95 |     module.set_mask_type(graphlily::kMaskWriteToOne);
 96 |     module.send_mask_host_to_device(mask);
 97 |     module.send_inout_host_to_device(kernel_inout);
 98 |     module.run(length, val);
 99 |     kernel_inout = module.send_inout_device_to_host();
100 |     module.compute_reference_results(mask_float, reference_inout, length, val_float);
101 | 
102 |     verify<graphlily::val_t>(reference_inout, kernel_inout);
103 | }
104 | 
105 | 
106 | TEST(AssignVectorSparseNoNewFrontier, Basic) {
107 |     bool generate_new_frontier = false;
108 |     graphlily::module::AssignVectorSparseModule<graphlily::val_t,
109 |         graphlily::idx_val_t> module(generate_new_frontier);
110 |     module.set_target(target);
111 |     module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
112 | 
113 |     float mask_sparsity = 0.9;
114 |     uint32_t inout_size = 8192;
115 |     graphlily::val_t val = 3;
116 |     float val_float = float(val);
117 |     unsigned length = (unsigned)floor(inout_size * (1 - mask_sparsity));
118 |     unsigned mask_indices_increment = inout_size / length;
119 |     graphlily::aligned_sparse_float_vec_t mask_float(length + 1);
120 |     for (size_t i = 0; i < length; i++) {
121 |         mask_float[i + 1].val = float(rand() % 10);
122 |         mask_float[i + 1].index = i * mask_indices_increment;
123 |     }
124 |     mask_float[0].val = 0;
125 |     mask_float[0].index = length;
126 |     std::vector<graphlily::idx_val_t, aligned_allocator<graphlily::idx_val_t>> mask(length + 1);
127 |     for (size_t i = 0; i < length + 1; i++) {
128 |         mask[i].val = mask_float[i].val;
129 |         mask[i].index = mask_float[i].index;
130 |     }
131 |     graphlily::aligned_dense_float_vec_t reference_inout(inout_size);
132 |     std::generate(reference_inout.begin(), reference_inout.end(), [&](){return (rand() % 10);});
133 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> kernel_inout(reference_inout.begin(),
134 |                                                                                     reference_inout.end());
135 | 
136 |     module.send_mask_host_to_device(mask);
137 |     module.send_inout_host_to_device(kernel_inout);
138 |     module.run(val);
139 |     kernel_inout = module.send_inout_device_to_host();
140 |     module.compute_reference_results(mask_float, reference_inout, val_float);
141 | 
142 |     verify<graphlily::val_t>(reference_inout, kernel_inout);
143 | }
144 | 
145 | 
146 | TEST(AssignVectorSparseNewFrontier, Basic) {
147 |     bool generate_new_frontier = true;
148 |     graphlily::module::AssignVectorSparseModule<graphlily::val_t,
149 |         graphlily::idx_val_t> module(generate_new_frontier);
150 |     module.set_target(target);
151 |     module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
152 | 
153 |     float mask_sparsity = 0.9;
154 |     uint32_t inout_size = 128;
155 |     float inf;
156 |     if (std::is_same<graphlily::val_t, float>::value) {
157 |         inf = float(graphlily::FLOAT_INF);
158 |     } else if (std::is_same<graphlily::val_t, unsigned>::value) {
159 |         inf = float(graphlily::UINT_INF);
160 |     } else {
161 |         inf = float(graphlily::UFIXED_INF);
162 |     }
163 |     unsigned length = (unsigned)floor(inout_size * (1 - mask_sparsity));
164 |     unsigned mask_indices_increment = inout_size / length;
165 |     graphlily::aligned_sparse_float_vec_t mask_float(length + 1);
166 |     for (size_t i = 0; i < length; i++) {
167 |         mask_float[i + 1].val = float(rand() % 10);
168 |         mask_float[i + 1].index = i * mask_indices_increment;
169 |     }
170 |     mask_float[0].val = 0;
171 |     mask_float[0].index = length;
172 |     std::vector<graphlily::idx_val_t, aligned_allocator<graphlily::idx_val_t>> mask(length + 1);
173 |     for (size_t i = 0; i < length + 1; i++) {
174 |         mask[i].val = mask_float[i].val;
175 |         mask[i].index = mask_float[i].index;
176 |     }
177 | 
178 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> kernel_inout(inout_size);
179 |     std::generate(kernel_inout.begin(), kernel_inout.end(),
180 |         [&](){return (((rand() % 10) > 5) ? 5 : inf);});
181 |     graphlily::aligned_dense_float_vec_t reference_inout(kernel_inout.begin(), kernel_inout.end());
182 | 
183 |     std::vector<graphlily::idx_val_t, aligned_allocator<graphlily::idx_val_t>> kernel_new_frontier;
184 |     graphlily::aligned_sparse_float_vec_t reference_new_frontier;
185 | 
186 |     module.send_mask_host_to_device(mask);
187 |     module.send_inout_host_to_device(kernel_inout);
188 |     module.run();
189 |     kernel_inout = module.send_inout_device_to_host();
190 |     kernel_new_frontier = module.send_new_frontier_device_to_host();
191 |     module.compute_reference_results(mask_float, reference_inout, reference_new_frontier);
192 | 
193 |     // Verify kernel_inout
194 |     verify<graphlily::val_t>(reference_inout, kernel_inout);
195 | 
196 |     // Verify kernel_new_frontier
197 |     graphlily::aligned_dense_float_vec_t dense_ref_nf =
198 |         graphlily::convert_sparse_vec_to_dense_vec<graphlily::aligned_sparse_float_vec_t,
199 |             graphlily::aligned_dense_float_vec_t, float>(reference_new_frontier, inout_size, 0);
200 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> dense_knl_nf =
201 |         graphlily::convert_sparse_vec_to_dense_vec<
202 |             std::vector<graphlily::idx_val_t, aligned_allocator<graphlily::idx_val_t>>,
203 |             std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>>, graphlily::val_t>(
204 |                 kernel_new_frontier, inout_size, 0);
205 |     verify<graphlily::val_t>(dense_ref_nf, dense_knl_nf);
206 | }
207 | 
208 | 
209 | TEST(CopyBufferBindBuffer, Basic) {
210 |     graphlily::module::AssignVectorDenseModule<graphlily::val_t> module;
211 |     module.set_target(target);
212 |     module.set_up_runtime("./" + graphlily::proj_folder_name + "/build_dir." + target + "/fused.xclbin");
213 | 
214 |     uint32_t length = 128;
215 |     std::vector<float, aligned_allocator<float>> mask_float(length);
216 |     std::generate(mask_float.begin(), mask_float.end(), [&](){return float(rand() % 2);});
217 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> mask(mask_float.begin(),
218 |                                                                             mask_float.end());
219 |     std::vector<float, aligned_allocator<float>> inout_float(length);
220 |     std::fill(inout_float.begin(), inout_float.end(), 0);
221 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> inout(inout_float.begin(),
222 |                                                                              inout_float.end());
223 | 
224 |     module.set_mask_type(graphlily::kMaskWriteToOne);
225 | 
226 |     /*----------------------------- Copy buffer -------------------------------*/
227 |     {
228 |     module.send_mask_host_to_device(mask);
229 |     module.send_inout_host_to_device(inout);
230 |     module.copy_buffer_device_to_device(module.mask_buf, module.inout_buf, sizeof(graphlily::val_t) * length);
231 |     inout = module.send_inout_device_to_host();
232 |     verify<graphlily::val_t>(mask_float, inout);
233 |     }
234 | 
235 |     /*----------------------------- Bind buffer -------------------------------*/
236 |     {
237 |     std::vector<float, aligned_allocator<float>> x_float(length);
238 |     std::fill(x_float.begin(), x_float.end(), 0);
239 |     std::vector<graphlily::val_t, aligned_allocator<graphlily::val_t>> x(x_float.begin(), x_float.end());
240 |     cl_mem_ext_ptr_t x_ext;
241 |     x_ext.obj = x.data();
242 |     x_ext.param = 0;
243 |     x_ext.flags = graphlily::HBM[graphlily::num_hbm_channels + 1];
244 |     cl::Device device = graphlily::find_device();
245 |     cl::Context context = cl::Context(device, NULL, NULL, NULL);
246 |     cl::Buffer x_buf = cl::Buffer(context,
247 |                                   CL_MEM_EXT_PTR_XILINX | CL_MEM_USE_HOST_PTR,
248 |                                   sizeof(graphlily::val_t) * length,
249 |                                   &x_ext);
250 |     cl::CommandQueue command_queue = cl::CommandQueue(context, device);
251 | 
252 |     module.send_mask_host_to_device(mask);
253 |     module.bind_inout_buf(x_buf);
254 |     module.run(length, 2);
255 |     command_queue.enqueueMigrateMemObjects({x_buf}, CL_MIGRATE_MEM_OBJECT_HOST);
256 |     command_queue.finish();
257 | 
258 |     module.compute_reference_results(mask_float, inout_float, length, 2);
259 |     verify<graphlily::val_t>(inout_float, x);
260 |     }
261 | }
262 | 
263 | 
264 | TEST(Clean, NULL) {
265 |     clean_proj_folder();
266 | }
267 | 
268 | 
269 | int main(int argc, char ** argv) {
270 |     testing::InitGoogleTest(&argc, argv);
271 |     return RUN_ALL_TESTS();
272 | }
273 | 
274 | #pragma GCC diagnostic pop
275 | 


--------------------------------------------------------------------------------
/tests/testbench/pe_tb.cpp:
--------------------------------------------------------------------------------
  1 | #include "pe_tb.h"
  2 | #include "ufixed_pe_fwd.h"
  3 | #include "hls_stream.h"
  4 | #include <iostream>
  5 | #include <iomanip>
  6 | 
// Streams the pre-loaded test inputs out to the lanes.
//
// For every buffer slot i in [0, IN_BUF_SIZE), element input_buffer[lane][i]
// is written to output_stream[lane] for each of the num_lanes lanes. Once all
// slots have been sent, the total element count (IN_BUF_SIZE * num_lanes) is
// written exactly once to output_npld_stream.
//
// num_lanes:          number of lanes (first dimension of input_buffer).
// input_buffer:       per-lane input data.
// output_stream:      one output stream per lane.
// output_npld_stream: receives the total number of elements sent.
template<unsigned num_lanes>
static void data_feeder(
    PE_I_T input_buffer[num_lanes][IN_BUF_SIZE],
    hls::stream<PE_I_T> output_stream[num_lanes],
    hls::stream<IDX_T> &output_npld_stream
) {
    loop_data_feeder:
    for (unsigned i = 0; i < IN_BUF_SIZE; i++) {
        #pragma HLS pipeline II=1
        // The lane loop is fully unrolled: one element per lane per iteration.
        for (unsigned Lid = 0; Lid < num_lanes; Lid++)  {
            #pragma HLS unroll
            output_stream[Lid].write(input_buffer[Lid][i]);
        }
    }
    // Sent after the data, not before it.
    output_npld_stream.write(IN_BUF_SIZE * num_lanes);
}
 23 | 
 24 | 
// Dataflow region of the testbench: data_feeder pushes input_buffer into one
// stream per PE, and ufixed_pe_cluster_spmv_uram consumes those streams
// together with output_buffer.
//
// input_buffer:  per-PE inputs, read by data_feeder.
// output_buffer: per-PE banks handed to the PE cluster.
static void main_dataflow(
    PE_I_T input_buffer[NUM_PE][IN_BUF_SIZE],
    VAL_T output_buffer[NUM_PE][BANK_SIZE]
) {
    // Streams connecting the feeder to the PE cluster.
    hls::stream<PE_I_T> DF_to_PE_stream[NUM_PE];
    hls::stream<IDX_T> DF_to_PE_npld_stream;
    #pragma HLS stream variable=DF_to_PE_stream depth=8
    #pragma HLS stream variable=DF_to_PE_npld_stream depth=2

    #pragma HLS dataflow

    data_feeder<NUM_PE>(input_buffer, DF_to_PE_stream, DF_to_PE_npld_stream);

    // NOTE(review): MULADD and the literal 0 are presumably the operation
    // selector and an initial/zero value -- confirm against the declaration
    // in ufixed_pe_fwd.h.
    ufixed_pe_cluster_spmv_uram<VAL_T, char, PE_I_T, NUM_PE, BANK_ID_NBITS, BANK_SIZE>(
        DF_to_PE_stream,
        output_buffer,
        MULADD,
        0,
        DF_to_PE_npld_stream
    );
}
 46 | 
extern "C" {
// Top-level testbench kernel for the PE cluster.
//
// Steps: (1) zero the on-chip output buffer, (2) load the on-chip input buffer
// from the three input arrays, (3) run main_dataflow, (4) write the output
// buffer back to result_gmem.
//
// test_addr_gmem: NUM_PE * IN_BUF_SIZE indices, one per input element.
// test_mat_gmem:  NUM_PE * IN_BUF_SIZE values stored into data.mat_val.
// test_vec_gmem:  NUM_PE * IN_BUF_SIZE values stored into data.vec_val.
// result_gmem:    NUM_PE * BANK_SIZE results; bank entry i of PE p is written
//                 to result_gmem[i * NUM_PE + p].
void pe_tb(
    const IDX_T *test_addr_gmem, //0
    const VAL_T *test_mat_gmem,  //1
    const VAL_T *test_vec_gmem,  //2
    VAL_T *result_gmem           //3
) {
    #pragma HLS interface m_axi port=test_addr_gmem offset=slave bundle=gmem0
    #pragma HLS interface m_axi port=test_mat_gmem  offset=slave bundle=gmem1
    #pragma HLS interface m_axi port=test_vec_gmem  offset=slave bundle=gmem2
    #pragma HLS interface m_axi port=result_gmem    offset=slave bundle=gmem3

    #pragma HLS interface s_axilite port=test_addr_gmem bundle=control
    #pragma HLS interface s_axilite port=test_mat_gmem  bundle=control
    #pragma HLS interface s_axilite port=test_vec_gmem  bundle=control
    #pragma HLS interface s_axilite port=result_gmem    bundle=control

    #pragma HLS interface s_axilite port=return bundle=control

    // input buffer
    PE_I_T input_buffer[NUM_PE][IN_BUF_SIZE];
    #pragma HLS array_partition variable=input_buffer dim=1 complete
    #pragma HLS resource variable=input_buffer core=RAM_1P

    // output buffer
    VAL_T output_buffer[NUM_PE][BANK_SIZE];
    #pragma HLS array_partition variable=output_buffer dim=1 complete
    #pragma HLS resource variable=output_buffer core=RAM_2P latency=2

    // reset output buffer
    loop_reset_ob:
    for (unsigned i = 0; i < BANK_SIZE; i++) {
        #pragma HLS pipeline II=1
        for (unsigned PEid = 0; PEid < NUM_PE; PEid++) {
            #pragma HLS unroll
            output_buffer[PEid][i] = 0;
        }
    }

    // initialize input buffer
    // The flat index i maps to PE (i / IN_BUF_SIZE) and slot (i % IN_BUF_SIZE),
    // i.e. each PE's inputs are stored contiguously in the input arrays.
    loop_ini_ib:
    for (unsigned i = 0; i < NUM_PE * IN_BUF_SIZE; i++) {
        #pragma HLS pipeline II=1
        input_buffer[i / IN_BUF_SIZE][i % IN_BUF_SIZE].index = test_addr_gmem[i];
        input_buffer[i / IN_BUF_SIZE][i % IN_BUF_SIZE].data.mat_val = test_mat_gmem[i];
        input_buffer[i / IN_BUF_SIZE][i % IN_BUF_SIZE].data.vec_val = test_vec_gmem[i];
    }

    // run main dataflow
    main_dataflow(input_buffer, output_buffer);

    // write back to results
    // Results are interleaved across PEs: entry i of every bank is written
    // before entry i + 1 of any bank.
    loop_wb_2:
    for (unsigned i = 0; i < BANK_SIZE; i++) {
        loop_wb_1:
        for (unsigned PEid = 0; PEid < NUM_PE; PEid++) {
            #pragma HLS pipeline II=1
            result_gmem[i * NUM_PE + PEid] = output_buffer[PEid][i];
        }
    }

} // kernel
} // extern "C"
110 | 


--------------------------------------------------------------------------------
/tests/testbench/pe_tb.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRAPHLILY_TEST_TESTBENCH_PE_TB_H_
 2 | #define GRAPHLILY_TEST_TESTBENCH_PE_TB_H_
 3 | 
 4 | #include "ap_fixed.h"
 5 | 
// Operation selectors; pe_tb.cpp passes MULADD to the PE cluster.
// NOTE(review): the names suggest (multiply, add), (and, or) and (add, min)
// operator pairs -- confirm against the PE implementation.
#define MULADD 0
#define ANDOR  1
#define ADDMIN 2

// data types
// IDX_T: index type. VAL_T: 32-bit unsigned fixed-point value with 8 integer
// bits, AP_RND rounding and AP_SAT saturation.
typedef unsigned IDX_T;
typedef ap_ufixed<32, 8, AP_RND, AP_SAT> VAL_T;

// Value part of one PE input: a matrix value and a vector value.
typedef struct pe_input_val_type {
    VAL_T mat_val;
    VAL_T vec_val;
} PE_I_VAL_T;

// One PE input element: an index plus its value pair.
typedef struct pe_input_type {
    IDX_T index;
    PE_I_VAL_T data;
} PE_I_T;
23 | 
24 | // Below configurations will be overwritten by the compiler
25 | // const unsigned NUM_PE =
26 | // const unsigned BANK_ID_NBITS =
27 | // const unsigned BANK_SIZE =
28 | // const unsigned IN_BUF_SIZE =
29 | // #endif // GRAPHLILY_TEST_TESTBENCH_PE_TB_H_
30 | 


--------------------------------------------------------------------------------
/tests/testbench/shuffle_tb.h:
--------------------------------------------------------------------------------
 1 | #ifndef GRAPHLILY_TEST_TESTBENCH_SHUFFLE_TB_H_
 2 | #define GRAPHLILY_TEST_TESTBENCH_SHUFFLE_TB_H_
 3 | 
// data types
// Data part of one shuffle element: a uuid plus one padding word.
typedef struct shuffle_inout_data_type {
    unsigned uuid;
    unsigned padding;
} SF_IO_DATA_T;

// One shuffle input/output element: an index plus its data.
typedef struct shuffle_inout_type {
    unsigned index;
    SF_IO_DATA_T data;
} SF_IO_T;

// Testbench interface element: an (index, uuid) pair without padding.
typedef struct testbench_interfece_type {
    unsigned index;
    unsigned uuid;
} TB_IFC_T;

// uuid value 0 is reserved as the "invalid" marker.
const unsigned INVALID_UUID = 0;

// Lane configuration of the shuffle under test.
// NOTE(review): ADDR_MASK equals NUM_OUT_LANES - 1, so it presumably selects
// the low bits of an index as the output lane -- confirm in shuffle_tb.cpp.
const unsigned NUM_IN_LANES = 8;
const unsigned NUM_OUT_LANES = 8;
const unsigned ADDR_MASK = 7;
25 | 
26 | // Below configurations will be overwritten by the compiler
27 | // const unsigned IN_BUF_SIZE =
28 | // const unsigned OUT_BUF_SIZE =
29 | // #endif // GRAPHLILY_TEST_TESTBENCH_SHUFFLE_TB_H_
30 | 


--------------------------------------------------------------------------------
/xrt/includes/cmdparser/cmdlineparser.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2019, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | #include "cmdlineparser.h"
 30 | #include "logger.h"
 31 | #include <assert.h>
 32 | #include <fstream>
 33 | #include <iostream>
 34 | #include <stdlib.h>
 35 | 
 36 | namespace sda {
 37 | namespace utils {
 38 | 
 39 | bool is_file(const std::string &name) {
 40 |     ifstream f(name.c_str());
 41 |     if (f.good()) {
 42 |         f.close();
 43 |         return true;
 44 |     } else {
 45 |         f.close();
 46 |         return false;
 47 |     }
 48 | }
 49 | 
 50 | bool is_number(const std::string &s) {
 51 |     std::string::const_iterator it = s.begin();
 52 |     while (it != s.end() && std::isdigit(*it))
 53 |         ++it;
 54 |     return !s.empty() && it == s.end();
 55 | }
 56 | 
 57 | bool starts_with(const string &src, const string &sub) {
 58 |     return (src.find(sub) == 0);
 59 | }
 60 | 
 61 | CmdLineParser::CmdLineParser() {
 62 |     // TODO Auto-generated constructor stub
 63 |     m_strDefaultKey = "";
 64 |     m_appname = "application.exe";
 65 |     addSwitch("--help", "-h", "prints this help list", "", true);
 66 | }
 67 | 
 68 | /*
 69 | CmdLineParser::CmdLineParser(int argc, char* argv[]) {
 70 |     // TODO Auto-generated constructor stub
 71 |     assert(parse(argc, argv) > 0);
 72 | 
 73 | }
 74 | */
 75 | 
 76 | CmdLineParser::~CmdLineParser() {
 77 |     // TODO Auto-generated destructor stub
 78 |     for (size_t i = 0; i < m_vSwitches.size(); i++) {
 79 |         delete m_vSwitches[i];
 80 |         m_vSwitches[i] = NULL;
 81 |     }
 82 | 
 83 |     m_vSwitches.resize(0);
 84 | }
 85 | 
 86 | bool CmdLineParser::addSwitch(const CmdSwitch &s) {
 87 | 
 88 |     CmdSwitch cmd = s;
 89 | 
 90 |     if (cmd.desc.length() == 0) {
 91 |         LogError("No description provided!");
 92 |         return false;
 93 |     }
 94 | 
 95 |     //check input
 96 |     if (cmd.key.find("--") != 0 || cmd.key.length() < 3) {
 97 |         LogError("The input key is invalid. Please start with -- and keep a "
 98 |                  "length >= 3");
 99 |         return false;
100 |     }
101 | 
102 |     if (m_mapKeySwitch.find(cmd.key) != m_mapKeySwitch.end()) {
103 |         LogError("This key %s is taken already!", cmd.key.c_str());
104 |         return false;
105 |     }
106 | 
107 |     if (cmd.shortcut.length() == 0) {
108 | 
109 |         string temp = "-" + cmd.key[2];
110 | 
111 |         int i = 3;
112 |         while (m_mapShortcutKeys.find(temp) != m_mapShortcutKeys.end() &&
113 |                (size_t)i < cmd.key.length()) {
114 |             temp = "-" + s.key[i];
115 |             i++;
116 |         }
117 | 
118 |         cmd.shortcut = temp;
119 |         LogInfo("Automatic shortcut assigned %s to %s",
120 |                 temp.c_str(),
121 |                 cmd.key.c_str());
122 |     }
123 | 
124 |     if (s.istoggle) {
125 |         cmd.default_value = string("false");
126 |         cmd.value = cmd.default_value;
127 |         cmd.isvalid = true;
128 |     } else {
129 |         cmd.value = cmd.default_value;
130 |         cmd.isvalid = false;
131 |     }
132 | 
133 |     //add
134 |     CmdSwitch *pcmd = new CmdSwitch(cmd);
135 |     m_vSwitches.push_back(pcmd);
136 |     m_mapShortcutKeys[s.shortcut] = cmd.key;
137 |     m_mapKeySwitch[s.key] = pcmd;
138 | 
139 |     return true;
140 | }
141 | 
142 | bool CmdLineParser::addSwitch(const string &name,
143 |                               const string &shortcut,
144 |                               const string &desc,
145 |                               const string &default_value,
146 |                               bool istoggle) {
147 | 
148 |     CmdSwitch s;
149 |     s.key = name;
150 |     s.shortcut = shortcut;
151 |     s.desc = desc;
152 |     s.default_value = default_value;
153 |     s.istoggle = istoggle;
154 | 
155 |     return addSwitch(s);
156 | }
157 | 
158 | bool CmdLineParser::setDefaultKey(const char *key) {
159 |     string strKey(key);
160 |     if (!starts_with(strKey, "--"))
161 |         strKey = "--" + strKey;
162 | 
163 |     if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end()) {
164 |         CmdSwitch *pcmd = m_mapKeySwitch[m_strDefaultKey];
165 |         if (pcmd != NULL) {
166 |             if (pcmd->istoggle) {
167 |                 LogError("Boolean command line options can not be used as "
168 |                          "default keys");
169 |                 return false;
170 |             }
171 |         }
172 | 
173 |         //set default key
174 |         m_strDefaultKey = strKey;
175 |         return true;
176 |     } else
177 |         return false;
178 | }
179 | 
180 | int CmdLineParser::parse(int argc, char *argv[]) {
181 | 
182 |     int i = 0;
183 |     int ctOptions = 0;
184 |     while (i < argc) {
185 |         string key, val;
186 |         bool iskey = false;
187 |         string token = string(argv[i]);
188 | 
189 |         bool isNextTokenKey = false;
190 |         if (i + 1 < argc) {
191 |             string peeknext = string(argv[i + 1]);
192 |             if (starts_with(peeknext, "-") || starts_with(peeknext, "--")) {
193 |                 string fullkey;
194 |                 isNextTokenKey = token_to_fullkeyname(peeknext, fullkey);
195 |             }
196 |         }
197 | 
198 |         //full-key
199 |         if (starts_with(token, string("--"))) {
200 |             if (m_mapKeySwitch.find(token) == m_mapKeySwitch.end()) {
201 |                 LogError("Unrecognized key passed %s", token.c_str());
202 |                 printHelp();
203 |                 return -1;
204 |             }
205 | 
206 |             key = token;
207 |             iskey = true;
208 |         }
209 |         //shortcut
210 |         else if (starts_with(token, "-")) {
211 |             if (m_mapShortcutKeys.find(token) == m_mapShortcutKeys.end()) {
212 |                 LogError("Unrecognized shortcut key passed %s", token.c_str());
213 |                 printHelp();
214 |                 return -1;
215 |             }
216 | 
217 |             key = m_mapShortcutKeys[token];
218 |             iskey = true;
219 |         }
220 |         //default key, the value for default key is the last argument
221 |         else if (isNextTokenKey == false && m_strDefaultKey.length() > 0 &&
222 |                  i == argc - 2) {
223 |             if (m_mapKeySwitch.find(m_strDefaultKey) == m_mapKeySwitch.end()) {
224 |                 LogError("Unrecognized default key %s",
225 |                          m_strDefaultKey.c_str());
226 |                 printHelp();
227 |                 return -1;
228 |             }
229 | 
230 |             LogInfo("Using default key: %s", m_strDefaultKey.c_str());
231 |             key = m_strDefaultKey;
232 |             iskey = true;
233 |         }
234 | 
235 |         //if iskey and needs param then read it
236 |         if (iskey) {
237 |             ctOptions++;
238 | 
239 |             if (key == "--help") {
240 |                 printHelp();
241 |                 return 1;
242 |             }
243 | 
244 |             //fetch value
245 |             CmdSwitch *pcmd = m_mapKeySwitch[key];
246 | 
247 |             //read next
248 |             if (pcmd->istoggle) {
249 |                 pcmd->value = string("true");
250 |                 pcmd->isvalid = true;
251 |             } else {
252 |                 i++;
253 |                 pcmd->value = string(argv[i]);
254 |                 pcmd->isvalid = true;
255 |             }
256 |         }
257 | 
258 |         //next token
259 |         i++;
260 |     }
261 | 
262 |     //capture real app name
263 |     if (argc > 0) {
264 |         m_appname = string(argv[0]);
265 |     }
266 | 
267 |     return ctOptions;
268 | }
269 | 
270 | bool CmdLineParser::token_to_fullkeyname(const string &token, string &fullkey) {
271 | 
272 |     fullkey = "";
273 |     int ctDashes = 0;
274 |     if (starts_with(token, string("--")))
275 |         ctDashes = 2;
276 |     else if (starts_with(token, string("-")))
277 |         ctDashes = 1;
278 | 
279 |     if (ctDashes == 0)
280 |         return false;
281 | 
282 |     if (ctDashes == 2) {
283 |         if (m_mapKeySwitch.find(token) == m_mapKeySwitch.end()) {
284 |             LogError("Unrecognized key passed %s", token.c_str());
285 |             return false;
286 |         }
287 |         fullkey = token;
288 |     } else if (ctDashes == 1) {
289 |         if (m_mapShortcutKeys.find(token) == m_mapShortcutKeys.end()) {
290 |             LogError("Unrecognized shortcut key passed %s", token.c_str());
291 |             return false;
292 |         }
293 | 
294 |         fullkey = m_mapShortcutKeys[token];
295 |     }
296 | 
297 |     return (fullkey.length() > 0);
298 | }
299 | 
300 | string CmdLineParser::value(const char *key) {
301 | 
302 |     string strKey(key);
303 |     if (!starts_with(strKey, "--"))
304 |         strKey = "--" + strKey;
305 | 
306 |     if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end())
307 |         return m_mapKeySwitch[strKey]->value;
308 |     else {
309 |         LogWarn("The input key %s is not recognized!", strKey.c_str());
310 |         return string("");
311 |     }
312 | }
313 | 
314 | int CmdLineParser::value_to_int(const char *key) {
315 |     string strVal = value(key);
316 |     if (strVal.length() == 0 || !is_number(strVal))
317 |         return -1;
318 |     return atoi(strVal.c_str());
319 | }
320 | 
321 | double CmdLineParser::value_to_double(const char *key) {
322 |     string strVal = value(key);
323 |     if (strVal.length() == 0)
324 |         return -1;
325 |     return atof(strVal.c_str());
326 | }
327 | 
328 | bool CmdLineParser::isValid(const char *key) {
329 |     string strKey(key);
330 |     if (!starts_with(strKey, "--"))
331 |         strKey = "--" + strKey;
332 | 
333 |     if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end())
334 |         return m_mapKeySwitch[strKey]->isvalid;
335 |     else {
336 |         LogWarn("The input key %s is not recognized!", strKey.c_str());
337 |         return false;
338 |     }
339 | }
340 | 
341 | void CmdLineParser::printHelp() {
342 |     printf("===========================================================\n");
343 |     string strAllShortcuts = "";
344 |     for (size_t i = 0; i < m_vSwitches.size(); i++) {
345 |         CmdSwitch *pcmd = m_vSwitches[i];
346 |         if (pcmd && pcmd->shortcut.length() > 0)
347 |             strAllShortcuts = strAllShortcuts + pcmd->shortcut;
348 |     }
349 |     //example
350 |     printf("Usage: %s -[%s]\n\n", m_appname.c_str(), strAllShortcuts.c_str());
351 | 
352 |     //row by row
353 |     for (size_t i = 0; i < m_vSwitches.size(); i++) {
354 |         CmdSwitch *pcmd = m_vSwitches[i];
355 | 
356 |         if (pcmd->default_value.length() > 0)
357 |             printf("\t%s, %s\t\t%s\t Default: [%s]\n",
358 |                    pcmd->key.c_str(),
359 |                    pcmd->shortcut.c_str(),
360 |                    pcmd->desc.c_str(),
361 |                    pcmd->default_value.c_str());
362 |         else
363 |             printf("\t%s, %s\t\t%s\n",
364 |                    pcmd->key.c_str(),
365 |                    pcmd->shortcut.c_str(),
366 |                    pcmd->desc.c_str());
367 |     }
368 | }
369 | 
370 | CmdLineParser::CmdSwitch *CmdLineParser::getCmdSwitch(const char *key) {
371 |     string strKey(key);
372 |     if (!starts_with(strKey, "--"))
373 |         strKey = "--" + strKey;
374 | 
375 |     if (m_mapKeySwitch.find(strKey) != m_mapKeySwitch.end())
376 |         return m_mapKeySwitch[strKey];
377 |     else
378 |         return NULL;
379 | }
380 | 
381 | } // namespace utils
382 | } // namespace sda
383 | 


--------------------------------------------------------------------------------
/xrt/includes/cmdparser/cmdlineparser.h:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2019, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | #ifndef CMDLINEPARSER_H_
 30 | #define CMDLINEPARSER_H_
 31 | 
 32 | #include <map>
 33 | #include <string>
 34 | #include <vector>
 35 | 
 36 | using namespace std;
 37 | 
 38 | namespace sda {
 39 | namespace utils {
 40 | 
 41 | bool is_file(const std::string& name);
 42 | 
 43 | /*!
 44 |  * Synopsis:
 45 |  * 1.Parses the command line passed in from the user and stores all enabled
 46 |  *      system options.
 47 |  * 2.Prints help for the user if an option is not valid.
 48 |  * 3.Stores options and provides a mechanism to read those options
 49 |  */
 50 | class CmdLineParser {
 51 | public:
 52 |     class CmdSwitch {
 53 |     public:
 54 |         CmdSwitch() {}
 55 |         CmdSwitch(const CmdSwitch& rhs) {
 56 |             copyfrom(rhs);
 57 |         }
 58 | 
 59 |         void copyfrom(const CmdSwitch& rhs) {
 60 |             this->key = rhs.key;
 61 |             this->shortcut = rhs.shortcut;
 62 |             this->default_value = rhs.default_value;
 63 |             this->value = rhs.value;
 64 |             this->desc = rhs.desc;
 65 |             this->istoggle = rhs.istoggle;
 66 |             this->isvalid = rhs.isvalid;
 67 |         }
 68 | 
 69 |         CmdSwitch& operator=(const CmdSwitch& rhs) {
 70 |             this->copyfrom(rhs);
 71 |             return *this;
 72 |         }
 73 |     public:
 74 |         string key;
 75 |         string shortcut;
 76 |         string default_value;
 77 |         string value;
 78 |         string desc;
 79 |         bool istoggle;
 80 |         bool isvalid;
 81 |     };
 82 | 
 83 | public:
 84 |     CmdLineParser();
 85 |     //CmdLineParser(int argc, char* argv[]);
 86 |     virtual ~CmdLineParser();
 87 | 
 88 | 
 89 |     bool addSwitch(const CmdSwitch& s);
 90 |     bool addSwitch(const string& name, const string& shortcut,
 91 |                     const string& desc, const string& default_value = "",
 92 |                     bool istoggle = false);
 93 | 
 94 |     /*!
 95 |      * sets default key to be able to read a 2 argumented call
 96 |      */
 97 |     bool setDefaultKey(const char* key);
 98 | 
 99 |     /*!
100 |      * parse and store command line
101 |      */
102 |     int parse(int argc, char* argv[]);
103 | 
104 |     /*!
105 |      * retrieve value using a key
106 |      */
107 |     string value(const char* key);
108 | 
109 |     int value_to_int(const char* key);
110 | 
111 | 
112 |     double value_to_double(const char* key);
113 | 
114 |     /*!
115 |      * Returns true if a valid value is supplied by user
116 |      */
117 |     bool isValid(const char* key);
118 | 
119 |     /*!
120 |      * prints the help menu in case the options are not correct.
121 |      */
122 |     virtual void printHelp();
123 | 
124 | protected:
125 |     /*!
126 |      * Retrieve command switch
127 |      */
128 |     CmdSwitch* getCmdSwitch(const char* key);
129 | 
130 |     bool token_to_fullkeyname(const string& token, string& fullkey);
131 | 
132 | 
133 | private:
134 |     map<string, CmdSwitch*> m_mapKeySwitch;
135 |     map<string, string> m_mapShortcutKeys;
136 |     vector<CmdSwitch*> m_vSwitches;
137 |     string m_strDefaultKey;
138 |     string m_appname;
139 | };
140 | 
141 | //bool starts_with(const string& src, const string& sub);
142 | 
143 | }
144 | }
145 | #endif /* CMDLINEPARSER_H_ */
146 | 


--------------------------------------------------------------------------------
/xrt/includes/cmdparser/cmdparser.mk:
--------------------------------------------------------------------------------
# Build variables for the command-line parser helper. All paths are rooted at
# GRAPHLILY_ROOT_PATH, which must be set before this fragment is evaluated.
#   cmdparser_SRCS     - source file to compile
#   cmdparser_HDRS     - header file
#   cmdparser_CXXFLAGS - include path needed to find the header
cmdparser_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/cmdparser/cmdlineparser.cpp
cmdparser_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/cmdparser/cmdlineparser.h
cmdparser_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/cmdparser
4 | 


--------------------------------------------------------------------------------
/xrt/includes/logger/logger.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2019, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | #include "logger.h"
 30 | #include <algorithm>
 31 | #include <fstream>
 32 | #include <functional>
 33 | #include <stdarg.h>
 34 | #include <time.h>
 35 | #ifdef WINDOWS
 36 | #include <direct.h>
 37 | #else
 38 | #include <unistd.h>
 39 | #endif
 40 | 
 41 | using namespace std;
 42 | 
 43 | namespace sda {
 44 | 
 45 | ///////////////////////////////////////////////////////////////////////
 46 | string GetApplicationPath() {
 47 | #ifdef WINDOWS
 48 | #define GetCurrentDir _getcwd
 49 | #else
 50 | #define GetCurrentDir getcwd
 51 | #endif
 52 | 
 53 |     char strCurrentPath[FILENAME_MAX];
 54 | 
 55 |     if (!GetCurrentDir(strCurrentPath, sizeof(strCurrentPath))) {
 56 |         return string("");
 57 |     }
 58 | 
 59 |     /* not really required */
 60 |     strCurrentPath[sizeof(strCurrentPath) - 1] = '\0';
 61 |     return string(strCurrentPath);
 62 | }
 63 | 
 64 | string ToLower(const string &s) {
 65 |     string result = s;
 66 |     std::transform(result.begin(), result.end(), result.begin(), ::tolower);
 67 |     return result;
 68 | }
 69 | 
 70 | string ToUpper(const string &s) {
 71 |     string result = s;
 72 |     std::transform(result.begin(), result.end(), result.begin(), ::toupper);
 73 |     return result;
 74 | }
 75 | 
 76 | string GetTimeStamp() { return ""; }
 77 | 
 78 | // trim from start
 79 | string &ltrim(std::string &s) {
 80 |     s.erase(s.begin(),
 81 |             std::find_if(s.begin(),
 82 |                          s.end(),
 83 |                          std::not1(std::ptr_fun<int, int>(std::isspace))));
 84 |     return s;
 85 | }
 86 | 
 87 | // trim from end
 88 | string &rtrim(std::string &s) {
 89 |     s.erase(std::find_if(s.rbegin(),
 90 |                          s.rend(),
 91 |                          std::not1(std::ptr_fun<int, int>(std::isspace)))
 92 |                 .base(),
 93 |             s.end());
 94 |     return s;
 95 | }
 96 | 
 97 | // trim from both ends
 98 | string &trim(std::string &s) { return ltrim(rtrim(s)); }
 99 | 
// Returns the text after the last '.' of the final path component, or an
// empty string when that component has no '.'.
//
// Previously a name without any dot was returned whole (npos + 1 wraps to 0),
// and a dot inside a directory name ("dir.v1/file") was mistaken for the
// start of an extension.
std::string GetFileExt(const std::string &s) {
    const std::string::size_type dot = s.find_last_of('.');
    if (dot == std::string::npos)
        return std::string();

    // a separator after the last dot means the dot belongs to a directory
    const std::string::size_type sep = s.find_last_of("\\/");
    if (sep != std::string::npos && sep > dot)
        return std::string();

    return s.substr(dot + 1);
}
104 | 
// Strips the directory part and the last extension from a path:
// everything up to the last '/' (or, when there is no '/', the last '\')
// is dropped, then everything from the last '.' onwards.
string GetFileTitleOnly(const string &s) {

    string title = s;

    // find_last_of treats its argument as a character set, so "//" simply
    // matches '/'
    string::size_type cut = title.find_last_of("//");
    if (cut == string::npos)
        cut = title.find_last_of("\\");
    if (cut != string::npos)
        title.erase(0, cut + 1);

    const string::size_type dot = title.find_last_of(".");
    if (dot != string::npos)
        title.erase(dot);

    return title;
}
120 | 
121 | void LogWrapper(int etype, const char *file, int line, const char *desc, ...) {
122 | 
123 |     //crop file name from full path
124 |     string strFileLoc(file);
125 |     strFileLoc = strFileLoc.substr(strFileLoc.find_last_of("\\/") + 1);
126 | 
127 |     string strHeader = "";
128 |     {
129 |         char header[512];
130 |         //source
131 |         switch (etype) {
132 |         case (sda::etError): {
133 |             snprintf(header,
134 |                      sizeof(header),
135 |                      "ERROR: [%s:%d]",
136 |                      strFileLoc.c_str(),
137 |                      line);
138 |             break;
139 |         }
140 |         case (sda::etInfo): {
141 |             snprintf(header,
142 |                      sizeof(header),
143 |                      "INFO: [%s:%d]",
144 |                      strFileLoc.c_str(),
145 |                      line);
146 |             break;
147 |         }
148 |         case (sda::etWarning): {
149 |             snprintf(header,
150 |                      sizeof(header),
151 |                      "WARN: [%s:%d]",
152 |                      strFileLoc.c_str(),
153 |                      line);
154 |             break;
155 |         }
156 |         }
157 |         strHeader = string(header);
158 |     }
159 | 
160 |     //time
161 |     string strTime = "";
162 | #ifdef ENABLE_LOG_TIME
163 |     {
164 |         time_t rawtime;
165 |         time(&rawtime);
166 | #ifdef ENABLE_SECURE_API
167 |         char buffer[64];
168 |         struct tm timeinfo;
169 |         localtime_s(&timeinfo, &rawtime);
170 |         asctime_s(timeinfo, buffer, sizeof(buffer))
171 |             snprintf(buffer, sizeof(buffer), "TIME: [%s]", asctime(timeinfo));
172 |         strTime = string(buffer);
173 | #else
174 |         char buffer[64];
175 |         struct tm *timeinfo = localtime(&rawtime);
176 |         string temp = string(asctime(timeinfo));
177 |         temp = trim(temp);
178 | 
179 |         //        strftime(buffer, sizeof(buffer), "TIME: []")
180 |         snprintf(buffer, sizeof(buffer), "TIME: [%s]", temp.c_str());
181 |         strTime = string(buffer);
182 | #endif
183 |     }
184 | #endif
185 | 
186 |     //format the message itself
187 |     string strMsg = "";
188 |     {
189 |         char msg[512];
190 |         va_list args;
191 |         va_start(args, desc);
192 |         vsnprintf(msg, sizeof(msg), desc, args);
193 |         va_end(args);
194 |         strMsg = string(msg);
195 |     }
196 | 
197 |     //combine
198 |     string strOut =
199 |         strHeader + string(" ") + strTime + string(" ") + strMsg + string("\n");
200 | 
201 |     //display
202 |     cout << strOut;
203 | 
204 |     //store
205 | #ifdef ENABLE_LOG_TOFILE
206 |     std::ofstream outfile;
207 |     outfile.open("benchapp.log", std::ios_base::app);
208 |     outfile << strOut;
209 | #endif
210 | 
211 |     return;
212 | }
213 | 
214 | } // namespace sda
215 | 


--------------------------------------------------------------------------------
/xrt/includes/logger/logger.h:
--------------------------------------------------------------------------------
 1 | /**********
 2 | Copyright (c) 2019, Xilinx, Inc.
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without modification,
 6 | are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice,
 9 | this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its contributors
16 | may be used to endorse or promote products derived from this software
17 | without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | **********/
29 | #ifndef LOGGER_H_
30 | #define LOGGER_H_
31 | 
32 | #include <iomanip>
33 | #include <iostream>
34 | #include <string>
35 | #include <vector>
36 | 
37 | 
38 | #define ENABLE_LOG_TOFILE 1
39 | #define ENABLE_LOG_TIME 1
40 | 
//global logging
// Convenience wrappers around sda::LogWrapper that add the call site's
// __FILE__ and __LINE__. The leading integer is the severity and lines up
// with the sda::LOGTYPE enumerators declared in this header
// (etInfo = 0, etWarning = 1, etError = 2). `desc` is a printf-style format
// string; `##__VA_ARGS__` (a compiler extension) drops the preceding comma
// when no further arguments are given.
#define LogInfo(desc, ...) sda::LogWrapper(0, __FILE__, __LINE__, desc, ##__VA_ARGS__)
#define LogWarn(desc, ...) sda::LogWrapper(1, __FILE__, __LINE__, desc, ##__VA_ARGS__)
#define LogError(desc, ...) sda::LogWrapper(2, __FILE__, __LINE__, desc, ##__VA_ARGS__)
45 | 
46 | using namespace std;
47 | 
48 | namespace sda {
49 | 
    // Severity of a log record. The enumerators take the implicit values
    // 0, 1 and 2 in declaration order.
    enum LOGTYPE {etInfo, etWarning, etError};
51 | 
52 |     //string
53 |     string& ltrim(string& s);
54 |     string& rtrim(string& s);
55 |     string& trim(string& s);
56 |     string GetFileExt(const string& s);
57 |     string GetFileTitleOnly(const string& s);
58 | 
59 |     string ToLower(const string& s);
60 |     string ToUpper(const string& s);
61 | 
62 |     //time
63 |     string GetTimeStamp();
64 | 
65 |     //paths
66 |     string GetApplicationPath();
67 | 
68 | 
69 |     //debug
70 |     template<typename T>
71 |     void PrintPOD(const vector<T>& pod, size_t display_count = 0, const int precision = 4) {
72 | 
73 |         size_t count = pod.size();
74 |         if(display_count > 0)
75 |             count = std::min<size_t>(pod.size(), display_count);
76 | 
77 |         for(size_t i = 0; i < count; i++) {
78 |             cout << std::setprecision(precision) << pod[i] << ", ";
79 |         }
80 |         cout << endl;
81 |     }
82 | 
83 |     //logging
84 |     void LogWrapper(int etype, const char* file, int line, const char* desc, ...);
85 | 
86 | }
87 | 
88 | 
89 | 
90 | #endif /* LOGGER_H_ */
91 | 


--------------------------------------------------------------------------------
/xrt/includes/logger/logger.mk:
--------------------------------------------------------------------------------
# Build variables for the logger helper. All paths are rooted at
# GRAPHLILY_ROOT_PATH, which must be set before this fragment is evaluated.
#   logger_SRCS     - source file to compile
#   logger_HDRS     - header file
#   logger_CXXFLAGS - include path needed to find the header
logger_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/logger/logger.cpp
logger_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/logger/logger.h
logger_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/logger
4 | 


--------------------------------------------------------------------------------
/xrt/includes/oclHelper/oclErrorCodes.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2019, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | #include <map>
 30 | #include <string>
 31 | 
 32 | #include <CL/cl.h>
 33 | 
 34 | #define TO_STRING(x) #x
 35 | 
 36 | static const std::pair<cl_int, std::string> map_pairs[] = {
 37 |     std::make_pair(CL_SUCCESS, TO_STRING(CL_SUCCESS)),
 38 |     std::make_pair(CL_DEVICE_NOT_FOUND, TO_STRING(CL_DEVICE_NOT_FOUND)),
 39 |     std::make_pair(CL_DEVICE_NOT_AVAILABLE, TO_STRING(CL_DEVICE_NOT_AVAILABLE)),
 40 |     std::make_pair(CL_COMPILER_NOT_AVAILABLE,
 41 |                    TO_STRING(CL_COMPILER_NOT_AVAILABLE)),
 42 |     std::make_pair(CL_MEM_OBJECT_ALLOCATION_FAILURE,
 43 |                    TO_STRING(CL_MEM_OBJECT_ALLOCATION_FAILURE)),
 44 |     std::make_pair(CL_OUT_OF_RESOURCES, TO_STRING(CL_OUT_OF_RESOURCES)),
 45 |     std::make_pair(CL_OUT_OF_HOST_MEMORY, TO_STRING(CL_OUT_OF_HOST_MEMORY)),
 46 |     std::make_pair(CL_PROFILING_INFO_NOT_AVAILABLE,
 47 |                    TO_STRING(CL_PROFILING_INFO_NOT_AVAILABLE)),
 48 |     std::make_pair(CL_MEM_COPY_OVERLAP, TO_STRING(CL_MEM_COPY_OVERLAP)),
 49 |     std::make_pair(CL_IMAGE_FORMAT_MISMATCH,
 50 |                    TO_STRING(CL_IMAGE_FORMAT_MISMATCH)),
 51 |     std::make_pair(CL_IMAGE_FORMAT_NOT_SUPPORTED,
 52 |                    TO_STRING(CL_IMAGE_FORMAT_NOT_SUPPORTED)),
 53 |     std::make_pair(CL_BUILD_PROGRAM_FAILURE,
 54 |                    TO_STRING(CL_BUILD_PROGRAM_FAILURE)),
 55 |     std::make_pair(CL_MAP_FAILURE, TO_STRING(CL_MAP_FAILURE)),
 56 |     std::make_pair(CL_MISALIGNED_SUB_BUFFER_OFFSET,
 57 |                    TO_STRING(CL_MISALIGNED_SUB_BUFFER_OFFSET)),
 58 |     std::make_pair(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST,
 59 |                    TO_STRING(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_W)),
 60 |     std::make_pair(CL_INVALID_VALUE, TO_STRING(CL_INVALID_VALUE)),
 61 |     std::make_pair(CL_INVALID_DEVICE_TYPE, TO_STRING(CL_INVALID_DEVICE_TYPE)),
 62 |     std::make_pair(CL_INVALID_PLATFORM, TO_STRING(CL_INVALID_PLATFORM)),
 63 |     std::make_pair(CL_INVALID_DEVICE, TO_STRING(CL_INVALID_DEVICE)),
 64 |     std::make_pair(CL_INVALID_CONTEXT, TO_STRING(CL_INVALID_CONTEXT)),
 65 |     std::make_pair(CL_INVALID_QUEUE_PROPERTIES,
 66 |                    TO_STRING(CL_INVALID_QUEUE_PROPERTIES)),
 67 |     std::make_pair(CL_INVALID_COMMAND_QUEUE,
 68 |                    TO_STRING(CL_INVALID_COMMAND_QUEUE)),
 69 |     std::make_pair(CL_INVALID_HOST_PTR, TO_STRING(CL_INVALID_HOST_PTR)),
 70 |     std::make_pair(CL_INVALID_MEM_OBJECT, TO_STRING(CL_INVALID_MEM_OBJECT)),
 71 |     std::make_pair(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
 72 |                    TO_STRING(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR)),
 73 |     std::make_pair(CL_INVALID_IMAGE_SIZE, TO_STRING(CL_INVALID_IMAGE_SIZE)),
 74 |     std::make_pair(CL_INVALID_SAMPLER, TO_STRING(CL_INVALID_SAMPLER)),
 75 |     std::make_pair(CL_INVALID_BINARY, TO_STRING(CL_INVALID_BINARY)),
 76 |     std::make_pair(CL_INVALID_BUILD_OPTIONS,
 77 |                    TO_STRING(CL_INVALID_BUILD_OPTIONS)),
 78 |     std::make_pair(CL_INVALID_PROGRAM, TO_STRING(CL_INVALID_PROGRAM)),
 79 |     std::make_pair(CL_INVALID_PROGRAM_EXECUTABLE,
 80 |                    TO_STRING(CL_INVALID_PROGRAM_EXECUTABLE)),
 81 |     std::make_pair(CL_INVALID_KERNEL_NAME, TO_STRING(CL_INVALID_KERNEL_NAME)),
 82 |     std::make_pair(CL_INVALID_KERNEL_DEFINITION,
 83 |                    TO_STRING(CL_INVALID_KERNEL_DEFINITION)),
 84 |     std::make_pair(CL_INVALID_KERNEL, TO_STRING(CL_INVALID_KERNEL)),
 85 |     std::make_pair(CL_INVALID_ARG_INDEX, TO_STRING(CL_INVALID_ARG_INDEX)),
 86 |     std::make_pair(CL_INVALID_ARG_VALUE, TO_STRING(CL_INVALID_ARG_VALUE)),
 87 |     std::make_pair(CL_INVALID_ARG_SIZE, TO_STRING(CL_INVALID_ARG_SIZE)),
 88 |     std::make_pair(CL_INVALID_KERNEL_ARGS, TO_STRING(CL_INVALID_KERNEL_ARGS)),
 89 |     std::make_pair(CL_INVALID_WORK_DIMENSION,
 90 |                    TO_STRING(CL_INVALID_WORK_DIMENSION)),
 91 |     std::make_pair(CL_INVALID_WORK_GROUP_SIZE,
 92 |                    TO_STRING(CL_INVALID_WORK_GROUP_SIZE)),
 93 |     std::make_pair(CL_INVALID_WORK_ITEM_SIZE,
 94 |                    TO_STRING(CL_INVALID_WORK_ITEM_SIZE)),
 95 |     std::make_pair(CL_INVALID_GLOBAL_OFFSET,
 96 |                    TO_STRING(CL_INVALID_GLOBAL_OFFSET)),
 97 |     std::make_pair(CL_INVALID_EVENT_WAIT_LIST,
 98 |                    TO_STRING(CL_INVALID_EVENT_WAIT_LIST)),
 99 |     std::make_pair(CL_INVALID_EVENT, TO_STRING(CL_INVALID_EVENT)),
100 |     std::make_pair(CL_INVALID_OPERATION, TO_STRING(CL_INVALID_OPERATION)),
101 |     std::make_pair(CL_INVALID_GL_OBJECT, TO_STRING(CL_INVALID_GL_OBJECT)),
102 |     std::make_pair(CL_INVALID_BUFFER_SIZE, TO_STRING(CL_INVALID_BUFFER_SIZE)),
103 |     std::make_pair(CL_INVALID_MIP_LEVEL, TO_STRING(CL_INVALID_MIP_LEVEL)),
104 |     std::make_pair(CL_INVALID_GLOBAL_WORK_SIZE,
105 |                    TO_STRING(CL_INVALID_GLOBAL_WORK_SIZE)),
106 |     std::make_pair(CL_INVALID_PROPERTY, TO_STRING(CL_INVALID_PROPERTY))};
107 | 
// Lookup table from OpenCL status code to its symbolic name, constructed
// from the map_pairs array; the end pointer is derived from the array's
// element count.
static const std::map<cl_int, std::string>
    oclErrorCodes(map_pairs,
                  map_pairs + sizeof(map_pairs) / sizeof(map_pairs[0]));
111 | 
112 | const char *oclErrorCode(cl_int code) {
113 |     std::map<cl_int, std::string>::const_iterator iter =
114 |         oclErrorCodes.find(code);
115 |     if (iter == oclErrorCodes.end())
116 |         return "UNKNOWN ERROR";
117 |     else
118 |         return iter->second.c_str();
119 | }
120 | 


--------------------------------------------------------------------------------
/xrt/includes/oclHelper/oclHelper.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2019, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | #include "oclHelper.h"
 30 | #include <cstring>
 31 | #include <fstream>
 32 | #include <iostream>
 33 | #include <vector>
 34 | 
 35 | //
 36 | // Load file to memory
 37 | //
 38 | static int loadFile2Memory(const char *filename, char **result) {
 39 |     int size = 0;
 40 | 
 41 |     std::ifstream stream(filename, std::ifstream::binary);
 42 |     if (!stream) {
 43 |         return -1;
 44 |     }
 45 | 
 46 |     stream.seekg(0, stream.end);
 47 |     size = stream.tellg();
 48 |     stream.seekg(0, stream.beg);
 49 | 
 50 |     *result = new char[size + 1];
 51 |     stream.read(*result, size);
 52 |     if (!stream) {
 53 |         return -2;
 54 |     }
 55 |     stream.close();
 56 |     (*result)[size] = 0;
 57 |     return size;
 58 | }
 59 | 
 60 | //
 61 | // Get device version
 62 | //
 63 | static void getDeviceVersion(oclHardware &hardware) {
 64 |     char versionString[512];
 65 |     size_t size = 0;
 66 |     cl_int err = clGetDeviceInfo(
 67 |         hardware.mDevice, CL_DEVICE_VERSION, 511, versionString, &size);
 68 |     if (err != CL_SUCCESS) {
 69 |         std::cout << oclErrorCode(err) << "\n";
 70 |         return;
 71 |     }
 72 |     unsigned major = 0;
 73 |     unsigned minor = 0;
 74 |     unsigned state = 0;
 75 |     for (size_t i = 0; i < size; i++) {
 76 |         if (!versionString[i]) {
 77 |             break;
 78 |         }
 79 |         if (versionString[i] == ' ') {
 80 |             state++;
 81 |             continue;
 82 |         }
 83 |         if (versionString[i] == '.') {
 84 |             state++;
 85 |             continue;
 86 |         }
 87 |         if (state == 0) {
 88 |             continue;
 89 |         }
 90 |         if (state == 1) {
 91 |             major *= 10;
 92 |             major += (versionString[i] - '0');
 93 |             continue;
 94 |         }
 95 |         if (state == 2) {
 96 |             minor *= 10;
 97 |             minor += (versionString[i] - '0');
 98 |             continue;
 99 |         }
100 |         break;
101 |     }
102 |     hardware.mMajorVersion = major;
103 |     hardware.mMinorVersion = minor;
104 | }
105 | 
106 | //
107 | // Get OCL hardware
108 | //
109 | oclHardware getOclHardware(cl_device_type type) {
110 |     oclHardware hardware = {0, 0, 0, 0, 0, 0};
111 |     cl_platform_id platforms[16] = {0};
112 |     cl_device_id devices[16];
113 |     char platformName[256];
114 |     char deviceName[256];
115 |     cl_uint platformCount = 0;
116 |     cl_int err = clGetPlatformIDs(0, 0, &platformCount);
117 |     err = clGetPlatformIDs(16, platforms, &platformCount);
118 |     if (err != CL_SUCCESS) {
119 |         std::cout << oclErrorCode(err) << "\n";
120 |         return hardware;
121 |     }
122 | 
123 |     for (cl_uint i = 0; i < platformCount; i++) {
124 |         err = clGetPlatformInfo(
125 |             platforms[i], CL_PLATFORM_NAME, 256, platformName, 0);
126 |         if (err != CL_SUCCESS) {
127 |             std::cout << oclErrorCode(err) << "\n";
128 |             return hardware;
129 |         }
130 |         cl_uint deviceCount = 0;
131 |         err = clGetDeviceIDs(platforms[i], type, 16, devices, &deviceCount);
132 |         if ((err != CL_SUCCESS) || (deviceCount == 0)) {
133 |             continue;
134 |         }
135 | 
136 |         err = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 256, deviceName, 0);
137 |         if (err != CL_SUCCESS) {
138 |             std::cout << oclErrorCode(err) << "\n";
139 |             return hardware;
140 |         }
141 | 
142 |         cl_context_properties contextData[3] = {
143 |             CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0};
144 |         cl_context context =
145 |             clCreateContextFromType(contextData, type, 0, 0, &err);
146 |         if (err != CL_SUCCESS) {
147 |             continue;
148 |         }
149 |         cl_command_queue queue =
150 |             clCreateCommandQueue(context, devices[0], 0, &err);
151 |         if (err != CL_SUCCESS) {
152 |             std::cout << oclErrorCode(err) << "\n";
153 |             return hardware;
154 |         }
155 | 
156 |         hardware.mPlatform = platforms[i];
157 |         hardware.mContext = context;
158 |         hardware.mDevice = devices[0];
159 |         hardware.mQueue = queue;
160 |         getDeviceVersion(hardware);
161 |         std::cout << "Platform = " << platformName << "\n";
162 |         std::cout << "Device = " << deviceName << "\n";
163 |         std::cout << "OpenCL Version = " << hardware.mMajorVersion << '.'
164 |                   << hardware.mMinorVersion << "\n";
165 |         return hardware;
166 |     }
167 |     return hardware;
168 | }
169 | 
170 | //
171 | // Get OCL software
172 | //
173 | int getOclSoftware(oclSoftware &software, const oclHardware &hardware) {
174 |     cl_device_type deviceType = CL_DEVICE_TYPE_DEFAULT;
175 |     cl_int err = clGetDeviceInfo(
176 |         hardware.mDevice, CL_DEVICE_TYPE, sizeof(deviceType), &deviceType, 0);
177 |     if (err != CL_SUCCESS) {
178 |         std::cout << oclErrorCode(err) << "\n";
179 |         return -1;
180 |     }
181 | 
182 |     unsigned char *kernelCode = 0;
183 |     std::cout << "Loading " << software.mFileName << "\n";
184 | 
185 |     int size = loadFile2Memory(software.mFileName, (char **)&kernelCode);
186 |     if (size < 0) {
187 |         std::cout << "Failed to load kernel\n";
188 |         return -2;
189 |     }
190 | 
191 |     if (deviceType == CL_DEVICE_TYPE_ACCELERATOR) {
192 |         size_t n = size;
193 |         software.mProgram =
194 |             clCreateProgramWithBinary(hardware.mContext,
195 |                                       1,
196 |                                       &hardware.mDevice,
197 |                                       &n,
198 |                                       (const unsigned char **)&kernelCode,
199 |                                       0,
200 |                                       &err);
201 |     } else {
202 |         software.mProgram = clCreateProgramWithSource(
203 |             hardware.mContext, 1, (const char **)&kernelCode, 0, &err);
204 |     }
205 |     if (!software.mProgram || (err != CL_SUCCESS)) {
206 |         std::cout << oclErrorCode(err) << "\n";
207 |         return -3;
208 |     }
209 | 
210 |     software.mKernel =
211 |         clCreateKernel(software.mProgram, software.mKernelName, NULL);
212 |     if (software.mKernel == 0) {
213 |         std::cout << oclErrorCode(err) << "\n";
214 |         return -4;
215 |     }
216 | 
217 |     delete[] kernelCode;
218 |     return 0;
219 | }
220 | 
221 | //
222 | // Release software and hardware
223 | //
224 | void release(oclSoftware &software) {
225 |     clReleaseKernel(software.mKernel);
226 |     clReleaseProgram(software.mProgram);
227 | }
228 | 
229 | void release(oclHardware &hardware) {
230 |     clReleaseCommandQueue(hardware.mQueue);
231 |     clReleaseContext(hardware.mContext);
232 |     if ((hardware.mMajorVersion >= 1) && (hardware.mMinorVersion > 1)) {
233 |         // Only available in OpenCL >= 1.2
234 |         clReleaseDevice(hardware.mDevice);
235 |     }
236 | }
237 | 


--------------------------------------------------------------------------------
/xrt/includes/oclHelper/oclHelper.h:
--------------------------------------------------------------------------------
 1 | /**********
 2 | Copyright (c) 2019, Xilinx, Inc.
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without modification,
 6 | are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice,
 9 | this list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 | this list of conditions and the following disclaimer in the documentation
13 | and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its contributors
16 | may be used to endorse or promote products derived from this software
17 | without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | **********/
29 | #ifndef _OCL_HELP_H_
30 | #define _OCL_HELP_H_
31 | 
32 | #include <CL/cl.h>
33 | 
// Handles for the OpenCL platform/device selected by getOclHardware(),
// together with the device's parsed OpenCL version. getOclHardware()
// initialises this struct positionally, so the member order matters.
struct oclHardware {
    cl_platform_id mPlatform;   // platform the device was found on
    cl_context mContext;        // context created for that platform
    cl_device_id mDevice;       // first device of the requested type
    cl_command_queue mQueue;    // command queue created on mDevice
    short mMajorVersion;        // major number parsed from CL_DEVICE_VERSION
    short mMinorVersion;        // minor number parsed from CL_DEVICE_VERSION
};
42 | 
// Program/kernel pair managed by getOclSoftware() and release().
// The caller fills in the name fields; getOclSoftware() fills in the handles.
struct oclSoftware {
    cl_program mProgram;         // program created from mFileName
    cl_kernel mKernel;           // kernel mKernelName created from mProgram
    char mKernelName[128];       // name of the kernel to create
    char mFileName[1024];        // path of the kernel source or binary file
    char mCompileOptions[1024];  // NOTE(review): not read by oclHelper.cpp as shown - confirm intended use
};
50 | 
// Finds the first OpenCL platform that offers a device of `type` and returns
// its platform, context, first device, command queue and OpenCL version.
// All fields of the result are zero when nothing usable was found or an
// OpenCL call failed.
oclHardware getOclHardware(cl_device_type type);

// Loads software.mFileName and creates software.mProgram / software.mKernel
// for the device in `hardware`. Returns 0 on success, or -1 (device query),
// -2 (file load), -3 (program creation) or -4 (kernel creation) on failure.
int getOclSoftware(oclSoftware &software, const oclHardware &hardware);

// Releases the kernel and program held by `software`.
void release(oclSoftware& software);

// Releases the command queue and context held by `hardware`; the device is
// released too when its reported OpenCL version supports that call.
void release(oclHardware& hardware);

// Returns printable text for the OpenCL status `code`. Defined outside this
// header (presumably in oclErrorCodes.cpp, which oclHelper.mk lists - confirm).
const char *oclErrorCode(cl_int code);
60 | 
61 | #endif
62 | 


--------------------------------------------------------------------------------
/xrt/includes/oclHelper/oclHelper.mk:
--------------------------------------------------------------------------------
# Build variables for the oclHelper host-side helper library.
# GRAPHLILY_ROOT_PATH is not defined here; it must be provided by the
# including Makefile or the environment.
# oclHelper_SRCS: source files, oclHelper_HDRS: headers,
# oclHelper_CXXFLAGS: include path for the headers.
oclHelper_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper/oclHelper.cpp ${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper/oclErrorCodes.cpp
oclHelper_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper/oclHelper.h
oclHelper_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/oclHelper
4 | 


--------------------------------------------------------------------------------
/xrt/includes/opencl/opencl.mk:
--------------------------------------------------------------------------------
# Definition of include file locations
# Inputs read from the including Makefile or the environment: XILINX_XRT,
# XILINX_VIVADO, HOST_ARCH and, when HOST_ARCH is not x86, SYSROOT.
# Outputs: opencl_CXXFLAGS and opencl_LDFLAGS.

# Root of the XRT installation; taken from the sysroot for non-x86 hosts.
xrt_path = $(XILINX_XRT)
ifneq ($(HOST_ARCH), x86)
	xrt_path =  $(SYSROOT)/usr/
endif

# OpenCL headers live in include/ on x86 and in include/xrt otherwise.
OPENCL_INCLUDE:= $(xrt_path)/include
ifneq ($(HOST_ARCH), x86)
	OPENCL_INCLUDE:= $(xrt_path)/include/xrt
endif

# Compiler and linker flags for host code that uses OpenCL.
VIVADO_INCLUDE:= $(XILINX_VIVADO)/include
opencl_CXXFLAGS=-I$(OPENCL_INCLUDE) -I$(VIVADO_INCLUDE)
OPENCL_LIB:= $(xrt_path)/lib
opencl_LDFLAGS=-L$(OPENCL_LIB) -lOpenCL -lpthread 
16 | 


--------------------------------------------------------------------------------
/xrt/includes/xcl2/xcl2.cpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2019, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | 
 30 | #include "xcl2.hpp"
 31 | #include <limits.h>
 32 | #include <sys/stat.h>
 33 | #include <unistd.h>
 34 | 
 35 | namespace xcl {
 36 | std::vector<cl::Device> get_devices(const std::string &vendor_name) {
 37 |     size_t i;
 38 |     cl_int err;
 39 |     std::vector<cl::Platform> platforms;
 40 |     OCL_CHECK(err, err = cl::Platform::get(&platforms));
 41 |     cl::Platform platform;
 42 |     for (i = 0; i < platforms.size(); i++) {
 43 |         platform = platforms[i];
 44 |         OCL_CHECK(err,
 45 |                   std::string platformName =
 46 |                       platform.getInfo<CL_PLATFORM_NAME>(&err));
 47 |         if (platformName == vendor_name) {
 48 |             std::cout << "Found Platform" << std::endl;
 49 |             std::cout << "Platform Name: " << platformName.c_str() << std::endl;
 50 |             break;
 51 |         }
 52 |     }
 53 |     if (i == platforms.size()) {
 54 |         std::cout << "Error: Failed to find Xilinx platform" << std::endl;
 55 |         exit(EXIT_FAILURE);
 56 |     }
 57 |     //Getting ACCELERATOR Devices and selecting 1st such device
 58 |     std::vector<cl::Device> devices;
 59 |     OCL_CHECK(err,
 60 |               err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
 61 |     return devices;
 62 | }
 63 | 
 64 | std::vector<cl::Device> get_xil_devices() { return get_devices("Xilinx"); }
 65 | 
 66 | std::vector<unsigned char>
 67 | read_binary_file(const std::string &xclbin_file_name) {
 68 |     std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
 69 | 
 70 |     if (access(xclbin_file_name.c_str(), R_OK) != 0) {
 71 |         printf("ERROR: %s xclbin not available please build\n",
 72 |                xclbin_file_name.c_str());
 73 |         exit(EXIT_FAILURE);
 74 |     }
 75 |     //Loading XCL Bin into char buffer
 76 |     std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
 77 |     std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
 78 |     bin_file.seekg(0, bin_file.end);
 79 |     auto nb = bin_file.tellg();
 80 |     bin_file.seekg(0, bin_file.beg);
 81 |     std::vector<unsigned char> buf;
 82 |     buf.resize(nb);
 83 |     bin_file.read(reinterpret_cast<char *>(buf.data()), nb);
 84 |     return buf;
 85 | }
 86 | 
 87 | bool is_emulation() {
 88 |     bool ret = false;
 89 |     char *xcl_mode = getenv("XCL_EMULATION_MODE");
 90 |     if (xcl_mode != NULL) {
 91 |         ret = true;
 92 |     }
 93 |     return ret;
 94 | }
 95 | 
 96 | bool is_hw_emulation() {
 97 |     bool ret = false;
 98 |     char *xcl_mode = getenv("XCL_EMULATION_MODE");
 99 |     if ((xcl_mode != NULL) && !strcmp(xcl_mode, "hw_emu")) {
100 |         ret = true;
101 |     }
102 |     return ret;
103 | }
104 | 
// True when `device_name` contains the substring "xpr".
bool is_xpr_device(const char *device_name) {
    return strstr(device_name, "xpr") != NULL;
}
114 | }; // namespace xcl
115 | 


--------------------------------------------------------------------------------
/xrt/includes/xcl2/xcl2.hpp:
--------------------------------------------------------------------------------
  1 | /**********
  2 | Copyright (c) 2018, Xilinx, Inc.
  3 | All rights reserved.
  4 | 
  5 | Redistribution and use in source and binary forms, with or without modification,
  6 | are permitted provided that the following conditions are met:
  7 | 
  8 | 1. Redistributions of source code must retain the above copyright notice,
  9 | this list of conditions and the following disclaimer.
 10 | 
 11 | 2. Redistributions in binary form must reproduce the above copyright notice,
 12 | this list of conditions and the following disclaimer in the documentation
 13 | and/or other materials provided with the distribution.
 14 | 
 15 | 3. Neither the name of the copyright holder nor the names of its contributors
 16 | may be used to endorse or promote products derived from this software
 17 | without specific prior written permission.
 18 | 
 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 28 | **********/
 29 | 
 30 | 
 31 | #pragma once
 32 | 
 33 | #define CL_HPP_CL_1_2_DEFAULT_BUILD
 34 | #define CL_HPP_TARGET_OPENCL_VERSION 120
 35 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 36 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
 37 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS
 38 | 
// OCL_CHECK(error, call): executes `call`, then tests `error`; if it is not
// CL_SUCCESS, prints the file, line, call text and error code and exits the
// process with EXIT_FAILURE.
//
// The expansion is a bare statement sequence, not a do { } while (0) block:
// xcl2.cpp passes a variable declaration as `call` and uses that variable
// after the macro. As a consequence OCL_CHECK must not be used as the body
// of an unbraced if/else/loop.
//
// OCL_CHECK doesn't work if call has templatized function call: a comma
// outside parentheses (e.g. in a template argument list with several
// arguments) splits `call` into extra macro arguments.
#define OCL_CHECK(error,call)                                       \
    call;                                                           \
    if (error != CL_SUCCESS) {                                      \
      printf("%s:%d Error calling " #call ", error code is: %d\n",  \
              __FILE__,__LINE__, error);                            \
      exit(EXIT_FAILURE);                                           \
    }                                       
 47 | 
 48 | #include <CL/cl2.hpp>
 49 | #include <iostream>
 50 | #include <fstream>
 51 | #include <CL/cl_ext_xilinx.h>
 52 | // When creating a buffer with user pointer (CL_MEM_USE_HOST_PTR), under the hood
 53 | // User ptr is used if and only if it is properly aligned (page aligned). When not 
 54 | // aligned, runtime has no choice but to create its own host side buffer that backs
 55 | // user ptr. This in turn implies that all operations that move data to and from 
 56 | // device incur an extra memcpy to move data to/from runtime's own host buffer 
 57 | // from/to user pointer. So it is recommended to use this allocator if user wish to
 58 | // Create Buffer/Memory Object with CL_MEM_USE_HOST_PTR to align user buffer to the
 59 | // page boundary. It will ensure that user buffer will be used when user create 
 60 | // Buffer/Mem Object with CL_MEM_USE_HOST_PTR.
 61 | template <typename T>
 62 | struct aligned_allocator
 63 | {
 64 |   using value_type = T;
 65 |   T* allocate(std::size_t num)
 66 |   {
 67 |     void* ptr = nullptr;
 68 |     if (posix_memalign(&ptr,4096,num*sizeof(T)))
 69 |       throw std::bad_alloc();
 70 |     return reinterpret_cast<T*>(ptr);
 71 |   }
 72 |   void deallocate(T* p, std::size_t num)
 73 |   {
 74 |     free(p);
 75 |   }
 76 | };
 77 | 
 78 | namespace xcl {
 79 |   std::vector<cl::Device> get_xil_devices();
 80 |   std::vector<cl::Device> get_devices(const std::string& vendor_name);
 81 |   std::vector<unsigned char> read_binary_file(const std::string &xclbin_file_name); 
 82 |   bool is_emulation ();
 83 |   bool is_hw_emulation ();
 84 |   bool is_xpr_device (const char *device_name);
 85 |     class Stream{
 86 |       public:
 87 |         static decltype(&clCreateStream) createStream;
 88 |         static decltype(&clReleaseStream) releaseStream;
 89 |         static decltype(&clReadStream) readStream;
 90 |         static decltype(&clWriteStream) writeStream;
 91 |         static decltype(&clPollStreams) pollStreams;
 92 |         static void init(const cl_platform_id& platform) {
 93 |             void *bar = clGetExtensionFunctionAddressForPlatform(platform, "clCreateStream");
 94 |             createStream = (decltype(&clCreateStream))bar;
 95 |             bar = clGetExtensionFunctionAddressForPlatform(platform, "clReleaseStream");
 96 |             releaseStream = (decltype(&clReleaseStream))bar;
 97 |             bar = clGetExtensionFunctionAddressForPlatform(platform, "clReadStream");
 98 |             readStream = (decltype(&clReadStream))bar;
 99 |             bar = clGetExtensionFunctionAddressForPlatform(platform, "clWriteStream");
100 |             writeStream = (decltype(&clWriteStream))bar;
101 |             bar = clGetExtensionFunctionAddressForPlatform(platform, "clPollStreams");
102 |             pollStreams = (decltype(&clPollStreams))bar;
103 |         }
104 |     };
105 | }
106 | 


--------------------------------------------------------------------------------
/xrt/includes/xcl2/xcl2.mk:
--------------------------------------------------------------------------------
# Build variables for the xcl2 host-side helper library.
# GRAPHLILY_ROOT_PATH is not defined here; it must be provided by the
# including Makefile or the environment.
# xcl2_SRCS: source file, xcl2_HDRS: header, xcl2_CXXFLAGS: include path.
xcl2_SRCS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/xcl2/xcl2.cpp
xcl2_HDRS:=${GRAPHLILY_ROOT_PATH}/xrt/includes/xcl2/xcl2.hpp

xcl2_CXXFLAGS:=-I${GRAPHLILY_ROOT_PATH}/xrt/includes/xcl2
5 | 


--------------------------------------------------------------------------------