├── .gitmodules ├── _config.yml ├── sssp-cpu-only ├── test │ ├── util_test.cpp │ └── bellman_ford_test.cpp ├── include │ ├── util.hpp │ └── bellman_ford.hpp ├── src │ ├── util.cpp │ ├── main.cpp │ └── bellman_ford.cpp └── CMakeLists.txt ├── sssp-cuda ├── include │ ├── bellman_ford.h │ ├── locks.hpp │ └── util.h ├── CMakeLists.txt └── src │ ├── main.cpp │ ├── util.cpp │ └── bellman_ford.cu ├── data ├── sample-timing.csv └── v10-e20.graph ├── cmake ├── docopt.cmake └── gtest.cmake ├── scripts ├── vis.r └── cpu-timing.py ├── sssp-opencl ├── CMakeLists.txt ├── description.json ├── src │ ├── bf_kernel.cl │ └── host.cpp └── README.md ├── CMakeLists.txt ├── lib └── graph │ ├── src │ └── main.cpp │ ├── CMakeLists.txt │ ├── test │ └── graph_test.cpp │ └── include │ └── graph.hpp ├── LICENSE ├── .travis.yml ├── README.md └── .gitignore /.gitmodules: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /sssp-cpu-only/test/util_test.cpp: -------------------------------------------------------------------------------- 1 | #include "util.hpp" 2 | 3 | #include "gtest/gtest.h" 4 | 5 | TEST(UtilTest, Dummy) { EXPECT_EQ(0, 0); } 6 | -------------------------------------------------------------------------------- /sssp-cuda/include/bellman_ford.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "graph.hpp" 4 | #include 5 | #include 6 | 7 | struct BellmanFordOutput { 8 | std::vector distances; 9 | float elapsed; 10 | }; 11 | 12 | BellmanFordOutput bellmanFordCUDA(const graph::Graph &graph, size_t source); 13 | -------------------------------------------------------------------------------- /data/sample-timing.csv: -------------------------------------------------------------------------------- 1 | version,device,dataset,time 2 | cpu,f1,data1,100 3 | fpga,f1,data1,50 4 | cuda,f1,data1,75 5 | cpu,f1,data2,300 6 | fpga,f1,data2,200 7 | cuda,f1,data2,250 8 | cpu,eldar,data1,200 9 | fpga,eldar,data1,100 10 | cuda,eldar,data1,150 11 | cpu,eldar,data2,250 12 | fpga,eldar,data2,150 13 | cuda,eldar,data2,180 14 | -------------------------------------------------------------------------------- /cmake/docopt.cmake: -------------------------------------------------------------------------------- 1 | include(ExternalProject) 2 | ExternalProject_Add(docopt 3 | GIT_REPOSITORY https://github.com/docopt/docopt.cpp.git 4 | BUILD_COMMAND make docopt_s 5 | INSTALL_COMMAND "") 6 | ExternalProject_Get_Property(docopt binary_dir) 7 | ExternalProject_Get_Property(docopt source_dir) 8 | include_directories(${source_dir}) 9 | add_library(libdocopt STATIC IMPORTED GLOBAL) 10 | add_dependencies(libdocopt docopt) 11 | set_target_properties(libdocopt PROPERTIES 12 | IMPORTED_LOCATION ${binary_dir}/libdocopt.a 13 | ) 14 | -------------------------------------------------------------------------------- /scripts/vis.r: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(reshape2) 3 | 4 | df <- read.csv("~/workspace/sssp-fpga/data/timing.csv") 5 | 6 | # absolute timing graphs per dataset 7 | for(datasetName in levels(df$dataset)) { 8 | print(datasetName) 9 | subdf <- df[df$dataset == datasetName,] 10 | melted <- melt(subdf, id.vars = c('version', 'device'), measure.vars = 'time') 11 | 12 | ggplot(melted, aes(version, value)) + 13 | geom_bar(aes(fill = device), position = 'dodge', stat = 'identity') + 14 | xlab('Algorithm version') + 15 | ylab('Time to complete, ms') + 16 | ggtitle(datasetName) 17 | 18 | 19 | ggsave(paste(datasetName, '.png'), plot = last_plot(), width=10, height=5) 20 | } -------------------------------------------------------------------------------- /data/v10-e20.graph: -------------------------------------------------------------------------------- 1 | 10 2 | 0 5 80.751277 3 | 0 2 40.838119 4 | 0 3 34.157928 5 | 0 6 40.468479 6 | 1 8 79.530401 7 | 1 3 75.323691 8 | 1 6 39.291467 9 | 2 9 77.559261 10 | 2 0 40.838119 11 | 3 5 42.071643 12 | 3 9 12.831511 13 | 3 6 56.097090 14 | 3 5 75.516317 15 | 3 1 75.323691 16 | 3 0 34.157928 17 | 3 9 80.938769 18 | 3 4 21.262413 19 | 4 6 47.300014 20 | 4 3 21.262413 21 | 5 3 42.071643 22 | 5 0 80.751277 23 | 5 3 75.516317 24 | 5 7 50.223190 25 | 5 8 19.259838 26 | 6 3 56.097090 27 | 6 4 47.300014 28 | 6 8 57.464082 29 | 6 1 39.291467 30 | 6 0 40.468479 31 | 7 9 89.752116 32 | 7 5 50.223190 33 | 8 1 79.530401 34 | 8 6 57.464082 35 | 8 9 81.497697 36 | 8 5 19.259838 37 | 9 3 12.831511 38 | 9 2 77.559261 39 | 9 7 89.752116 40 | 9 8 81.497697 41 | 9 3 80.938769 42 | -------------------------------------------------------------------------------- /sssp-opencl/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project(SSSP_OPENCL LANGUAGES CXX) 3 | 4 | ## C++ compiler options 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra") 8 | set(CMAKE_CXX_EXTENSIONS OFF) 9 | 10 | ## Sources and headers 11 | include_directories(include) 12 | set(SOURCES src/main.cpp) 13 | set(MAIN_EXE_NAME sssp_opencl) 14 | 15 | ## OpenCL 16 | find_package(OpenCL REQUIRED) 17 | include_directories(${OpenCL_INCLUDE_DIRS}) 18 | link_directories(${OpenCL_LIBRARY}) 19 | 20 | ## Generate executable 21 | add_executable(${MAIN_EXE_NAME} src/main.cpp) 22 | target_link_libraries(${MAIN_EXE_NAME} OpenCL::OpenCL) 23 | install(TARGETS ${MAIN_EXE_NAME} RUNTIME DESTINATION bin) 24 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project(SSSP) 3 | 4 | # Load CMAKE configuration from environment variables 5 | set(CMAKE_MODULE_PATH $ENV{CMAKE_MODULE_PATH}) 6 | set(CMAKE_PREFIX_PATH $ENV{CMAKE_PREFIX_PATH}) 7 | 8 | # Set build type 9 | if (NOT CMAKE_BUILD_TYPE) 10 | set(CMAKE_BUILD_TYPE "Release" 11 | CACHE STRING "Build configuration 'Release' or 'Debug'." 12 | FORCE) 13 | endif() 14 | 15 | # Third-party dependencies 16 | include(CTest) 17 | file(GLOB cmakes ${PROJECT_SOURCE_DIR}/cmake/*.cmake) 18 | foreach(cmake ${cmakes}) 19 | include(${cmake}) 20 | endforeach(cmake) 21 | 22 | # Subdirectories 23 | add_subdirectory(${PROJECT_SOURCE_DIR}/lib/graph graph) 24 | add_subdirectory(${PROJECT_SOURCE_DIR}/sssp-cpu-only cpu) 25 | # add_subdirectory(${PROJECT_SOURCE_DIR}/sssp-cuda cuda) 26 | # add_subdirectory(${PROJECT_SOURCE_DIR}/sssp-opencl xcl) 27 | -------------------------------------------------------------------------------- /lib/graph/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "graph.hpp" 2 | #include "docopt.h" 3 | #include 4 | 5 | static const char USAGE[] = R"( 6 | Concurrency Final Project. 7 | 8 | Graph generator utility that writes graph binaries. 9 | 10 | Usage: 11 | graph_gen --vertices= --edges= --output= 12 | 13 | Options: 14 | --vertices= Number of vertices 15 | --edges= Number of edges 16 | --output= Path to output file 17 | )"; 18 | 19 | using graph::Graph; 20 | 21 | int main(int argc, const char *argv[]) { 22 | auto args = docopt::docopt(USAGE, {argv + 1, argv + argc}, true); 23 | auto graph = Graph::generateGraph( 24 | args.at("--vertices").asLong(), 25 | args.at("--edges").asLong()); 26 | graph.saveToFile(args.at("--output").asString()); 27 | std::cout << "Graph written successfully." << std::endl; 28 | } 29 | -------------------------------------------------------------------------------- /sssp-cpu-only/include/util.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace util { 8 | 9 | // Used to delineate the boundaries for a thread's work when partitioning a 10 | // dataset. 11 | struct Range { 12 | size_t start; 13 | size_t end; 14 | }; 15 | 16 | // Splits a integral range as evenly as possible among some number of threads, 17 | // returning the indices each thread should work with. 18 | std::vector partition(size_t size, size_t num_threads); 19 | 20 | // Helper synchronization class since std::barrier is still experimental. 21 | class Barrier { 22 | public: 23 | explicit Barrier(size_t count) 24 | : threshold_(count), count_(count), generation_(0) {} 25 | void wait(); 26 | 27 | private: 28 | std::mutex mutex_; 29 | std::condition_variable cond_; 30 | size_t threshold_; 31 | size_t count_; 32 | size_t generation_; 33 | }; 34 | 35 | } /* namespace util */ 36 | -------------------------------------------------------------------------------- /scripts/cpu-timing.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import csv 3 | import re 4 | 5 | DATASRC = 'data/' 6 | DATAFILES = ['v10-e20.graph', 'v10k-e25k.graph', 'v10k-e50k.graph', 'v10k-e100k.graph'] 7 | MAXTHREADS = 16 8 | 9 | DEVICE = 'kierkegaard' 10 | 11 | fout = csv.writer(open('data/timing.csv', 'w')) 12 | fout.writerow(['version', 'device', 'dataset', 'time']) 13 | 14 | for fname in DATAFILES: 15 | print(fname) 16 | for num_threads in range(1, MAXTHREADS+1): 17 | args = ['./build/bin/sssp_cpu_only', \ 18 | '--input='+DATASRC+fname, \ 19 | '--source=0', \ 20 | '--workers='+str(num_threads)] 21 | print(args) 22 | output = subprocess.check_output(args) 23 | last_line = output.split('\n')[-2] 24 | time_elapsed = re.match(r'Elapsed runtime: (\d+)ms', last_line).group(1) 25 | print(time_elapsed, 'ms') 26 | 27 | fout.writerow(['CPUx%02d' % (num_threads,), DEVICE, fname, time_elapsed]) 28 | -------------------------------------------------------------------------------- /sssp-cpu-only/include/bellman_ford.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "graph.hpp" 4 | #include 5 | 6 | namespace sssp { 7 | 8 | // Computes single-source shortest paths for a graph instance and source (vertex 9 | // id) using Bellman-Ford. This algorithm tolerates negative-weight edges. 10 | // Returns the cost of the shortest path from the source to every other vertex. 11 | std::vector bellmanFord(const graph::Graph &graph, size_t source); 12 | 13 | // Computes single-source shortest paths for a graph instance and source (vertex 14 | // id) using Bellman-Ford. Note that the edge set is partitioned evenly among 15 | // the number of threads specified. This algorithm tolerates negative-weight 16 | // edges. Returns the cost of the shortest path from the source to every other 17 | // vertex. 18 | std::vector bellmanFordParallel(const graph::Graph &graph, 19 | size_t source, size_t num_threads); 20 | 21 | } /* namespace sssp */ 22 | -------------------------------------------------------------------------------- /sssp-opencl/description.json: -------------------------------------------------------------------------------- 1 | { 2 | "runtime": ["OpenCL"], 3 | "example": "Hello World (CL)", 4 | "overview": [ 5 | "This example is a simple OpenCL application. It will highlight the basic flow of an OpenCL application." 6 | ], 7 | "key_concepts": [ "OpenCL API"], 8 | "os": [ 9 | "Linux" 10 | ], 11 | "libs": [ 12 | "xcl2" 13 | ], 14 | "em_cmd": "./helloworld", 15 | "hw_cmd": "../../../utility/nimbix/nimbix-run.py -- ./helloworld", 16 | "accelerators": [ 17 | { 18 | "container": "vector_addition", 19 | "name": "vector_add", 20 | "location": "src/vector_addition.cl" 21 | } 22 | ], 23 | "contributors" : [ 24 | { 25 | "group": "Xilinx", 26 | "url" : "http://www.xilinx.com" 27 | } 28 | ], 29 | "revision" : [ 30 | { 31 | "date" : "DEC2016", 32 | "version": "1.0", 33 | "description": "Initial Xilinx Release" 34 | } 35 | ] 36 | } 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Aaron Zou 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /lib/graph/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project(GRAPH LANGUAGES CXX) 3 | 4 | ## C++ compiler options 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra") 8 | set(CMAKE_CXX_EXTENSIONS OFF) 9 | 10 | ## Sources and headers 11 | set(LIB_NAME graph) 12 | include_directories(include) 13 | 14 | ## Generate library 15 | add_library(${LIB_NAME} INTERFACE) 16 | target_include_directories(${LIB_NAME} INTERFACE 17 | $ 18 | $ 19 | ) 20 | install(TARGETS ${LIB_NAME} LIBRARY DESTINATION lib) 21 | 22 | ## Generate graph utility executable 23 | # add_executable(graph_gen src/main.cpp) 24 | # target_link_libraries(graph_gen graph) 25 | # target_link_libraries(graph_gen libdocopt) 26 | # install(TARGETS graph_gen RUNTIME DESTINATION bin) 27 | 28 | ## Generate test executable 29 | # enable_testing() 30 | # add_executable(graph_test test/graph_test.cpp) 31 | # target_link_libraries(graph_test graph) 32 | # target_link_libraries(graph_test gtest gtest_main) 33 | # add_test(GRAPH_TEST graph_test) 34 | -------------------------------------------------------------------------------- /sssp-cpu-only/src/util.cpp: -------------------------------------------------------------------------------- 1 | #include "util.hpp" 2 | 3 | using std::vector; 4 | 5 | namespace util { 6 | 7 | vector partition(size_t size, size_t num_threads) { 8 | auto ranges = vector{}; 9 | 10 | auto step_size = size / num_threads; 11 | auto get_extra = size % num_threads; 12 | 13 | size_t start = 0; 14 | size_t end = step_size; 15 | 16 | // Calculate the range for each thread 17 | for (size_t i = 0; i < num_threads; i++) { 18 | // Some threads are assigned additional work beyond minimum 19 | if (i < get_extra) { 20 | end++; 21 | } else if (i == num_threads - 1) { 22 | end = size; 23 | } 24 | ranges.push_back({start, end}); 25 | 26 | // Advance forward 27 | start = end; 28 | end = start + step_size; 29 | } 30 | 31 | return ranges; 32 | } 33 | 34 | void Barrier::wait() { 35 | std::unique_lock lock{mutex_}; 36 | auto current_gen = generation_; 37 | if (--count_ == 0) { 38 | generation_++; 39 | count_ = threshold_; 40 | cond_.notify_all(); 41 | } else { 42 | cond_.wait(lock, 43 | [this, current_gen]() { return current_gen != generation_; }); 44 | } 45 | } 46 | 47 | } /* namespace util */ 48 | -------------------------------------------------------------------------------- /sssp-cuda/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project (sssp_cuda LANGUAGES CUDA CXX) 3 | set(CMAKE_CXX_STANDARD 14) 4 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 5 | set(CMAKE_CXX_EXTENSIONS OFF) 6 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra") 7 | 8 | # Included headers # 9 | include_directories(include) 10 | 11 | # Util lib 12 | add_library(util ${PROJECT_SOURCE_DIR}/src/util.cpp) 13 | 14 | # Cuda lib 15 | add_library(bellman_ford_cuda STATIC ${PROJECT_SOURCE_DIR}/src/bellman_ford.cu) 16 | target_link_libraries(bellman_ford_cuda graph) 17 | set_target_properties(bellman_ford_cuda PROPERTIES CUDA_SEPARATE_COMPILATION ON CUDA_STANDARD 11) 18 | 19 | # Add sources # 20 | set(SOURCES ${PROJECT_SOURCE_DIR}/src/main.cpp) 21 | 22 | # Program binaries and linker 23 | add_executable(sssp_cuda ${SOURCES}) 24 | set_property(TARGET sssp_cuda PROPERTY CUDA_SEPARATE_COMPILATION ON) 25 | target_link_libraries(sssp_cuda PUBLIC util bellman_ford_cuda libdocopt graph ${CMAKE_THREAD_LIBS_INIT}) 26 | install(TARGETS sssp_cuda RUNTIME DESTINATION bin) 27 | 28 | # Run with the appropriate architecture >50 at the command line 29 | # cmake -DCMAKE_CUDA_FLAGS="-arch=sm_61" -DCMAKE_BUILD_TYPE=RelWithDebInfo .. 30 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: cpp 2 | compiler: gcc 3 | os: linux 4 | dist: trusty 5 | sudo: required 6 | 7 | env: 8 | global: 9 | - LINUX_DIST=trusty 10 | - DEPS_DIR=${TRAVIS_BUILD_DIR}/deps 11 | - RUN_TESTS=true 12 | - COVERAGE=false 13 | 14 | before_install: 15 | # C++14 16 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test 17 | - sudo apt-get update -qq 18 | 19 | install: 20 | # Boost 21 | - sudo apt-get install libboost-serialization-dev 22 | # C++14 23 | - sudo apt-get install -qq g++-6 24 | - sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-6 90 25 | - echo ${PATH} 26 | - echo ${CXX} 27 | - ${CXX} --version 28 | - ${CXX} -v 29 | 30 | # Download CMake 31 | - | 32 | if [[ ${TRAVIS_OS_NAME} == "linux" ]]; then 33 | CMAKE_URL="https://cmake.org/files/v3.11/cmake-3.11.0-Linux-x86_64.tar.gz" 34 | mkdir -p ${DEPS_DIR}/cmake 35 | travis_retry wget --no-check-certificate --quiet -O - ${CMAKE_URL} | tar --strip-components=1 -xz -C ${DEPS_DIR}/cmake 36 | export PATH=${DEPS_DIR}/cmake/bin:${PATH} 37 | fi 38 | 39 | script: 40 | - mkdir -p build 41 | - cd build 42 | - cmake -DCMAKE_BUILD_TYPE=Coverage -DBUILD_TESTS=ON .. 43 | - make 44 | - ctest 45 | 46 | -------------------------------------------------------------------------------- /sssp-cpu-only/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8 FATAL_ERROR) 2 | project(SSSP_CPU_ONLY LANGUAGES CXX) 3 | 4 | ## C++ compiler options 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra") 8 | set(CMAKE_CXX_EXTENSIONS OFF) 9 | 10 | ## Required imports 11 | find_package(Threads REQUIRED) 12 | 13 | ## Sources and headers 14 | include_directories(include) 15 | set(SOURCES src/main.cpp src/bellman_ford.cpp) 16 | set(MAIN_EXE_NAME sssp_cpu_only) 17 | 18 | ## Generate library 19 | add_library(bellmanford STATIC src/bellman_ford.cpp src/util.cpp) 20 | target_link_libraries(bellmanford ${CMAKE_THREAD_LIBS_INIT}) 21 | target_link_libraries(bellmanford graph) 22 | 23 | ## Generate executable 24 | add_executable(${MAIN_EXE_NAME} ${SOURCES}) 25 | target_link_libraries(${MAIN_EXE_NAME} bellmanford) 26 | target_link_libraries(${MAIN_EXE_NAME} libdocopt) 27 | install(TARGETS ${MAIN_EXE_NAME} RUNTIME DESTINATION bin) 28 | 29 | ## Generate test executable 30 | ## enable_testing() 31 | ## include(GoogleTest) 32 | ## add_executable(bellmanford_test test/util_test.cpp test/bellman_ford_test.cpp) 33 | ## target_link_libraries(bellmanford_test bellmanford) 34 | ## target_link_libraries(bellmanford_test gtest gtest_main) 35 | ## add_test(BELLMANFORD_TEST bellmanford_test) 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # sssp-fpga 2 | > Bellman-Ford implementations for shared memory systems accelerated using std::thread and FPGAs via AWS EC2 F1 3 | 4 | [![Build Status](https://travis-ci.org/aaron-zou/sssp-fpga.svg?branch=master)](https://travis-ci.org/aaron-zou/sssp-fpga) 5 | 6 | This repository contains multiple implementations of the Bellman-Ford algorithm for solving single-source shortest paths (SSSP). In addition to CPU-only sequential and multithreaded implementations, we also accelerate via FPGA an SDAccel Bellman-Ford implementation written in OpenCL for deployment on AWS F1 FPGA instances. 7 | 8 | ## Installation 9 | 10 | Note: this repository is only tested on Ubuntu 16.04 and Ubuntu 17.10. While CMake introduces portability for the CPU-only implementations, the FPGA requires Linux (see https://github.com/aws/aws-fpga). 11 | 12 | ```sh 13 | mkdir build && cd build 14 | cmake .. 15 | make 16 | ``` 17 | 18 | ### CUDA 19 | 20 | ### SDAccel 21 | export XILINX_SDX 22 | 23 | ### Testing 24 | 25 | Run all tests using: 26 | ```sh 27 | make test 28 | ``` 29 | 30 | ## Authors 31 | - [Aaron Zou](https://github.com/aaron-zou/) 32 | - [Shawn Wu](https://github.com/chudooder) 33 | 34 | ## License 35 | 36 | [![License](http://img.shields.io/:license-mit-blue.svg?style=flat-square)](http://badges.mit-license.org) This project is licensed under the MIT License. 37 | 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Subdirectories 2 | build 3 | bin 4 | third-party 5 | input 6 | data 7 | 8 | # Editor artifacts 9 | .clang_complete 10 | 11 | # CMake artifacts 12 | CMakeCache.txt 13 | CMakeFiles 14 | CMakeScripts 15 | Testing 16 | Makefile 17 | cmake_install.cmake 18 | install_manifest.txt 19 | compile_commands.json 20 | CTestTestfile.cmake 21 | 22 | # Prerequisites 23 | *.d 24 | 25 | # Compiled Object files 26 | *.slo 27 | *.lo 28 | *.o 29 | *.obj 30 | 31 | # Precompiled Headers 32 | *.gch 33 | *.pch 34 | 35 | # Compiled Dynamic libraries 36 | *.so 37 | *.dylib 38 | *.dll 39 | 40 | # Fortran module files 41 | *.mod 42 | *.smod 43 | 44 | # Compiled Static libraries 45 | *.lai 46 | *.la 47 | *.a 48 | *.lib 49 | 50 | # Executables 51 | *.exe 52 | *.out 53 | *.app 54 | 55 | # opencl stuff 56 | sssp-opencl/.Xil/ 57 | sssp-opencl/_xocc_compile_bf_kernel_bf_kernel.sw_emu.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0.dir/ 58 | sssp-opencl/_xocc_compile_vector_addition_vector_addition.sw_emu.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0.dir/ 59 | sssp-opencl/_xocc_link_bf_kernel.sw_emu.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0_bf_kernel.sw_emu.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0.dir/ 60 | sssp-opencl/_xocc_link_vector_addition.sw_emu.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0_vector_addition.sw_emu.xilinx_aws-vu9p-f1_4ddr-xpr-2pr_4_0.dir/ 61 | sssp-opencl/bellmanford 62 | sssp-opencl/emconfig.json 63 | sssp-opencl/helloworld 64 | sssp-opencl/sdaccel_profile_summary.csv 65 | sssp-opencl/sdaccel_profile_summary.html 66 | sssp-opencl/xclbin/ 67 | -------------------------------------------------------------------------------- /sssp-cpu-only/test/bellman_ford_test.cpp: -------------------------------------------------------------------------------- 1 | #include "util.hpp" 2 | #include "graph.hpp" 3 | #include "bellman_ford.hpp" 4 | 5 | #include "gtest/gtest.h" 6 | 7 | using graph::Graph; 8 | using sssp::bellmanFord; 9 | using sssp::bellmanFordParallel; 10 | 11 | class BellmanFordTest : public ::testing::Test { 12 | protected: 13 | BellmanFordTest() : the_graph(4) {} 14 | virtual void SetUp() { 15 | the_graph.addEdge(0, 1, 10); 16 | the_graph.addEdge(1, 2, 20); 17 | the_graph.addEdge(2, 3, 30); 18 | the_graph.addEdge(0, 2, 1); 19 | the_graph.addEdge(1, 3, 1); 20 | } 21 | Graph the_graph; 22 | }; 23 | 24 | TEST_F(BellmanFordTest, Sequential) { 25 | auto dists0 = bellmanFord(the_graph, 0); 26 | EXPECT_EQ(0, dists0.at(0)); // self distance should be 0 27 | EXPECT_EQ(10, dists0.at(1)); 28 | EXPECT_EQ(1, dists0.at(2)); 29 | EXPECT_EQ(11, dists0.at(3)); 30 | 31 | auto dists3 = bellmanFord(the_graph, 3); 32 | EXPECT_EQ(11, dists3.at(0)); 33 | EXPECT_EQ(1, dists3.at(1)); 34 | EXPECT_EQ(12, dists3.at(2)); 35 | EXPECT_EQ(0, dists3.at(3)); 36 | } 37 | 38 | TEST_F(BellmanFordTest, Parallel) { 39 | auto dists0 = bellmanFordParallel(the_graph, 0, 4); 40 | EXPECT_EQ(0, dists0.at(0)); // self distance should be 0 41 | EXPECT_EQ(10, dists0.at(1)); 42 | EXPECT_EQ(1, dists0.at(2)); 43 | EXPECT_EQ(11, dists0.at(3)); 44 | 45 | auto dists3 = bellmanFordParallel(the_graph, 3, 4); 46 | EXPECT_EQ(11, dists3.at(0)); 47 | EXPECT_EQ(1, dists3.at(1)); 48 | EXPECT_EQ(12, dists3.at(2)); 49 | EXPECT_EQ(0, dists3.at(3)); 50 | } 51 | -------------------------------------------------------------------------------- /sssp-cuda/include/locks.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace locks { 6 | 7 | enum class LockType { NONE, MUTEX, SPIN }; 8 | 9 | /** 10 | Thin wrapper around pthread_mutex_t meant to be tied to a data structure. 11 | */ 12 | class Mutex { 13 | public: 14 | explicit Mutex() { pthread_mutex_init(&_mutex, nullptr); } 15 | virtual ~Mutex() { pthread_mutex_destroy(&_mutex); } 16 | Mutex(const Mutex &) = delete; 17 | Mutex &operator=(const Mutex &) = delete; 18 | Mutex(Mutex &&) = delete; 19 | Mutex &operator=(Mutex &&) = delete; 20 | 21 | inline int lock() { return pthread_mutex_lock(&_mutex); } 22 | inline int trylock() { return pthread_mutex_trylock(&_mutex); } 23 | inline int unlock() { return pthread_mutex_unlock(&_mutex); } 24 | 25 | private: 26 | pthread_mutex_t _mutex; 27 | }; 28 | 29 | /** 30 | Thin wrapper around pthread_spinlock_t meant to be tied to a data structure. 31 | */ 32 | class Spinlock { 33 | public: 34 | explicit Spinlock() { 35 | pthread_spin_init(&_spinlock, PTHREAD_PROCESS_PRIVATE); 36 | } 37 | virtual ~Spinlock() { pthread_spin_destroy(&_spinlock); } 38 | Spinlock(const Spinlock &) = delete; 39 | Spinlock &operator=(const Spinlock &) = delete; 40 | Spinlock(Spinlock &&) = delete; 41 | Spinlock &operator=(Spinlock &&) = delete; 42 | 43 | inline int lock() { return pthread_spin_lock(&_spinlock); } 44 | inline int trylock() { return pthread_spin_trylock(&_spinlock); } 45 | inline int unlock() { return pthread_spin_unlock(&_spinlock); } 46 | 47 | private: 48 | pthread_spinlock_t _spinlock; 49 | }; 50 | } 51 | -------------------------------------------------------------------------------- /sssp-cuda/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "bellman_ford.h" 2 | #include "docopt.h" 3 | #include "graph.hpp" 4 | #include 5 | #include 6 | #include 7 | 8 | static const char USAGE[] = 9 | R"(Concurrency Final Project. 10 | 11 | Run GPU accelerated single-source shortest paths using Bellman-Ford. 12 | 13 | Usage: 14 | sssp_cpu_only --input= --source= 15 | 16 | Options: 17 | -h --help Show this screen. 18 | --input= String-valued path to an input graph text file. 19 | --source= Integer-valued source id, must be less than number of vertices. [default: 0] 20 | )"; 21 | 22 | using graph::Graph; 23 | 24 | void printArgs(const std::map &args) { 25 | std::cout << "{" << std::endl; 26 | for (const auto &arg : args) { 27 | std::cout << " " << arg.first << ": " << arg.second << std::endl; 28 | } 29 | std::cout << "}" << std::endl; 30 | } 31 | 32 | void printOutput(const std::vector &paths, 33 | const float duration) { 34 | std::cout << "Shortest paths to each vertex: " << std::endl; 35 | for (size_t i = 0; i < paths.size(); i++) { 36 | std::cout << " " << i << ": " << paths.at(i) << std::endl; 37 | } 38 | 39 | std::cout 40 | << "Elapsed runtime: " 41 | << duration 42 | << "ms" << std::endl; 43 | } 44 | 45 | int main(int argc, const char *argv[]) { 46 | auto args = docopt::docopt(USAGE, {argv + 1, argv + argc}, true); 47 | printArgs(args); 48 | 49 | auto graph = Graph{args.at("--input").asString()}; 50 | auto source = args.at("--source").asLong(); 51 | 52 | auto output = bellmanFordCUDA(graph, source); 53 | printOutput(output.distances, output.elapsed); 54 | 55 | return 0; 56 | } 57 | -------------------------------------------------------------------------------- /lib/graph/test/graph_test.cpp: -------------------------------------------------------------------------------- 1 | #include "graph.hpp" 2 | 3 | #include "gtest/gtest.h" 4 | #include 5 | #include 6 | #include 7 | 8 | using graph::Edge; 9 | using graph::Graph; 10 | 11 | TEST(GraphTest, AddEdge) { 12 | auto graph = Graph{3}; 13 | graph.addEdge(0, 1, 3); 14 | EXPECT_EQ(graph.cost(0, 1), 3); 15 | EXPECT_EQ(graph.cost(1, 0), 3); 16 | } 17 | 18 | TEST(GraphTest, OutOfRange) { 19 | auto graph = Graph{0}; 20 | ASSERT_THROW(graph.cost(0, 1), std::invalid_argument); 21 | } 22 | 23 | TEST(GraphTest, CostInvalidArg) { 24 | auto graph = Graph{3}; 25 | graph.addEdge(0, 2, 3); 26 | ASSERT_THROW(graph.cost(1, 2), std::invalid_argument); 27 | } 28 | 29 | TEST(GraphTest, GetNeighbors) { 30 | auto graph = Graph{3}; 31 | graph.addEdge(0, 1, 2); 32 | graph.addEdge(0, 2, 3); 33 | auto neighbors = graph.getNeighbors(0); 34 | auto expected = std::vector{{1, 2}}; 35 | EXPECT_EQ(neighbors, expected); 36 | } 37 | 38 | TEST(GraphTest, GetNeighborsEmpty) { 39 | auto graph = Graph{3}; 40 | auto neighbors = graph.getNeighbors(0); 41 | auto expected = std::vector{}; 42 | EXPECT_EQ(neighbors, expected); 43 | } 44 | 45 | TEST(GraphTest, SaveAndLoad) { 46 | auto graph = Graph{3}; 47 | graph.addEdge(0, 1, 2); 48 | graph.addEdge(0, 2, 3); 49 | graph.addEdge(1, 2, 4); 50 | 51 | // TODO: properly use tmpfile() later 52 | auto filename = "/tmp/gtest_graph_test.bin"; 53 | Graph new_graph{0}; 54 | try { 55 | graph.saveToFile(filename); 56 | new_graph = Graph(filename); 57 | } catch (const std::exception &e) { 58 | std::remove(filename); 59 | FAIL() << "IO error encountered"; 60 | } 61 | 62 | EXPECT_EQ(new_graph.num_vertices, graph.num_vertices); 63 | EXPECT_EQ(new_graph.cost(0, 1), 2); 64 | EXPECT_EQ(new_graph.cost(0, 2), 3); 65 | EXPECT_EQ(new_graph.cost(1, 2), 4); 66 | auto expected = std::vector{{1, 2}}; 67 | EXPECT_EQ(new_graph.getNumEdges(), graph.getNumEdges()); 68 | EXPECT_EQ(new_graph.getNeighbors(0), expected); 69 | 70 | // std::remove(filename); 71 | } 72 | -------------------------------------------------------------------------------- /sssp-cpu-only/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "bellman_ford.hpp" 2 | #include "docopt.h" 3 | #include "graph.hpp" 4 | #include 5 | #include 6 | #include 7 | 8 | static const char USAGE[] = 9 | R"(Concurrency Final Project. 10 | 11 | Run sequential or multithreaded single-source shortest paths using Bellman-Ford. 12 | 13 | Usage: 14 | sssp_cpu_only --input= --source= [--workers=] 15 | 16 | Options: 17 | -h --help Show this screen. 18 | --input= String-valued path to an input graph text file. 19 | --source= Integer-valued source id, must be less than number of vertices. [default: 0] 20 | --workers= Integer-valued number of threads. 21 | )"; 22 | 23 | using graph::Graph; 24 | 25 | void printArgs(const std::map &args) { 26 | std::cout << "{" << std::endl; 27 | for (const auto &arg : args) { 28 | std::cout << " " << arg.first << ": " << arg.second << std::endl; 29 | } 30 | std::cout << "}" << std::endl; 31 | } 32 | 33 | void printOutput(const std::vector &paths, 34 | const std::chrono::duration &duration) { 35 | std::cout << "Shortest paths to each vertex: " << std::endl; 36 | for (size_t i = 0; i < paths.size(); i++) { 37 | std::cout << " " << i << ": " << paths.at(i) << std::endl; 38 | } 39 | 40 | std::cout 41 | << "Elapsed runtime: " 42 | << std::chrono::duration_cast(duration).count() 43 | << "ms" << std::endl; 44 | } 45 | 46 | int main(int argc, const char *argv[]) { 47 | auto args = docopt::docopt(USAGE, {argv + 1, argv + argc}, true); 48 | auto num_workers = args.at("--workers"); 49 | if (num_workers && num_workers.asLong() <= 0) { 50 | throw std::invalid_argument("Must specify at least one worker!"); 51 | } 52 | printArgs(args); 53 | 54 | auto graph = Graph{args.at("--input").asString()}; 55 | auto source = args.at("--source").asLong(); 56 | 57 | auto start = std::chrono::high_resolution_clock::now(); 58 | auto paths = (!num_workers) 59 | ? sssp::bellmanFord(graph, source) 60 | : sssp::bellmanFordParallel(graph, source, num_workers.asLong()); 61 | auto elapsed = std::chrono::high_resolution_clock::now() - start; 62 | printOutput(paths, elapsed); 63 | 64 | return 0; 65 | } 66 | -------------------------------------------------------------------------------- /cmake/gtest.cmake: -------------------------------------------------------------------------------- 1 | find_package(Threads REQUIRED) 2 | 3 | include(ExternalProject) 4 | ExternalProject_Add(googletest 5 | GIT_REPOSITORY https://github.com/google/googletest.git 6 | UPDATE_COMMAND "" 7 | INSTALL_COMMAND "" 8 | LOG_DOWNLOAD ON 9 | LOG_CONFIGURE ON 10 | LOG_BUILD ON) 11 | 12 | ExternalProject_Get_Property(googletest source_dir) 13 | set(GTEST_INCLUDE_DIRS ${source_dir}/googletest/include) 14 | set(GMOCK_INCLUDE_DIRS ${source_dir}/googlemock/include) 15 | 16 | # The cloning of the above repo doesn't happen until make, however if the dir doesn't 17 | # exist, INTERFACE_INCLUDE_DIRECTORIES will throw an error. 18 | # To make it work, we just create the directory now during config. 19 | file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIRS}) 20 | file(MAKE_DIRECTORY ${GMOCK_INCLUDE_DIRS}) 21 | 22 | ExternalProject_Get_Property(googletest binary_dir) 23 | set(GTEST_LIBRARY_PATH ${binary_dir}/googlemock/gtest/${CMAKE_FIND_LIBRARY_PREFIXES}gtest.a) 24 | set(GTEST_LIBRARY gtest) 25 | add_library(${GTEST_LIBRARY} UNKNOWN IMPORTED) 26 | set_target_properties(${GTEST_LIBRARY} PROPERTIES 27 | "IMPORTED_LOCATION" "${GTEST_LIBRARY_PATH}" 28 | "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}" 29 | "INTERFACE_INCLUDE_DIRECTORIES" "${GTEST_INCLUDE_DIRS}") 30 | add_dependencies(${GTEST_LIBRARY} googletest) 31 | 32 | set(GTEST_MAIN_LIBRARY_PATH ${binary_dir}/googlemock/gtest/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main.a) 33 | set(GTEST_MAIN_LIBRARY gtest_main) 34 | add_library(${GTEST_MAIN_LIBRARY} UNKNOWN IMPORTED) 35 | set_target_properties(${GTEST_MAIN_LIBRARY} PROPERTIES 36 | "IMPORTED_LOCATION" "${GTEST_MAIN_LIBRARY_PATH}" 37 | "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}" 38 | "INTERFACE_INCLUDE_DIRECTORIES" "${GTEST_INCLUDE_DIRS}") 39 | add_dependencies(${GTEST_MAIN_LIBRARY} googletest) 40 | 41 | set(GMOCK_LIBRARY_PATH ${binary_dir}/googlemock/${CMAKE_FIND_LIBRARY_PREFIXES}gmock.a) 42 | set(GMOCK_LIBRARY gmock) 43 | add_library(${GMOCK_LIBRARY} UNKNOWN IMPORTED) 44 | set_target_properties(${GMOCK_LIBRARY} PROPERTIES 45 | "IMPORTED_LOCATION" "${GMOCK_LIBRARY_PATH}" 46 | "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}" 47 | "INTERFACE_INCLUDE_DIRECTORIES" "${GMOCK_INCLUDE_DIRS}") 48 | add_dependencies(${GMOCK_LIBRARY} googletest) 49 | 50 | set(GMOCK_MAIN_LIBRARY_PATH ${binary_dir}/googlemock/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main.a) 51 | set(GMOCK_MAIN_LIBRARY gmock_main) 52 | add_library(${GMOCK_MAIN_LIBRARY} UNKNOWN IMPORTED) 53 | set_target_properties(${GMOCK_MAIN_LIBRARY} PROPERTIES 54 | "IMPORTED_LOCATION" "${GMOCK_MAIN_LIBRARY_PATH}" 55 | "IMPORTED_LINK_INTERFACE_LIBRARIES" "${CMAKE_THREAD_LIBS_INIT}" 56 | "INTERFACE_INCLUDE_DIRECTORIES" "${GMOCK_INCLUDE_DIRS}") 57 | add_dependencies(${GMOCK_MAIN_LIBRARY} ${GTEST_LIBRARY}) 58 | -------------------------------------------------------------------------------- /sssp-opencl/src/bf_kernel.cl: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2017, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | 30 | #pragma OPENCL EXTENSION cl_khr_fp64: enable 31 | #pragma OPENCL EXTENSION cl_khr_int64_base_atomics: enable 32 | 33 | ulong atom_cmpxchg(volatile __global ulong *p, ulong cmp, ulong val); 34 | 35 | double __attribute__((overloadable)) atomic_min(global double *valq, double val) { 36 | 37 | double oldVal; 38 | double newVal; 39 | 40 | unsigned long* oldValPtr = (unsigned long*) &oldVal; 41 | unsigned long* newValPtr = (unsigned long*) &newVal; 42 | 43 | do { 44 | oldVal = *valq; 45 | newVal = val < oldVal ? val : oldVal; 46 | } while (atom_cmpxchg((volatile global unsigned long*) valq, *oldValPtr, *newValPtr) != *oldValPtr); 47 | return oldVal; 48 | } 49 | 50 | // This function represents an OpenCL kernel. The kernel will be call from 51 | // host application using the xcl_run_kernels call. The pointers in kernel 52 | // parameters with the global keyword represents cl_mem objects on the FPGA 53 | // DDR memory. 54 | // 55 | kernel __attribute__((reqd_work_group_size(1, 1, 1))) 56 | void bellman_ford(global double* distsRead, 57 | global double* distsWrite, 58 | global const ulong* sources, 59 | global const ulong* destinations, 60 | global const double* costs, 61 | const ulong numVertices, 62 | const ulong numEdges) 63 | { 64 | for(ulong iter = 0; iter < numVertices; iter++) { 65 | __attribute__((opencl_unroll_hint)) 66 | for(int i=0; i 5 | #include 6 | #include 7 | #include 8 | 9 | using std::vector; 10 | using graph::Edge; 11 | using graph::Graph; 12 | 13 | namespace sssp { 14 | 15 | constexpr double kInfinity = std::numeric_limits::infinity(); 16 | 17 | vector bellmanFord(const Graph &graph, size_t source) { 18 | if (source > graph.num_vertices) { 19 | throw std::invalid_argument("Source id cannot exceed num_vertices"); 20 | } 21 | 22 | // All vertices (except the source) are initially considered to be infinitely 23 | // far away 24 | auto distances = vector(graph.num_vertices, kInfinity); 25 | distances[source] = 0.0; 26 | 27 | // Iteratively relax edges 28 | for (size_t iter = 0; iter < graph.num_vertices; iter++) { 29 | for (const Edge &edge : graph.getAllEdges()) { 30 | // "Relax" this edge (does this edge represent a shorter path?) 31 | if (distances.at(edge.src) + edge.cost < distances.at(edge.dest)) { 32 | distances[edge.dest] = distances.at(edge.src) + edge.cost; 33 | } 34 | } 35 | } 36 | 37 | // Check for negative-weight cycles 38 | for (const Edge &edge : graph.getAllEdges()) { 39 | if (distances.at(edge.src) + edge.cost < distances.at(edge.dest)) { 40 | throw std::domain_error("Graph contains a negative weight cycle."); 41 | } 42 | } 43 | 44 | return distances; 45 | } 46 | 47 | void bellmanFordWorker(const graph::Graph &graph, vector &distances, 48 | const util::Range &range, util::Barrier &barrier, 49 | std::mutex &lock) { 50 | // Iteratively relax edges 51 | const auto &all_edges = graph.getAllEdges(); 52 | auto localDists = vector(distances); 53 | for (size_t iter = 0; iter < graph.num_vertices; iter++) { 54 | // relax edges 55 | for (size_t edge_id = range.start; edge_id < range.end; edge_id++) { 56 | // Relax this edge, ensuring mutual exclusion around access to distances 57 | const auto &edge = all_edges.at(edge_id); 58 | // std::lock_guard guard{lock}; // lock released at end of scope 59 | if (localDists.at(edge.src) + edge.cost < localDists.at(edge.dest)) { 60 | distances[edge.dest] = localDists.at(edge.src) + edge.cost; 61 | } 62 | } 63 | barrier.wait(); 64 | 65 | // read and copy distances 66 | for (size_t i = 0; i < graph.num_vertices; i++) { 67 | localDists[i] = distances.at(i); 68 | } 69 | barrier.wait(); 70 | } 71 | } 72 | 73 | vector bellmanFordParallel(const graph::Graph &graph, size_t source, 74 | size_t num_threads) { 75 | if (source > graph.num_vertices) { 76 | throw std::invalid_argument("Source id cannot exceed num_vertices"); 77 | } 78 | 79 | auto partition = util::partition(graph.getNumEdges(), num_threads); 80 | auto threads = vector{}; 81 | util::Barrier barrier{num_threads}; 82 | std::mutex lock{}; 83 | 84 | // Distances structure setup (non-source vertices initially considered 85 | // infinitely far away) 86 | auto distances = vector(graph.num_vertices, kInfinity); 87 | distances[source] = 0.0; 88 | 89 | // Delegate to the worker function 90 | for (size_t i = 0; i < num_threads; i++) { 91 | threads.emplace_back(&bellmanFordWorker, std::cref(graph), 92 | std::ref(distances), std::cref(partition.at(i)), 93 | std::ref(barrier), std::ref(lock)); 94 | } 95 | for (auto &thread : threads) { 96 | thread.join(); 97 | } 98 | 99 | return distances; 100 | } 101 | 102 | } /* namespace sssp */ 103 | -------------------------------------------------------------------------------- /sssp-cuda/include/util.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "locks.hpp" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | namespace util { 10 | 11 | // Type aliases 12 | using label_t = size_t; 13 | using range_t = std::pair; 14 | 15 | /** 16 | Represents a point in n-space and supports useful operations. 17 | */ 18 | struct Point final { 19 | // Create a zero-vector of length dim 20 | explicit Point(size_t dim) : _point(std::vector(dim, 0.0)) {} 21 | 22 | // Create a vector with the values in point 23 | explicit Point(const std::vector &point) : _point(point) {} 24 | 25 | // Create a vector with random values in [0, 1) 26 | static Point random(size_t dim); 27 | 28 | // Convenience vector operations 29 | double dist(const Point &other) const; 30 | Point &operator+=(const Point &other); 31 | Point &operator/=(size_t divisor); 32 | friend std::ostream &operator<<(std::ostream &os, const Point &obj); 33 | friend bool operator==(const Point &p1, const Point &p2); 34 | size_t dim() const { return _point.size(); } 35 | 36 | // Data 37 | std::vector _point; 38 | }; 39 | 40 | std::ostream &operator<<(std::ostream &os, const Point &obj); 41 | inline bool operator==(const Point &p1, const Point &p2) { 42 | return p1._point == p2._point; 43 | } 44 | 45 | /** 46 | Generates k random centroids with values in [0, 1). 47 | 48 | @param num_features - dimensionality of each centroid 49 | @param k - number of centroids 50 | @return list of centroids (std::vector) 51 | */ 52 | std::vector random_centroids(int k, size_t num_features); 53 | 54 | /** 55 | Check how much centroids and old_centroids differ. 56 | */ 57 | std::pair converged(const std::vector ¢roids, 58 | const std::vector &old_centroids, 59 | double threshold); 60 | 61 | /** 62 | Helper to divide the data range among the threads and handle uneven partition. 63 | Last thread gets the remaining work. 64 | Each range has an inclusive `start` and exclusive `end`: [start, end) 65 | */ 66 | std::vector partition(size_t size, int num_threads); 67 | 68 | // Synchronization strategy 69 | enum class SyncType { COARSE, FINE, NONE }; 70 | 71 | /** 72 | This class represents a collection of points and operations on them 73 | relating to centroids. 74 | */ 75 | class DataSet final { 76 | public: 77 | explicit DataSet(const std::vector &points, locks::LockType lockType) 78 | : _points(points), _num_features(points[0].dim()), _mutex(), _spinlock(), 79 | _type(lockType) {} 80 | 81 | /** 82 | Factory that creates a DataSet from a file following the generate.py format. 83 | 84 | @param filename - path to file of correct format 85 | @return new DataSet instance 86 | */ 87 | static std::shared_ptr 88 | make_dataset(const std::string &filename, 89 | locks::LockType type = locks::LockType::NONE); 90 | 91 | inline range_t max_range() const { return range_t{0, _points.size()}; } 92 | inline size_t num_features() const { return _num_features; } 93 | inline size_t num_points() const { return _points.size(); } 94 | std::vector make_raw() const; 95 | 96 | /** 97 | Set an index for each data point indicating closest centroid. 98 | 99 | @param centroids - list of centroids 100 | @param labels - list of indices into centroids indicating closest one to 101 | each data point 102 | @param range - which data points to process 103 | */ 104 | void nearest_centroids(const std::vector ¢roids, 105 | std::vector &labels, range_t range) const; 106 | 107 | /** 108 | Accumulate matching data points to corresponding centroids. 109 | */ 110 | void sum_labeled_centroids(std::vector ¢roids, 111 | const std::vector &labels, 112 | std::vector &counts, range_t range, 113 | SyncType type) const; 114 | 115 | /** 116 | Divide centroids by counts of contributing points, handle empty cluster. 117 | */ 118 | void normalize_centroids(std::vector ¢roids, 119 | std::vector &counts, range_t k_range) const; 120 | 121 | private: 122 | void lock() const; 123 | void unlock() const; 124 | 125 | private: 126 | std::vector _points; 127 | size_t _num_features; 128 | mutable locks::Mutex _mutex; 129 | mutable locks::Spinlock _spinlock; 130 | locks::LockType _type; 131 | }; 132 | } /* namespace util */ 133 | -------------------------------------------------------------------------------- /lib/graph/include/graph.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | 20 | using std::string; 21 | using std::vector; 22 | 23 | namespace graph { 24 | 25 | constexpr double kEdgeWeightMin = 0.0; 26 | constexpr double kEdgeWeightMax = 100.0; 27 | 28 | namespace detail { 29 | inline void checkBounds(const vector &input, size_t max) { 30 | if (std::any_of(input.begin(), input.end(), 31 | [max](const auto &val) { return val >= max; })) { 32 | throw std::invalid_argument("Specified index out of range"); 33 | } 34 | } 35 | } /* end namespace detail */ 36 | 37 | struct Edge { 38 | size_t src; 39 | size_t dest; 40 | double cost; 41 | }; 42 | 43 | class Graph { 44 | public: 45 | // TODO: make private and move serialization logic into header 46 | // Don't mutate these fields, use the accessors below instead 47 | size_t num_vertices; 48 | std::vector> adjacency_list; 49 | 50 | // Create a graph instance with a max vertex id of num_vertices 51 | Graph(size_t num_vertices) 52 | : num_vertices(num_vertices), adjacency_list(num_vertices), all_edges() {}; 53 | 54 | // Create a graph instance from a text file with the following format: 55 | // 56 | // num_vertices\n 57 | // src_id dest_id cost\n 58 | // ... 59 | // src_id dest_id cost\n 60 | Graph(const std::string &filename) 61 | : num_vertices(), adjacency_list(), all_edges() { 62 | auto file = std::ifstream{filename}; 63 | auto line = std::string{}; 64 | 65 | // Read number of vertices 66 | std::getline(file, line); 67 | num_vertices = static_cast(std::stoi(line)); 68 | adjacency_list.resize(num_vertices); 69 | 70 | // Read in each edge 71 | size_t src; 72 | size_t dest; 73 | double cost; 74 | for (; std::getline(file, line);) { 75 | auto stream = std::stringstream{line}; 76 | stream >> src >> dest >> cost; 77 | addSingleEdge(src, dest, cost); 78 | }; 79 | } 80 | 81 | void addEdge(size_t src, size_t dest, double cost) { 82 | detail::checkBounds({src, dest}, num_vertices); 83 | adjacency_list[src].push_back({src, dest, cost}); 84 | adjacency_list[dest].push_back({dest, src, cost}); 85 | all_edges.push_back({src, dest, cost}); 86 | all_edges.push_back({dest, src, cost}); 87 | } 88 | 89 | void addSingleEdge(size_t src, size_t dest, double cost) { 90 | detail::checkBounds({src, dest}, num_vertices); 91 | adjacency_list[src].push_back({src, dest, cost}); 92 | all_edges.push_back({src, dest, cost}); 93 | } 94 | 95 | inline const std::vector &getEdges(size_t src) const { 96 | return adjacency_list.at(src); 97 | } 98 | inline size_t getNumEdges() const { return all_edges.size(); } 99 | inline const std::vector &getAllEdges() const { return all_edges; } 100 | 101 | std::vector getNeighbors(size_t src) const { 102 | detail::checkBounds({src}, num_vertices); 103 | auto neighbors = vector{}; 104 | for (const auto &edge : adjacency_list.at(src)) { 105 | if (edge.src == src) { 106 | neighbors.push_back(edge.dest); 107 | } 108 | } 109 | return neighbors; 110 | } 111 | 112 | double cost(size_t src, size_t dest) const { 113 | detail::checkBounds({src, dest}, num_vertices); 114 | const auto &vertex_list = adjacency_list.at(src); 115 | auto edge = 116 | std::find_if(vertex_list.begin(), vertex_list.end(), 117 | [dest](const auto &edge) { return edge.dest == dest; }); 118 | if (edge == vertex_list.end()) { 119 | throw std::invalid_argument("Specified edge doesn't exist"); 120 | } 121 | return (*edge).cost; 122 | } 123 | 124 | std::string toString() const { 125 | std::string str; 126 | str += std::to_string(num_vertices) + "\n"; 127 | for (size_t vertex = 0; vertex < num_vertices; vertex++) { 128 | const auto &edges = adjacency_list[vertex]; 129 | for (const Edge &e : edges) { 130 | str += std::to_string(e.src) + " " + std::to_string(e.dest) + " " + 131 | std::to_string(e.cost) + "\n"; 132 | } 133 | } 134 | return str; 135 | } 136 | 137 | // Write the graph instance to a file 138 | void saveToFile(const std::string &output_path) const { 139 | auto out_stream = std::ofstream{output_path}; 140 | out_stream << toString(); 141 | }; 142 | 143 | // Create a graph instance with the given number of vertices and edges. 144 | // Each edge has random unique endpoint vertices and random weight in [-100.0, 145 | // 100.0] 146 | static Graph generateGraph(size_t num_vertices, size_t num_edges) { 147 | std::random_device seed; 148 | auto engine = std::default_random_engine{seed()}; 149 | auto weight_dist = 150 | std::uniform_real_distribution{kEdgeWeightMin, kEdgeWeightMax}; 151 | auto vertex_dist = std::uniform_int_distribution{0, num_vertices - 1}; 152 | 153 | auto graph = Graph{num_vertices}; 154 | if (num_edges < num_vertices - 1) { 155 | auto buffer = std::stringstream{}; 156 | buffer << "Graph generation failed: number of vertices (" << num_vertices 157 | << ") was less than or equal to number of edges" << num_edges << "." 158 | << std::endl; 159 | throw std::invalid_argument(buffer.str()); 160 | } 161 | 162 | size_t src, dest; 163 | for (size_t i = 0; i < num_edges; i++) { 164 | do { 165 | src = vertex_dist(engine); 166 | dest = vertex_dist(engine); 167 | } while (src == dest); 168 | double cost = weight_dist(engine); 169 | graph.addEdge(src, dest, cost); 170 | } 171 | 172 | return graph; 173 | }; 174 | 175 | private: 176 | std::vector all_edges; 177 | }; 178 | } /* namespace graph */ 179 | -------------------------------------------------------------------------------- /sssp-opencl/README.md: -------------------------------------------------------------------------------- 1 | Hello World (CL) 2 | ====================== 3 | 4 | This README file contains the following sections: 5 | 6 | 1. OVERVIEW 7 | 2. HOW TO DOWLOAD THE REPOSITORY 8 | 3. SOFTWARE TOOLS AND SYSTEM REQUIREMENTS 9 | 4. DESIGN FILE HIERARCHY 10 | 5. COMPILATION AND EXECUTION 11 | 6. EXECUTION IN CLOUD ENVIRONMENTS 12 | 7. SUPPORT 13 | 8. LICENSE AND CONTRIBUTING TO THE REPOSITORY 14 | 9. ACKNOWLEDGEMENTS 15 | 10. REVISION HISTORY 16 | 17 | 18 | ## 1. OVERVIEW 19 | This example is a simple OpenCL application. It will highlight the basic flow of an OpenCL application. 20 | 21 | ***KEY CONCEPTS:*** OpenCL API 22 | 23 | ## 2. HOW TO DOWNLOAD THE REPOSITORY 24 | To get a local copy of the SDAccel example repository, clone this repository to the local system with the following command: 25 | ``` 26 | git clone https://github.com/Xilinx/SDAccel_Examples examples 27 | ``` 28 | where examples is the name of the directory where the repository will be stored on the local system.This command needs to be executed only once to retrieve the latest version of all SDAccel examples. The only required software is a local installation of git. 29 | 30 | ## 3. SOFTWARE AND SYSTEM REQUIREMENTS 31 | Board | Device Name | Software Version 32 | ------|-------------|----------------- 33 | Alpha Data ADM-PCIE-7V3|xilinx:adm-pcie-7v3:1ddr|SDAccel 2017.1 34 | Xilinx VU9P|xilinx:xil-accel-rd-vu9p:4ddr-xpr|SDAccel 2017.1 35 | AWS VU9P F1|xilinx:aws-vu9p-f1:4ddr-xpr-2pr|SDAccel 2017.1 36 | Xilinx KU115|xilinx:xil-accel-rd-ku115:4ddr-xpr|SDAccel 2017.1 37 | Alpha Data ADM-PCIE-KU3|xilinx:adm-pcie-ku3:2ddr-xpr|SDAccel 2017.1 38 | 39 | 40 | *NOTE:* The board/device used for compilation can be changed by adding the DEVICES variable to the make command as shown below 41 | ``` 42 | make DEVICES= 43 | ``` 44 | where the *DEVICES* variable accepts either 1 device from the table above or a comma separated list of device names. 45 | 46 | ## 4. DESIGN FILE HIERARCHY 47 | Application code is located in the src directory. Accelerator binary files will be compiled to the xclbin directory. The xclbin directory is required by the Makefile and its contents will be filled during compilation. A listing of all the files in this example is shown below 48 | 49 | ``` 50 | .gitignore 51 | Makefile 52 | README.md 53 | description.json 54 | src/host.cpp 55 | src/vector_addition.cl 56 | ``` 57 | 58 | ## 5. COMPILATION AND EXECUTION 59 | ### Compiling for Application Emulation 60 | As part of the capabilities available to an application developer, SDAccel includes environments to test the correctness of an application at both a software functional level and a hardware emulated level. 61 | These modes, which are named sw_emu and hw_emu, allow the developer to profile and evaluate the performance of a design before compiling for board execution. 62 | It is recommended that all applications are executed in at least the sw_emu mode before being compiled and executed on an FPGA board. 63 | ``` 64 | make TARGETS= all 65 | ``` 66 | where 67 | ``` 68 | sw_emu = software emulation 69 | hw_emu = hardware emulation 70 | ``` 71 | *NOTE:* The software emulation flow is a functional correctness check only. It does not estimate the performance of the application in hardware. 72 | The hardware emulation flow is a cycle accurate simulation of the hardware generated for the application. As such, it is expected for this simulation to take a long time. 73 | It is recommended that for this example the user skips running hardware emulation or modifies the example to work on a reduced data set. 74 | ### Executing Emulated Application 75 | ***Recommended Execution Flow for Example Applications in Emulation*** 76 | 77 | The makefile for the application can directly executed the application with the following command: 78 | ``` 79 | make TARGETS= check 80 | 81 | ``` 82 | where 83 | ``` 84 | sw_emu = software emulation 85 | hw_emu = hardware emulation 86 | ``` 87 | If the application has not been previously compiled, the check makefile rule will compile and execute the application in the emulation mode selected by the user. 88 | 89 | ***Alternative Execution Flow for Example Applications in Emulation*** 90 | 91 | An emulated application can also be executed directly from the command line without using the check makefile rule as long as the user environment has been properly configured. 92 | To manually configure the environment to run the application, set the following 93 | ``` 94 | export LD_LIBRARY_PATH=$XILINX_SDX/runtime/lib/x86_64/:$LD_LIBRARY_PATH 95 | export XCL_EMULATION_MODE= 96 | emconfigutil --xdevice 'xilinx:xil-accel-rd-ku115:4ddr-xpr' --nd 1 97 | ``` 98 | Once the environment has been configured, the application can be executed by 99 | ``` 100 | ./helloworld 101 | ``` 102 | This is the same command executed by the check makefile rule 103 | ### Compiling for Application Execution in the FPGA Accelerator Card 104 | The command to compile the application for execution on the FPGA acceleration board is 105 | ``` 106 | make all 107 | ``` 108 | The default target for the makefile is to compile for hardware. Therefore, setting the TARGETS option is not required. 109 | *NOTE:* Compilation for application execution in hardware generates custom logic to implement the functionality of the kernels in an application. 110 | It is typical for hardware compile times to range from 30 minutes to a couple of hours. 111 | 112 | ## 6. Execution in Cloud Environments 113 | FPGA acceleration boards have been deployed to the cloud. For information on how to execute the example within a specific cloud, take a look at the following guides. 114 | * [AWS F1 Application Execution on Xilinx Virtex UltraScale Devices] 115 | * [Nimbix Application Execution on Xilinx Kintex UltraScale Devices] 116 | * [IBM SuperVessel Research Cloud on Xilinx Virtex Devices] 117 | 118 | 119 | ## 7. SUPPORT 120 | For more information about SDAccel check the [SDAccel User Guides][] 121 | 122 | For questions and to get help on this project or your own projects, visit the [SDAccel Forums][]. 123 | 124 | To execute this example using the SDAccel GUI, follow the setup instructions in [SDAccel GUI README][] 125 | 126 | 127 | ## 8. LICENSE AND CONTRIBUTING TO THE REPOSITORY 128 | The source for this project is licensed under the [3-Clause BSD License][] 129 | 130 | To contribute to this project, follow the guidelines in the [Repository Contribution README][] 131 | 132 | ## 9. ACKNOWLEDGEMENTS 133 | This example is written by developers at 134 | - [Xilinx](http://www.xilinx.com) 135 | 136 | ## 10. REVISION HISTORY 137 | Date | README Version | Description 138 | -----|----------------|------------ 139 | DEC2016|1.0|Initial Xilinx Release 140 | 141 | [3-Clause BSD License]: ../../../LICENSE.txt 142 | [SDAccel Forums]: https://forums.xilinx.com/t5/SDAccel/bd-p/SDx 143 | [SDAccel User Guides]: http://www.xilinx.com/support/documentation-navigation/development-tools/software-development/sdaccel.html?resultsTablePreSelect=documenttype:SeeAll#documentation 144 | [Nimbix Getting Started Guide]: http://www.xilinx.com/support/documentation/sw_manuals/xilinx2016_2/ug1240-sdaccel-nimbix-getting-started.pdf 145 | [Walkthrough Video]: http://bcove.me/6pp0o482 146 | [Nimbix Application Submission README]: ../../../utility/nimbix/README.md 147 | [Repository Contribution README]: ../../../CONTRIBUTING.md 148 | [SDaccel GUI README]: ../../../GUIREADME.md 149 | [AWS F1 Application Execution on Xilinx Virtex UltraScale Devices]: https://github.com/aws/aws-fpga/blob/master/SDAccel/README.md 150 | [Nimbix Application Execution on Xilinx Kintex UltraScale Devices]: ../../../utility/nimbix/README.md 151 | [IBM SuperVessel Research Cloud on Xilinx Virtex Devices]: http://bcove.me/6pp0o482 152 | -------------------------------------------------------------------------------- /sssp-opencl/src/host.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2017, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #include "xcl2.hpp" 30 | #include "graph.hpp" 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | using std::vector; 37 | using std::string; 38 | using graph::Graph; 39 | using graph::Edge; 40 | 41 | static constexpr double kInfinity = std::numeric_limits::max(); 42 | static const std::string error_message = 43 | "Error: Result mismatch:\n" 44 | "i = %d CPU result = %d Device result = %d\n"; 45 | 46 | // This example illustrates the very simple OpenCL example that performs 47 | // an addition on two vectors 48 | int main(int argc, char **argv) { 49 | 50 | string inputFile = "../data/v10k-e25k.graph"; 51 | int source = 0; 52 | if(argc == 3) { 53 | string inputFile = argv[1]; 54 | string sourceStr = argv[2]; 55 | int source = std::stoi(sourceStr); 56 | } 57 | 58 | Graph graph = Graph(inputFile); 59 | 60 | // compute the size of array in bytes 61 | // size_t size_in_bytes = DATA_SIZE * sizeof(int); 62 | 63 | // Creates a vector of DATA_SIZE elements with an initial value of 10 and 32 64 | vector> distsRead(graph.num_vertices, kInfinity); 65 | vector> distsWrite(graph.num_vertices, kInfinity); 66 | distsRead[source] = 0.0; 67 | distsWrite[source] = 0.0; 68 | vector> sources(graph.getNumEdges()); 69 | vector> destinations(graph.getNumEdges()); 70 | vector> costs(graph.getNumEdges()); 71 | 72 | vector allEdges = graph.getAllEdges(); 73 | for(size_t i=0; i devices = xcl::get_xil_devices(); 83 | cl::Device device = devices[0]; 84 | 85 | //Creating Context and Command Queue for selected Device 86 | cl::Context context(device); 87 | cl::CommandQueue q(context, device, CL_QUEUE_PROFILING_ENABLE); 88 | std::string device_name = device.getInfo(); 89 | std::cout << "Found Device=" << device_name.c_str() << std::endl; 90 | 91 | // import_binary() command will find the OpenCL binary file created using the 92 | // xocc compiler load into OpenCL Binary and return as Binaries 93 | // OpenCL and it can contain many functions which can be executed on the 94 | // device. 95 | std::string binaryFile = xcl::find_binary_file(device_name,"bf_kernel"); 96 | cl::Program::Binaries bins = xcl::import_binary_file(binaryFile); 97 | devices.resize(1); 98 | cl::Program program(context, devices, bins); 99 | 100 | // These commands will allocate memory on the FPGA. The cl::Buffer objects can 101 | // be used to reference the memory locations on the device. The cl::Buffer 102 | // object cannot be referenced directly and must be passed to other OpenCL 103 | // functions. 104 | cl::Buffer bufferDistsRead(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, 105 | graph.num_vertices * sizeof(double), distsRead.data()); 106 | cl::Buffer bufferDistsWrite(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE, 107 | graph.num_vertices * sizeof(double), distsWrite.data()); 108 | cl::Buffer bufferSources(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, 109 | graph.getNumEdges() * sizeof(size_t), sources.data()); 110 | cl::Buffer bufferDestinations(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, 111 | graph.getNumEdges() * sizeof(size_t), destinations.data()); 112 | cl::Buffer bufferCosts(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, 113 | graph.getNumEdges() * sizeof(double), costs.data()); 114 | 115 | //Separate Read/write Buffer vector is needed to migrate data between host/device 116 | std::vector inBufVec, outBufVec; 117 | inBufVec.push_back(bufferDistsRead); 118 | inBufVec.push_back(bufferDistsWrite); 119 | inBufVec.push_back(bufferSources); 120 | inBufVec.push_back(bufferDestinations); 121 | inBufVec.push_back(bufferCosts); 122 | outBufVec.push_back(bufferDistsRead); 123 | 124 | 125 | // These commands will load the source_a and source_b vectors from the host 126 | // application and into the buffer_a and buffer_b cl::Buffer objects. The data 127 | // will be be transferred from system memory over PCIe to the FPGA on-board 128 | // DDR memory. 129 | q.enqueueMigrateMemObjects(inBufVec,0/* TN: 0 means from host*/); 130 | 131 | // This call will extract a kernel out of the program we loaded in the 132 | // previous line. A kernel is an OpenCL function that is executed on the 133 | // FPGA. This function is defined in the src/vetor_addition.cl file. 134 | cl::Kernel krnl_bellman_ford(program,"bellman_ford"); 135 | 136 | //set the kernel Arguments 137 | int narg=0; 138 | krnl_bellman_ford.setArg(narg++, bufferDistsRead); 139 | krnl_bellman_ford.setArg(narg++, bufferDistsWrite); 140 | krnl_bellman_ford.setArg(narg++, bufferSources); 141 | krnl_bellman_ford.setArg(narg++, bufferDestinations); 142 | krnl_bellman_ford.setArg(narg++, bufferCosts); 143 | krnl_bellman_ford.setArg(narg++, graph.num_vertices); 144 | krnl_bellman_ford.setArg(narg++, graph.getNumEdges()); 145 | 146 | std::cout << "Max work group size: " << krnl_bellman_ford.getWorkGroupInfo(device) << std::endl; 147 | 148 | //Launch the Kernel 149 | q.enqueueTask(krnl_bellman_ford); 150 | 151 | // The result of the previous kernel execution will need to be retrieved in 152 | // order to view the results. This call will write the data from the 153 | // buffer_result cl_mem object to the source_results vector 154 | q.enqueueMigrateMemObjects(outBufVec,CL_MIGRATE_MEM_OBJECT_HOST); 155 | q.finish(); 156 | 157 | 158 | for (int i = 0; i < graph.num_vertices; i++) { 159 | printf("%d: %f\n", i, distsRead[i]); 160 | } 161 | 162 | return 0; 163 | } 164 | -------------------------------------------------------------------------------- /sssp-cuda/src/util.cpp: -------------------------------------------------------------------------------- 1 | #include "util.h" 2 | 3 | #include "pthread.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | using std::vector; 13 | 14 | namespace util { 15 | 16 | vector random_centroids(int k, size_t num_features) { 17 | auto centroids = vector{}; 18 | for (int i = 0; i < k; i++) { 19 | centroids.push_back(Point::random(num_features)); 20 | } 21 | return centroids; 22 | } 23 | 24 | Point Point::random(size_t dim) { 25 | auto point = vector(dim); 26 | std::generate(point.begin(), point.end(), 27 | []() { return static_cast(std::rand()) / RAND_MAX; }); 28 | return Point{point}; 29 | } 30 | 31 | double Point::dist(const Point &other) const { 32 | if (this->dim() != other.dim()) { 33 | throw std::invalid_argument( 34 | "Point::dist(): two points must have same dimensionality!"); 35 | } 36 | double result = 0.0f; 37 | for (size_t i = 0; i < this->dim(); i++) { 38 | result += std::pow(this->_point[i] - other._point[i], 2); 39 | } 40 | return std::sqrt(result); 41 | } 42 | 43 | Point &Point::operator+=(const Point &other) { 44 | if (this->dim() != other.dim()) { 45 | throw std::invalid_argument( 46 | "Point::operator+=(): two points must have same dimensionality!"); 47 | } 48 | for (size_t i = 0; i < this->dim(); i++) { 49 | this->_point[i] += other._point[i]; 50 | } 51 | return *this; 52 | } 53 | 54 | Point &Point::operator/=(size_t divisor) { 55 | if (divisor == 0) { 56 | throw std::invalid_argument("Point::operator/=(): divide by zero error!"); 57 | } 58 | for (auto &coord : this->_point) { 59 | coord /= divisor; 60 | } 61 | return *this; 62 | } 63 | 64 | std::ostream &operator<<(std::ostream &os, const util::Point &obj) { 65 | os << "<"; 66 | for (size_t i = 0; i < obj._point.size(); i++) { 67 | if (i > 0) { 68 | os << ", "; 69 | } 70 | os << obj._point[i]; 71 | } 72 | os << ">"; 73 | return os; 74 | } 75 | 76 | std::pair converged(const vector ¢roids, 77 | const vector &old_centroids, 78 | double threshold) { 79 | // Check if the most movement for a given centroid is less than threshold 80 | auto delta_dists = vector{}; 81 | std::transform(centroids.begin(), centroids.end(), old_centroids.begin(), 82 | std::back_inserter(delta_dists), 83 | [](const auto &point1, const auto &point2) { 84 | return point1.dist(point2); 85 | }); 86 | auto max_change = *std::max_element(delta_dists.begin(), delta_dists.end()); 87 | return {max_change <= threshold, max_change}; 88 | } 89 | 90 | vector partition(size_t size, int num_threads) { 91 | auto ranges = vector{}; 92 | 93 | auto step_size = size / num_threads; 94 | int get_extra = size % num_threads; 95 | 96 | auto start = 0; 97 | auto end = step_size; 98 | 99 | for (int i = 0; i < num_threads; i++) { 100 | // Some threads are assigned additional work beyond minimum 101 | if (i < get_extra) { 102 | end++; 103 | } else if (i == num_threads - 1) { 104 | end = size; 105 | } 106 | ranges.push_back(range_t{start, end}); 107 | 108 | // Take a "step" forward 109 | start = end; 110 | end = start + step_size; 111 | } 112 | return ranges; 113 | } 114 | 115 | void DataSet::nearest_centroids(const vector ¢roids, 116 | vector &labels, range_t range) const { 117 | // No synchronization necessary, disjoint read-only intervals of centroids 118 | for (size_t i = range.first; i < range.second; i++) { 119 | // Calculate distance from current point to each centroid 120 | auto dists = vector{}; 121 | std::transform( 122 | centroids.begin(), centroids.end(), std::back_inserter(dists), 123 | [this, i](const auto ¢roid) { return _points[i].dist(centroid); }); 124 | 125 | // Store index of closest centroid 126 | labels[i] = std::distance(dists.begin(), 127 | std::min_element(dists.begin(), dists.end())); 128 | } 129 | } 130 | 131 | void DataSet::sum_labeled_centroids(vector ¢roids, 132 | const vector &labels, 133 | vector &counts, range_t range, 134 | SyncType type) const { 135 | if (type == SyncType::FINE) { 136 | // Increment local variables first, then bulk update shared state 137 | size_t k = centroids.size(); 138 | auto local_centroids = vector{k, Point{num_features()}}; 139 | auto local_counts = vector(k); 140 | 141 | for (size_t i = range.first; i < range.second; i++) { 142 | auto label = labels[i]; 143 | local_centroids[label] += this->_points[i]; 144 | local_counts[label]++; 145 | } 146 | 147 | for (size_t i = 0; i < k; i++) { 148 | this->lock(); 149 | if (counts[i] == 0) { 150 | centroids[i] = local_centroids[i]; 151 | } else { 152 | centroids[i] += local_centroids[i]; 153 | } 154 | counts[i] += local_counts[i]; 155 | this->unlock(); 156 | } 157 | } else { 158 | // Add each point to corresponding centroid 159 | for (size_t i = range.first; i < range.second; i++) { 160 | auto label = labels[i]; 161 | this->lock(); 162 | if (counts[label] == 0) { 163 | centroids[label] = this->_points[i]; 164 | } else { 165 | centroids[label] += this->_points[i]; 166 | } 167 | counts[label]++; 168 | this->unlock(); 169 | } 170 | } 171 | } 172 | 173 | void DataSet::normalize_centroids(vector ¢roids, 174 | vector &counts, 175 | range_t k_range) const { 176 | // Divide by number of points and handle case where no points are assigned 177 | for (size_t i = k_range.first; i < k_range.second; i++) { 178 | if (counts[i] > 0) { 179 | centroids[i] /= counts[i]; 180 | } else { 181 | // Assign a random point to this centroid 182 | auto index = std::rand() % this->_points.size(); 183 | centroids[i] = this->_points[index]; 184 | } 185 | // Always reset count 186 | counts[i] = 0; 187 | } 188 | } 189 | 190 | void DataSet::lock() const { 191 | if (_type == locks::LockType::MUTEX) { 192 | _mutex.lock(); 193 | } else if (_type == locks::LockType::SPIN) { 194 | _spinlock.lock(); 195 | } else { 196 | return; 197 | } 198 | } 199 | 200 | void DataSet::unlock() const { 201 | if (_type == locks::LockType::MUTEX) { 202 | _mutex.unlock(); 203 | } else if (_type == locks::LockType::SPIN) { 204 | _spinlock.unlock(); 205 | } else { 206 | return; 207 | } 208 | } 209 | 210 | vector DataSet::make_raw() const { 211 | auto raw_output = vector{}; 212 | for (const auto &point : _points) { 213 | raw_output.insert(raw_output.end(), point._point.begin(), 214 | point._point.end()); 215 | } 216 | return raw_output; 217 | } 218 | 219 | std::shared_ptr 220 | DataSet::make_dataset(const std::string &filename, locks::LockType lockType) { 221 | auto file = std::ifstream{filename}; 222 | auto line = std::string{}; 223 | auto points = vector{}; 224 | size_t num_dims = 0; 225 | 226 | // Get number of points 227 | std::getline(file, line); 228 | int num_points = std::stoi(line); 229 | points.reserve(num_points); 230 | if (num_points < 1) { 231 | throw std::invalid_argument("File must have at least one point!"); 232 | } 233 | 234 | // Read each line in as a n-dim point 235 | for (int i = 0; i < num_points; i++) { 236 | std::getline(file, line); 237 | auto stream = std::stringstream{line}; 238 | auto point = vector{}; 239 | 240 | // Discard point index (line number) 241 | int index; 242 | stream >> index; 243 | 244 | double coord; 245 | while (stream >> coord) { 246 | point.push_back(coord); 247 | } 248 | if (num_dims == 0) { 249 | num_dims = point.size(); 250 | } else if (point.size() != num_dims) { 251 | throw std::invalid_argument( 252 | "Points must all have the same number of dimensions!"); 253 | } 254 | points.push_back(Point{point}); 255 | } 256 | 257 | return std::make_shared(points, lockType); 258 | } 259 | } /* namespace util */ 260 | -------------------------------------------------------------------------------- /sssp-cuda/src/bellman_ford.cu: -------------------------------------------------------------------------------- 1 | #include "bellman_ford.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | using std::vector; 15 | using std::unique_ptr; 16 | using graph::Graph; 17 | using graph::Edge; 18 | 19 | #define CHECK_CUDA_ERROR(ans) \ 20 | { gpuAssert((ans), __FILE__, __LINE__); } 21 | inline void gpuAssert(cudaError_t code, const char *file, int line) { 22 | if (code != cudaSuccess) { 23 | fprintf(stderr, "GPU error: %s %s %d\n", cudaGetErrorString(code), file, 24 | line); 25 | exit(code); 26 | } 27 | } 28 | 29 | // Global variables 30 | constexpr double kInfinity = std::numeric_limits::infinity(); 31 | constexpr size_t kBlockSize = 512; 32 | constexpr size_t kWordSize = 8; 33 | __device__ double gMaxChange = 0.0; 34 | struct deleter { 35 | void operator()(void *ptr) { cudaFree(ptr); } 36 | }; 37 | 38 | class CudaEvent final { 39 | public: 40 | CudaEvent() { cudaEventCreate(&event); } 41 | ~CudaEvent() { cudaEventDestroy(event); } 42 | void wait() { cudaEventSynchronize(event); } 43 | void record() { cudaEventRecord(event); } 44 | float since(const CudaEvent &earlier) { 45 | float elapsed = 0.0f; 46 | cudaEventElapsedTime(&elapsed, earlier.event, event); 47 | return elapsed; 48 | } 49 | 50 | private: 51 | cudaEvent_t event; 52 | }; 53 | 54 | /** 55 | * Thin RAII wrapper managing device view and memory of a 1D vector. 56 | */ 57 | template struct CudaVector final { 58 | size_t num_elems; 59 | size_t num_bytes; 60 | unique_ptr device_ptr; 61 | 62 | CudaVector(size_t num_elems_) 63 | : num_elems(num_elems_), num_bytes(), device_ptr(nullptr) { 64 | // Align all allocations to 8-byte boundary 65 | num_bytes = num_elems_ * sizeof(T); 66 | size_t rem = num_bytes % kWordSize; 67 | num_bytes += kWordSize - rem; 68 | 69 | // Malloc memory and zero-initialize it 70 | CHECK_CUDA_ERROR(cudaMalloc((void **)&device_ptr, num_bytes)); 71 | CHECK_CUDA_ERROR(cudaMemset(device_ptr.get(), 0, num_bytes)); 72 | } 73 | 74 | CudaVector(const vector &data_) 75 | : num_elems(data_.size()), num_bytes(0), device_ptr(nullptr) { 76 | num_bytes = data_.size() * sizeof(T); 77 | size_t rem = num_bytes % kWordSize; 78 | num_bytes += kWordSize - rem; 79 | 80 | const char* host_ptr = reinterpret_cast(data_.data()); 81 | 82 | CHECK_CUDA_ERROR(cudaMalloc((void **)&device_ptr, num_bytes)); 83 | CHECK_CUDA_ERROR(cudaMemcpy(device_ptr.get(), host_ptr, num_elems * sizeof(T), 84 | cudaMemcpyHostToDevice)); 85 | } 86 | 87 | void copyFromDevice(T *host_ptr) { 88 | CHECK_CUDA_ERROR(cudaMemcpy(host_ptr, device_ptr.get(), num_elems * sizeof(T), 89 | cudaMemcpyDeviceToHost)); 90 | } 91 | 92 | const T *data() const { return device_ptr.get(); } 93 | T *data() { return device_ptr.get(); } 94 | void clear() { CHECK_CUDA_ERROR(cudaMemset(device_ptr.get(), 0, num_bytes)); } 95 | }; 96 | 97 | /** 98 | * Thin RAII wrapper managing device view and memory of a 2D array. 99 | */ 100 | template struct CudaArray final { 101 | size_t num_rows; 102 | size_t num_cols; 103 | size_t pitch; 104 | size_t num_bytes; 105 | unique_ptr device_ptr; 106 | 107 | CudaArray(size_t num_rows_, size_t num_cols_) 108 | : num_rows(num_rows_), num_cols(num_cols_), pitch(), num_bytes(), 109 | device_ptr(nullptr) { 110 | // Allocate 2D pitched memory on the device and get device ptr 111 | CHECK_CUDA_ERROR(cudaMallocPitch((void **)&device_ptr, &pitch, 112 | num_cols * sizeof(T), num_rows)); 113 | CHECK_CUDA_ERROR( 114 | cudaMemset2D(device_ptr.get(), pitch, 0, num_cols, num_rows)); 115 | num_bytes = num_rows * pitch; 116 | } 117 | 118 | CudaArray(const vector &data_, size_t num_cols_) 119 | : CudaArray(data_.size() / num_cols_, num_cols_) { 120 | // Additionally memcpy host src vector to device 121 | auto host_ptr = reinterpret_cast(data_.data()); 122 | CHECK_CUDA_ERROR(cudaMemcpy2D(device_ptr.get(), pitch, host_ptr, 123 | num_cols * sizeof(T), num_cols * sizeof(T), 124 | num_rows, cudaMemcpyHostToDevice)); 125 | } 126 | 127 | void copyFromDevice(T *host_ptr) { 128 | CHECK_CUDA_ERROR(cudaMemcpy2D(host_ptr, num_cols * sizeof(T), 129 | device_ptr.get(), pitch, num_cols * sizeof(T), 130 | num_rows, cudaMemcpyDeviceToHost)); 131 | } 132 | 133 | const T *data() const { return device_ptr.get(); } 134 | T *data() { return device_ptr.get(); } 135 | void clear() { 136 | CHECK_CUDA_ERROR( 137 | cudaMemset2D(device_ptr.get(), pitch, 0, num_cols, num_rows)); 138 | } 139 | }; 140 | 141 | __global__ void init(unsigned int seed, curandState_t *states) { 142 | curand_init(seed, threadIdx.x, 0, &states[threadIdx.x]); 143 | } 144 | 145 | // Helper to allow atomicMax to be invoked on a double 146 | // https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h 147 | __device__ __forceinline__ double atomicMax(double *address, double val) { 148 | unsigned long long ret = __double_as_longlong(*address); 149 | while (val > __longlong_as_double(ret)) { 150 | unsigned long long old = ret; 151 | if ((ret = atomicCAS((unsigned long long *)address, old, 152 | __double_as_longlong(val))) == old) 153 | break; 154 | } 155 | return __longlong_as_double(ret); 156 | } 157 | 158 | // Helper to allow atomicMin to be invoked on a double 159 | // https://github.com/treecode/Bonsai/blob/master/runtime/profiling/derived_atomic_functions.h 160 | __device__ __forceinline__ double atomicMin(double *address, double val) { 161 | unsigned long long ret = __double_as_longlong(*address); 162 | while (val < __longlong_as_double(ret)) { 163 | unsigned long long old = ret; 164 | if ((ret = atomicCAS((unsigned long long *)address, old, 165 | __double_as_longlong(val))) == old) 166 | break; 167 | } 168 | return __longlong_as_double(ret); 169 | } 170 | 171 | // Templated helper to index into a flat array 172 | template 173 | __device__ __forceinline__ T *address(T *src, size_t row, size_t col, 174 | size_t pitch) { 175 | return (T *)((char *)src + row * pitch) + col; 176 | } 177 | 178 | // Helper function for L2 distance 179 | __device__ __forceinline__ double 180 | dist(const double *__restrict__ p0, const double *__restrict__ p1, size_t dim) { 181 | double distance = 0.0; 182 | for (int i = 0; i < dim; i++) { 183 | distance += (p0[i] - p1[i]) * (p0[i] - p1[i]); 184 | } 185 | return distance; 186 | } 187 | 188 | __device__ __forceinline__ void atomicVecAdd(double *__restrict__ dest, 189 | const double *__restrict__ src, 190 | size_t dim) { 191 | for (int i = 0; i < dim; i++) { 192 | atomicAdd(&dest[i], src[i]); 193 | } 194 | } 195 | 196 | __global__ void relax(const size_t num_edges, const double* distsRead, 197 | double* distsWrite, const Edge* edges) { 198 | 199 | const size_t index = blockIdx.x * blockDim.x + threadIdx.x; 200 | if(index > num_edges) { 201 | return; 202 | } 203 | 204 | Edge edge = edges[index]; 205 | 206 | double val = distsRead[edge.src] + edge.cost; 207 | atomicMin(&distsWrite[edge.dest], val); 208 | } 209 | 210 | __global__ void copyBack(const size_t num_points, double* distsRead, 211 | const double* distsWrite) { 212 | const size_t index = blockIdx.x * blockDim.x + threadIdx.x; 213 | if(index > num_points) { 214 | return; 215 | } 216 | 217 | distsRead[index] = distsWrite[index]; 218 | }; 219 | 220 | BellmanFordOutput bellmanFordCUDA(const Graph &graph, size_t source) { 221 | auto start = CudaEvent{}; 222 | auto end = CudaEvent{}; 223 | 224 | start.record(); 225 | 226 | vector localDistances = vector(graph.num_vertices, kInfinity); 227 | localDistances[source] = 0.0; 228 | 229 | // data structure initialization 230 | CudaVector distsRead = CudaVector(localDistances); 231 | CudaVector distsWrite = CudaVector(localDistances); 232 | CudaVector edges = CudaVector(graph.getAllEdges()); 233 | 234 | // grid and block size calculation 235 | auto threads = dim3{kBlockSize}; 236 | auto blocks = dim3{}; 237 | blocks.x = (graph.getNumEdges() + threads.x - 1) / threads.x; 238 | 239 | // iteration 240 | for(size_t iter = 0; iter < graph.num_vertices; iter++) { 241 | relax<<>>(graph.getNumEdges(), distsRead.data(), distsWrite.data(), edges.data()); 242 | copyBack<<>>(graph.num_vertices, distsRead.data(), distsWrite.data()); 243 | } 244 | 245 | end.record(); 246 | distsRead.copyFromDevice(localDistances.data()); 247 | end.wait(); 248 | 249 | return {localDistances, end.since(start)}; 250 | } 251 | --------------------------------------------------------------------------------