├── .gitignore ├── .gitmodules ├── CMakeGTEST.txt.in ├── CMakeLists.txt ├── LICENSE ├── README.md ├── codegen ├── codegen.py ├── common.py ├── fpga-06-07-08-09.json ├── main.py ├── notes.txt ├── ops.py ├── program.py ├── requirements.txt ├── rewrite.py ├── routing.py ├── routing_table.py ├── serialization.py ├── templates │ ├── bcast.cl │ ├── ckr.cl │ ├── cks.cl │ ├── device.cl │ ├── gather.cl │ ├── host.cl │ ├── host_hlslib.cl │ ├── pop.cl │ ├── push.cl │ ├── reduce.cl │ ├── scatter.cl │ └── utils.cl ├── tests │ ├── conftest.py │ ├── data │ │ ├── buffer-size-expected.cl │ │ ├── buffer-size.cl │ │ ├── complex-expected.cl │ │ ├── complex.cl │ │ ├── constant-variable-expected.cl │ │ ├── constant-variable.cl │ │ ├── data-type-expected.cl │ │ ├── data-type.cl │ │ ├── kernel-attribute-expected.cl │ │ ├── kernel-attribute.cl │ │ ├── port-expected.cl │ │ ├── port.cl │ │ ├── reduce-expected.cl │ │ ├── reduce.cl │ │ ├── smi-device-1.h │ │ └── smi-host-1.h │ ├── test_codegen.py │ ├── test_parse.py │ ├── test_program.py │ ├── test_rewriter.py │ ├── test_routing.py │ ├── test_routing_table.py │ └── test_utils.py ├── topology_file_generator.py └── utils.py ├── examples ├── CMakeLists.txt ├── host │ ├── gesummv_onchip.cpp │ ├── gesummv_smi.cpp │ ├── kmeans_smi.cpp │ ├── stencil_onchip.cpp │ └── stencil_smi.cpp ├── include │ ├── common.h │ ├── fblas.h │ ├── kmeans.h.in │ └── stencil.h.in └── kernels │ ├── gesummv_onchip.cl │ ├── gesummv_rank0.cl │ ├── gesummv_rank1.cl │ ├── gesummv_smi.json │ ├── kmeans_smi.cl │ ├── kmeans_smi.json │ ├── stencil_onchip.cl.in │ ├── stencil_onchip.py │ ├── stencil_onchip_pe.cl.in │ ├── stencil_smi.cl │ └── stencil_smi.json ├── include ├── opencl-shim.h ├── smi.h ├── smi │ ├── bcast.h │ ├── channel_descriptor.h │ ├── communicator.h │ ├── data_types.h │ ├── gather.h │ ├── header_message.h │ ├── network_message.h │ ├── operation_type.h │ ├── pop.h │ ├── push.h │ ├── reduce.h │ ├── reduce_operations.h │ └── scatter.h └── utils │ ├── ocl_utils.hpp │ ├── 
smi_utils.hpp │ └── utils.hpp ├── microbenchmarks ├── CMakeLists.txt ├── host │ ├── bandwidth_benchmark.cpp │ ├── broadcast_benchmark.cpp │ ├── gather_benchmark.cpp │ ├── injection_rate_benchmark.cpp │ ├── latency_benchmark.cpp │ ├── multi_collectives_benchmark.cpp │ ├── reduce_benchmark.cpp │ └── scatter_benchmark.cpp └── kernels │ ├── bandwidth.json │ ├── bandwidth_0.cl │ ├── bandwidth_1.cl │ ├── broadcast.cl │ ├── broadcast.json │ ├── gather.cl │ ├── gather.json │ ├── injection_rate.json │ ├── injection_rate_0.cl │ ├── injection_rate_1.cl │ ├── latency.json │ ├── latency_0.cl │ ├── latency_1.cl │ ├── multi_collectives.cl │ ├── multi_collectives.json │ ├── reduce.cl │ ├── reduce.json │ ├── scatter.cl │ └── scatter.json ├── misc └── smi.png ├── source-rewriter ├── CMakeLists.txt └── src │ ├── action.cpp │ ├── action.h │ ├── main.cpp │ ├── ops │ ├── broadcast.cpp │ ├── broadcast.h │ ├── gather.cpp │ ├── gather.h │ ├── ops.cpp │ ├── ops.h │ ├── pop.cpp │ ├── pop.h │ ├── push.cpp │ ├── push.h │ ├── reduce.cpp │ ├── reduce.h │ ├── scatter.cpp │ ├── scatter.h │ ├── utils.cpp │ └── utils.h │ ├── rewrite.cpp │ ├── rewrite.h │ ├── third-party │ └── json.hpp │ ├── utils.cpp │ └── utils.h └── test ├── CMakeLists.txt ├── README.md ├── broadcast ├── broadcast.cl ├── broadcast.json └── test_broadcast.cpp ├── gather ├── gather.cl ├── gather.json └── test_gather.cpp ├── mixed ├── mixed.cl ├── mixed.json └── test_mixed.cpp ├── p2p ├── p2p.json ├── p2p_rank0.cl ├── p2p_rank1.cl └── test_p2p.cpp ├── reduce ├── reduce.cl ├── reduce.json └── test_reduce.cpp └── scatter ├── scatter.cl ├── scatter.json └── test_scatter.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | .ycm_extra_conf.py 2 | build* 3 | __pycache__ 4 | 5 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "hlslib"] 2 | path = 
hlslib 3 | url = https://github.com/definelicht/hlslib.git 4 | -------------------------------------------------------------------------------- /CMakeGTEST.txt.in: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | 3 | project(googletest-download NONE) 4 | 5 | include(ExternalProject) 6 | ExternalProject_Add(googletest 7 | GIT_REPOSITORY https://github.com/google/googletest.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" 10 | BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2019, SPCL - ETH Zurich 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Streaming Message Interface 4 | 5 | **Streaming Message Interface** is a distributed memory HLS programming model for **FPGAs** that provides 6 | the convenience of message passing for HLS-programmed hardware devices. Instead of the bulk transmission typical of the message passing model, 7 | with SMI messages are **streamed** across the network during computation, allowing communication to be seamlessly integrated into pipelined designs. 8 | 9 | This repository contains a high-level synthesis implementation of SMI targeting OpenCL and Intel FPGAs, and all the 10 | applications used for the evaluation performed in the paper: *"Streaming Message Interface: High-Performance Distributed Memory 11 | Programming on Reconfigurable Hardware"*, Tiziano De Matteis, Johannes de Fine Licht, Jakub Beránek, and Torsten Hoefler. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, 2019 (SC 2019). 12 | 13 | 14 | Please refer to the [wiki](https://github.com/spcl/SMI/wiki) and to the paper for a reference on how to use SMI for your own distributed FPGA programs. 
15 | 16 | 17 | ## Reproducing the paper experiments 18 | 19 | All the tests and evaluations reported in the paper have been performed on a set of Bittware 520N cards (Stratix 10), 20 | each of them equipped with 4 network connections (QSFP modules) operating at 40Gbit/s. 21 | 22 | ### Requirements 23 | 24 | The library depends on: 25 | 26 | * CMake for configuration 27 | * Intel FPGA SDK for OpenCL pro, version 18.1 ([http://fpgasoftware.intel.com/opencl/](http://fpgasoftware.intel.com/opencl/)). *Experimental: support for v19+* 28 | * GCC (version 5+) 29 | * An MPI implementation (e.g. OpenMPI) 30 | * Python (version 3+) 31 | * Clang (version 8+) 32 | 33 | ### Compilation 34 | 35 | After cloning this repository, make sure you clone the submodule dependency, by executing the following command: 36 | 37 | ``` 38 | git submodule update --init 39 | ``` 40 | 41 | The project uses CMake for configuration. To configure the project and build the bitstreams and executables: 42 | 43 | ```bash 44 | mkdir build 45 | cd build 46 | cmake .. 47 | ``` 48 | The experiments shown in the paper are organized in two subdirectories of the CMake folder, `microbenchmarks` and `examples`. 49 | 50 | For each of them the following targets are offered: 51 | 52 | - `make <application>_emulator` builds the emulation version of the FPGA program; 53 | - `make <application>_host` builds the host program; 54 | - `make <application>_<program>_aoc_report` generates the report; 55 | - `make <application>_<program>_aoc_build` builds the hardware (can take several hours). 56 | 57 | The applications present in the repository are the following. For the details please refer to the paper: 58 | 59 | **Microbenchmarks** 60 | 61 | - `bandwidth`: bandwidth microbenchmark: an MPMD application composed of two programs, namely `bandwidth_0` (sender) and `bandwidth_1` (receiver); 62 | - `latency`: latency microbenchmark: an MPMD application composed of two programs, namely `latency_0` (source) and `latency_1` (destination). 
63 | - `injection`: injection microbenchmark: an MPMD application composed of two programs, namely `injection_0` (sender) and `injection_1` (receiver). 64 | - `broadcast`: broadcast microbenchmark: an SPMD application (`broadcast`) 65 | - `reduce`: reduce microbenchmark: an SPMD application (`reduce`) 66 | - `scatter`: scatter microbenchmark (not included in the paper): an SPMD application (`scatter`) 67 | - `gather`: gather microbenchmark (not included in the paper): an SPMD application (`gather`) 68 | 69 | **Application examples** 70 | 71 | - `stencil_smi`: stencil application, smi implementation. It is composed of a single program (`stencil_smi`); 72 | - `stencil_onchip`: on chip version of the stencil application; 73 | - `gesummv_smi`: gesummv, smi implementation: composed of two programs (`gesummv_rank0` and `gesummv_rank1`); 74 | - `gesummv_onchip`: on chip version of the gesummv application. 75 | 76 | **Unit tests** 77 | 78 | To enable unit tests, please execute `cmake` with the following flag `-DENABLE_TESTS=ON` 79 | 80 | 81 | **Please Note**: all the host programs have been written by considering the target architecture used in the paper, which is characterized by a set of nodes each one having 2 FPGAs. 82 | If you are using a different setup, please adjust the host programs. 83 | 84 | ### Example 85 | 86 | Suppose that the user wants to execute the `stencil_smi` application in emulation. 
87 | The following steps must be performed: 88 | 89 | ```bash 90 | cd examples 91 | # Compile the emulation version 92 | make stencil_smi_emulator -j 93 | # Compile the host program 94 | make stencil_smi_host 95 | cd stencil_smi 96 | # Execute the program 97 | env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./stencil_smi_host emulator 98 | ``` 99 | 100 | To generate the report, from the `examples` directory in the CMake folder, the user must execute: 101 | ```bash 102 | make stencil_smi_stencil_smi_aoc_report 103 | ``` 104 | 105 | The report will be stored under `examples/stencil_smi/stencil_smi`. 106 | 107 | 108 | 109 | #### Stencil parameters 110 | 111 | For the stencil application, the stencil sizes and number of ranks in either dimension are configured using CMake parameters: 112 | 113 | ```bash 114 | cmake .. -DSMI_STENCIL_SIZE_X=8192 -DSMI_STENCIL_SIZE_Y=8192 -DSMI_STENCIL_NUM_PROCS_X=2 -DSMI_STENCIL_NUM_PROCS_Y=2 115 | ``` 116 | 117 | Other parameters include `SMI_VECTORIZATION_WIDTH`, `SMI_DATATYPE`, `SMI_FMAX`, and `SMI_ROUTING_FILE`. 
def channel_name(src: Channel, out: bool, graph: Graph) -> str:
    """
    Builds the name of the I/O channel attached to `src`.

    The name joins two endpoints with an underscore: the local one, written as
    "r<rank>c<index>", and the endpoint on a different FPGA that `src` shares
    an edge with, or the literal "unconnected" when there is no such edge.
    For an output channel (`out` is true) the local endpoint comes first; for
    an input channel the order is reversed.
    """
    peer = None
    # Every edge is inspected; if several lead to other FPGAs, the last one wins.
    for _, neighbour in graph.edges(src):
        if neighbour.fpga != src.fpga:
            peer = neighbour

    here = "r{}c{}".format(src.fpga.rank, src.index)
    there = "r{}c{}".format(peer.fpga.rank, peer.index) if peer else "unconnected"

    parts = (here, there) if out else (there, here)
    return "{}_{}".format(*parts)
def write_nodefile(fpgas: List[FPGA], stream):
    """
    Writes one line per FPGA to `stream`, ordered by ascending FPGA rank.

    Each line has the form "<node> # <name>, rank<position>", where position is
    the zero-based place of the FPGA in the rank-sorted sequence.
    """
    position = 0
    for device in sorted(fpgas, key=lambda item: item.rank):
        entry = "{} # {}, rank{}\n".format(device.node, device.name, position)
        stream.write(entry)
        position += 1
"fpga-0009:acl0:ch1", 28 | "fpga-0006:acl1:ch0": "fpga-0009:acl1:ch1" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /codegen/notes.txt: -------------------------------------------------------------------------------- 1 | # Nomenclature 2 | Logical port = number that the user enters into the SMI API (e.g. SMI_Open_send_channel(..., 5)) 3 | Channel = Intel channel used to write and read data 4 | 5 | # Routing tables 6 | There are 2 routing tables for each rank. 7 | 8 | ## CKS routing table 9 | - size is the number of Logical ports 10 | - maps Logical port to a number: 11 | 0 => send to connected QSFP 12 | 1 => send to connected CKR 13 | 2 => send to first neighbour CKS 14 | => send to ... neighbour CKS 15 | 16 | - neighbours are always counted from the first kernel 17 | Kernel 0 18 | Kernel 1 19 | Kernel 2 20 | 21 | K0 - first neighbour is K1, second neighbour is K2 22 | K1 - first neighbour is K0, second neighbour is K2 23 | K2 - first neighbour is K0, second neighbour is K1 24 | 25 | ## CKR routing table 26 | - size is the number of Logical ports * 2 27 | - contains entries for both data and control ports 28 | Logical port X: data is at rt[x * 2], control is at rt[x * 2 + 1] 29 | 30 | - maps Logical port and data/control to a number: 31 | 0 => invalid (the program does not expect to receive this combination) 32 | - in the current implementation, this will result in sending the packet to CK_S 33 | 1 => send to first neighbour CK_R 34 | => send to ... neighbour CK_R 35 | => first channel assigned to the given CKR 36 | => ... channel assigned to the given CKR 37 | 38 | - neighbours are counted in the same way as for the CKS routing table 39 | 40 | # Channel distribution amongst kernels 41 | Channels are assigned to CKS/CKR in a round-robin fashion. 42 | Data and control hardware ports are combined (in this order) and then distributed. 
def copy_files(src_dir, dest_dir, files):
    """
    Copies device source files from the source directory to the output directory,
    keeping the relative path of every file.

    This is a generator: nothing is copied until it is iterated, and each file is
    copied right before its pair is produced.

    :param src_dir: directory the relative paths in `files` are resolved against
    :param dest_dir: root of the output tree; missing directories are created
    :param files: iterable of paths relative to `src_dir`
    :return: yields a tuple (src, dest) of paths for every copied file
    """
    for file in files:
        src_path = os.path.join(src_dir, file)
        dest_path = os.path.join(dest_dir, file)
        # The parent gets its own name. Rebinding dest_dir here used to make every
        # file that followed one in a subdirectory land inside that subdirectory.
        parent_dir = os.path.dirname(dest_path)
        if parent_dir:
            # An empty parent means the current directory, which already exists.
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copyfile(src_path, dest_path, follow_symlinks=True)
        yield (src_path, dest_path)
def rewrite(rewriter, file, include_dirs, log):
    """
    Runs the source rewriter on `file` and collects the SMI operations it reports.

    The rewriter is invoked as `<rewriter> <file>` followed by one
    `-extra-arg=-I<dir>` argument per include directory. Its stdout and stderr
    are written to `log`. Every non-empty stdout line is decoded as JSON, turned
    into an operation and has its buffer size transformed.

    Returns the operations in the order in which they were printed.
    """
    log.write("Rewriting {}".format(file))

    include_flags = ["-extra-arg=-I{}".format(path) for path in include_dirs]
    command = [rewriter, file] + include_flags

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout_text = result.stdout.decode()

    log.write("STDOUT\n{}".format(stdout_text))
    log.write("STDERR\n{}".format(result.stderr.decode()))

    def to_operation(text):
        payload = json.loads(text)
        operation = parse_smi_operation(payload)
        transform_buffer_size(payload, operation)
        return operation

    return [to_operation(text) for text in stdout_text.splitlines() if text]
def add_intra_fpga_connections(graph: Graph, fpgas: List[FPGA]):
    """
    Fully interconnects the channels of every FPGA inside the routing graph.

    For each ordered pair of distinct channels of the same FPGA an edge weighted
    with COST_INTRA_FPGA is added to `graph`.
    """
    for device in fpgas:
        ports = device.channels
        ordered_pairs = [(first, second)
                         for first in ports
                         for second in ports
                         if first is not second]
        for (first, second) in ordered_pairs:
            graph.add_edge(first, second, weight=COST_INTRA_FPGA)
def closest_path_to_fpga(paths, channel: Channel, target: FPGA):
    """
    Picks the path with the fewest elements from `channel` to any channel of `target`.

    `paths[channel]` maps every reachable destination channel to the path that
    leads to it. Among the destinations located on `target`, the path with the
    smallest length is returned; on a tie the first one encountered is kept.

    Raises NoRouteFound if no destination belongs to `target`.
    """
    reachable = paths[channel]
    candidates = [reachable[endpoint] for endpoint in reachable if endpoint.fpga == target]

    if len(candidates) == 0:
        raise NoRouteFound("No route found from {} to {}".format(channel, target))
    return min(candidates, key=len)
def serialize_to_array(table: List[int], bytes=1):
    """
    Serializes a routing table into a flat byte string.

    Every entry is encoded as an unsigned little-endian integer that occupies
    `bytes` bytes; the encoded entries are concatenated in table order.

    :param table: routing targets, each within range(0, 256 ** bytes)
    :param bytes: width of one entry in bytes (the name shadows the builtin, but
                  it is part of the signature and may be passed by keyword)
    :return: a bytes object of length len(table) * bytes, empty for an empty table
    :raises ValueError: if an entry is negative or does not fit into the width
    """
    width = bytes
    chunks = []
    for position, target in enumerate(table):
        try:
            # int.to_bytes produces the same encoding directly, without building
            # and re-parsing a format token for every entry.
            chunks.append(int(target).to_bytes(width, "little"))
        except OverflowError as err:
            # bitstring reported this case with a ValueError subclass; keep callers
            # that catch ValueError working.
            raise ValueError("Routing table entry {} ({}) does not fit into {} byte(s)".format(
                position, target, width)) from err
    return b"".join(chunks)
def parse_routing_file(data: str, metadata_paths=None, ignore_programs=False) -> Tuple[Dict[Tuple[str, int], Tuple[str, int]], ProgramMapping]:
    """
    Parses a JSON routing description into channel connections and a program mapping.

    The "fpgas" object maps "<node>:<fpga>" keys to program names. The
    "connections" object maps "<node>:<fpga>:<channel>" endpoints to each other;
    the channel index is taken from the digits at the end of the channel name.

    :param data: JSON text of the routing file
    :param metadata_paths: paths of program metadata files; a program name is
        resolved to the path whose basename without extension equals that name
    :param ignore_programs: if true, no metadata file is read and every FPGA is
        mapped to None
    :return: tuple (connections, mapping). `connections` holds both directions of
        every link, keyed and valued by ("<node>:<fpga>", channel index).
    :raises KeyError: if a program name has no matching entry in `metadata_paths`
    :raises ValueError: if an endpoint does not consist of three ":"-separated
        parts or its channel name does not end with a number
    """
    path_index = {}
    for path in (metadata_paths or []):
        path_index[os.path.splitext(os.path.basename(path))[0]] = path

    routing = json.loads(data)

    # Each distinct program is parsed once and shared by all FPGAs that use it.
    program_cache = {}
    fpga_map = {}
    for (fpga, program_path) in routing.get("fpgas", {}).items():
        if program_path not in program_cache:
            if ignore_programs:
                program_cache[program_path] = None
            else:
                with open(path_index[program_path]) as pf:
                    program_cache[program_path] = parse_program(pf.read())

        fpga_map[fpga] = program_cache[program_path]

    mapping = ProgramMapping(list(program_cache.values()), fpga_map)

    # Only the trailing digits are matched. The previous pattern r".*(\d+)$" let the
    # greedy ".*" consume all digits but the last, so "ch12" was read as channel 2.
    channel_regex = re.compile(r"(\d+)$")

    def parse_endpoint(endpoint):
        node, fpga_name, channel = endpoint.split(":")
        match = channel_regex.search(channel)
        if match is None:
            raise ValueError("Channel name '{}' in '{}' does not end with a number".format(
                channel, endpoint))
        return ("{}:{}".format(node, fpga_name), int(match.group(1)))

    connections = {}
    for (src, dst) in routing.get("connections", {}).items():
        src, dst = parse_endpoint(src), parse_endpoint(dst)
        # A channel can take part in a single link only.
        assert src not in connections, "Channel {} is connected more than once".format(src)
        assert dst not in connections, "Channel {} is connected more than once".format(dst)
        connections[src] = dst
        connections[dst] = src

    return (connections, mapping)
7 | {% set logical_ports = program.logical_port_count %} 8 | char external_routing_table[{{ logical_ports }} /* logical port count */][2]; 9 | for (int i = 0; i < {{ logical_ports }}; i++) 10 | { 11 | for (int j = 0; j < 2; j++) 12 | { 13 | external_routing_table[i][j] = rt[i * 2 + j]; 14 | } 15 | } 16 | 17 | // QSFP + number of CK_Rs - 1 + CK_S 18 | const char num_sender = {{ channel_count + 1 }}; 19 | char sender_id = 0; 20 | SMI_Network_message message; 21 | 22 | char contiguous_reads = 0; 23 | while (1) 24 | { 25 | bool valid = false; 26 | switch (sender_id) 27 | { 28 | case 0: 29 | // QSFP 30 | message = read_channel_nb_intel(io_in_{{ channel.index }}, &valid); 31 | break; 32 | {% for ck_r in channel.neighbours() %} 33 | case {{ loop.index0 + 1 }}: 34 | // receive from CK_R_{{ ck_r }} 35 | message = read_channel_nb_intel(channels_interconnect_ck_r[{{ (channel_count - 1) * channel.index + loop.index0 }}], &valid); 36 | break; 37 | {% endfor %} 38 | case {{ channel_count }}: 39 | // receive from CK_S_{{ channel.index }} 40 | message = read_channel_nb_intel(channels_interconnect_ck_s_to_ck_r[{{ channel.index }}], &valid); 41 | break; 42 | } 43 | 44 | if (valid) 45 | { 46 | contiguous_reads++; 47 | char dest; 48 | if (GET_HEADER_DST(message.header) != rank) 49 | { 50 | dest = 0; 51 | } 52 | else dest = external_routing_table[GET_HEADER_PORT(message.header)][GET_HEADER_OP(message.header) == SMI_SYNCH]; 53 | 54 | switch (dest) 55 | { 56 | case 0: 57 | // send to CK_S_{{ channel.index }} 58 | write_channel_intel(channels_interconnect_ck_r_to_ck_s[{{ channel.index }}], message); 59 | break; 60 | {% for ck_r in channel.neighbours() %} 61 | case {{ loop.index0 + 1 }}: 62 | // send to CK_R_{{ ck_r }} 63 | write_channel_intel(channels_interconnect_ck_r[{{ (channel_count - 1) * ck_r + target_index(ck_r, channel.index) }}], message); 64 | break; 65 | {% endfor %} 66 | {% for (op, key) in program.get_channel_allocations_with_prefix(channel.index, "ckr") %} 67 | case {{ 
channel_count + loop.index0 }}: 68 | // send to {{ op }} 69 | write_channel_intel({{ op.get_channel(key) }}, message); 70 | break; 71 | {% endfor %} 72 | } 73 | } 74 | 75 | if (!valid || contiguous_reads == READS_LIMIT) 76 | { 77 | contiguous_reads = 0; 78 | sender_id++; 79 | if (sender_id == num_sender) 80 | { 81 | sender_id = 0; 82 | } 83 | } 84 | } 85 | } 86 | {%- endmacro %} 87 | -------------------------------------------------------------------------------- /codegen/templates/cks.cl: -------------------------------------------------------------------------------- 1 | {% import 'utils.cl' as utils %} 2 | 3 | {%- macro smi_cks(program, channel, channel_count, target_index) -%} 4 | __kernel void smi_kernel_cks_{{ channel.index }}(__global volatile char *restrict rt, const char num_ranks) 5 | { 6 | char external_routing_table[MAX_RANKS]; 7 | for (int i = 0; i < MAX_RANKS; i++) 8 | { 9 | if (i < num_ranks) 10 | { 11 | external_routing_table[i] = rt[i]; 12 | } 13 | } 14 | 15 | {% set allocations = program.get_channel_allocations_with_prefix(channel.index, "cks") %} 16 | // number of CK_S - 1 + CK_R + {{ allocations|length }} CKS hardware ports 17 | const char num_sender = {{ channel_count + allocations|length }}; 18 | char sender_id = 0; 19 | SMI_Network_message message; 20 | 21 | char contiguous_reads = 0; 22 | 23 | while (1) 24 | { 25 | bool valid = false; 26 | switch (sender_id) 27 | { 28 | {% for ck_s in channel.neighbours() %} 29 | case {{ loop.index0 }}: 30 | // receive from CK_S_{{ ck_s }} 31 | message = read_channel_nb_intel(channels_interconnect_ck_s[{{ (channel_count - 1) * channel.index + loop.index0 }}], &valid); 32 | break; 33 | {% endfor %} 34 | case {{ channel_count - 1 }}: 35 | // receive from CK_R_{{ channel.index }} 36 | message = read_channel_nb_intel(channels_interconnect_ck_r_to_ck_s[{{ channel.index }}], &valid); 37 | break; 38 | {% for (op, key) in allocations %} 39 | case {{ channel_count + loop.index0 }}: 40 | // receive from {{ op }} 41 | 
message = read_channel_nb_intel({{ op.get_channel(key) }}, &valid); 42 | break; 43 | {% endfor %} 44 | } 45 | 46 | if (valid) 47 | { 48 | contiguous_reads++; 49 | char idx = external_routing_table[GET_HEADER_DST(message.header)]; 50 | switch (idx) 51 | { 52 | case 0: 53 | // send to QSFP 54 | write_channel_intel(io_out_{{ channel.index }}, message); 55 | break; 56 | case 1: 57 | // send to CK_R_{{ channel.index }} 58 | write_channel_intel(channels_interconnect_ck_s_to_ck_r[{{ channel.index }}], message); 59 | break; 60 | {% for ck_s in channel.neighbours() %} 61 | case {{ 2 + loop.index0 }}: 62 | // send to CK_S_{{ ck_s }} 63 | write_channel_intel(channels_interconnect_ck_s[{{ (channel_count - 1) * ck_s + target_index(ck_s, channel.index) }}], message); 64 | break; 65 | {% endfor %} 66 | } 67 | } 68 | if (!valid || contiguous_reads == READS_LIMIT) 69 | { 70 | contiguous_reads = 0; 71 | sender_id++; 72 | if (sender_id == num_sender) 73 | { 74 | sender_id = 0; 75 | } 76 | } 77 | } 78 | } 79 | {%- endmacro %} 80 | -------------------------------------------------------------------------------- /codegen/templates/device.cl: -------------------------------------------------------------------------------- 1 | #include "smi/network_message.h" 2 | {% import 'utils.cl' as utils %} 3 | {% import 'ckr.cl' as smi_ckr %} 4 | {% import 'cks.cl' as smi_cks %} 5 | 6 | {% import 'push.cl' as smi_push %} 7 | {% import 'pop.cl' as smi_pop %} 8 | {% import 'bcast.cl' as smi_bcast %} 9 | {% import 'reduce.cl' as smi_reduce %} 10 | {% import 'scatter.cl' as smi_scatter %} 11 | {% import 'gather.cl' as smi_gather %} 12 | 13 | // the maximum number of consecutive reads that each CKs/CKr can do from the same channel 14 | #define READS_LIMIT {{ program.consecutive_read_limit }} 15 | // maximum number of ranks in the cluster 16 | #define MAX_RANKS {{ program.max_ranks }} 17 | {% if program.p2p_rendezvous %} 18 | //P2P communications use synchronization 19 | #define P2P_RENDEZVOUS 20 | {% 
else %} 21 | //P2P communications use eager transmission protocol 22 | {% endif %} 23 | 24 | // QSFP channels 25 | #ifndef SMI_EMULATION_RANK 26 | {% for channel in channels %} 27 | channel SMI_Network_message io_out_{{ channel.index }} __attribute__((depth(16))) __attribute__((io("kernel_output_ch{{ channel.index }}"))); 28 | channel SMI_Network_message io_in_{{ channel.index }} __attribute__((depth(16))) __attribute__((io("kernel_input_ch{{ channel.index }}"))); 29 | {% endfor %} 30 | #else 31 | {% for fpga in fpgas %} 32 | #if SMI_EMULATION_RANK == {{ fpga.rank }} 33 | {% for channel in range(channels_per_fpga) %} 34 | channel SMI_Network_message io_out_{{ channel }} __attribute__((depth(16))) __attribute__((io("emulated_channel_{{ channel_name(fpga.channels[channel], true) }}"))); 35 | channel SMI_Network_message io_in_{{ channel }} __attribute__((depth(16))) __attribute__((io("emulated_channel_{{ channel_name(fpga.channels[channel], false) }}"))); 36 | {% endfor %} 37 | #endif 38 | {% endfor %} 39 | #endif 40 | 41 | {% for op in program.operations %} 42 | // {{ op }} 43 | {% for (channel, depth) in op.get_channel_defs(program.p2p_rendezvous) %} 44 | channel SMI_Network_message {{ channel }} __attribute__((depth({{ depth }}))); 45 | {% endfor %} 46 | {% endfor %} 47 | 48 | __constant char QSFP_COUNT = {{ channels_per_fpga }}; 49 | 50 | // connect all CK_S together 51 | channel SMI_Network_message channels_interconnect_ck_s[QSFP_COUNT*(QSFP_COUNT-1)] __attribute__((depth(16))); 52 | 53 | // connect all CK_R together 54 | channel SMI_Network_message channels_interconnect_ck_r[QSFP_COUNT*(QSFP_COUNT-1)] __attribute__((depth(16))); 55 | 56 | // connect corresponding CK_S/CK_R pairs 57 | channel SMI_Network_message channels_interconnect_ck_s_to_ck_r[QSFP_COUNT] __attribute__((depth(16))); 58 | 59 | // connect corresponding CK_R/CK_S pairs 60 | channel SMI_Network_message channels_interconnect_ck_r_to_ck_s[QSFP_COUNT] __attribute__((depth(16))); 61 | 62 | #include 
"smi/pop.h" 63 | #include "smi/push.h" 64 | #include "smi/bcast.h" 65 | #include "smi/reduce.h" 66 | #include "smi/scatter.h" 67 | #include "smi/gather.h" 68 | #include "smi/communicator.h" 69 | 70 | {% for channel in channels %} 71 | {{ smi_cks.smi_cks(program, channel, channels|length, target_index) }} 72 | {{ smi_ckr.smi_ckr(program, channel, channels|length, target_index) }} 73 | {% endfor %} 74 | 75 | {%- macro generate_op_impl(key, fn) %} 76 | {% for op in program.get_ops_by_type(key) %} 77 | {{ fn(program, op) }} 78 | {% endfor %} 79 | {%- endmacro %} 80 | 81 | // Push 82 | {{ generate_op_impl("push", smi_push.smi_push_channel) }} 83 | {{ generate_op_impl("push", smi_push.smi_push_impl) }} 84 | // Pop 85 | {{ generate_op_impl("pop", smi_pop.smi_pop_channel) }} 86 | {{ generate_op_impl("pop", smi_pop.smi_pop_impl) }} 87 | // Broadcast 88 | {{ generate_op_impl("broadcast", smi_bcast.smi_bcast_kernel) }} 89 | {{ generate_op_impl("broadcast", smi_bcast.smi_bcast_channel) }} 90 | {{ generate_op_impl("broadcast", smi_bcast.smi_bcast_impl) }} 91 | // Scatter 92 | {{ generate_op_impl("scatter", smi_scatter.smi_scatter_kernel) }} 93 | {{ generate_op_impl("scatter", smi_scatter.smi_scatter_channel) }} 94 | {{ generate_op_impl("scatter", smi_scatter.smi_scatter_impl) }} 95 | // Gather 96 | {{ generate_op_impl("gather", smi_gather.smi_gather_kernel) }} 97 | {{ generate_op_impl("gather", smi_gather.smi_gather_channel) }} 98 | {{ generate_op_impl("gather", smi_gather.smi_gather_impl) }} 99 | // Reduce 100 | {{ generate_op_impl("reduce", smi_reduce.smi_reduce_kernel) }} 101 | {{ generate_op_impl("reduce", smi_reduce.smi_reduce_channel) }} 102 | {{ generate_op_impl("reduce", smi_reduce.smi_reduce_impl) }} 103 | -------------------------------------------------------------------------------- /codegen/templates/host.cl: -------------------------------------------------------------------------------- 1 | #define __HOST_PROGRAM__ 2 | #include 3 | #include 4 | #include 5 | 6 | 
{% for (name, program) in programs -%} 7 | SMI_Comm SmiInit_{{ name }}( 8 | int rank, 9 | int ranks_count, 10 | const char* program_path, 11 | const char* routing_dir, 12 | cl::Platform &platform, 13 | cl::Device &device, 14 | cl::Context &context, 15 | cl::Program &program, 16 | int fpga, 17 | std::vector &buffers) 18 | { 19 | std::vector kernels; 20 | std::vector queues; 21 | std::vector kernel_names; 22 | 23 | // channel kernels 24 | {% for channel in range(program.channel_count) %} 25 | kernel_names.push_back("smi_kernel_cks_{{ channel }}"); 26 | kernel_names.push_back("smi_kernel_ckr_{{ channel }}"); 27 | {% endfor %} 28 | {%- macro generate_collective_kernels(key, kernel_name) %} 29 | {% set ops = program.get_ops_by_type(key) %} 30 | // {{ key }} kernels 31 | {% for op in ops %} 32 | kernel_names.push_back("{{ kernel_name }}_{{ op.logical_port }}"); 33 | {% endfor %} 34 | {%- endmacro %} 35 | 36 | {{ generate_collective_kernels("broadcast", "smi_kernel_bcast") }} 37 | {{ generate_collective_kernels("reduce", "smi_kernel_reduce") }} 38 | {{ generate_collective_kernels("scatter", "smi_kernel_scatter") }} 39 | {{ generate_collective_kernels("gather", "smi_kernel_gather") }} 40 | 41 | IntelFPGAOCLUtils::initEnvironment( 42 | platform, device, fpga, context, 43 | program, program_path, kernel_names, kernels, queues 44 | ); 45 | 46 | // create buffers for CKS/CKR 47 | const int ports = {{ program.logical_port_count }}; 48 | const int cks_table_size = ranks_count; 49 | const int ckr_table_size = ports * 2; 50 | {% for channel in range(program.channel_count) %} 51 | cl::Buffer routing_table_ck_s_{{ channel }}(context, CL_MEM_READ_ONLY, cks_table_size); 52 | cl::Buffer routing_table_ck_r_{{ channel }}(context, CL_MEM_READ_ONLY, ckr_table_size); 53 | {% endfor %} 54 | 55 | // load routing tables 56 | char routing_tables_cks[{{ program.channel_count}}][cks_table_size]; 57 | char routing_tables_ckr[{{ program.channel_count}}][ckr_table_size]; 58 | for (int i = 0; i < {{ 
program.channel_count }}; i++) 59 | { 60 | LoadRoutingTable(rank, i, cks_table_size, routing_dir, "cks", &routing_tables_cks[i][0]); 61 | LoadRoutingTable(rank, i, ckr_table_size, routing_dir, "ckr", &routing_tables_ckr[i][0]); 62 | } 63 | 64 | {% for channel in range(program.channel_count) %} 65 | queues[0].enqueueWriteBuffer(routing_table_ck_s_{{ channel }}, CL_TRUE, 0, cks_table_size, &routing_tables_cks[{{ channel }}][0]); 66 | queues[0].enqueueWriteBuffer(routing_table_ck_r_{{ channel }}, CL_TRUE, 0, ckr_table_size, &routing_tables_ckr[{{ channel }}][0]); 67 | {% endfor %} 68 | 69 | char char_ranks_count=ranks_count; 70 | char char_rank=rank; 71 | {% set ctx = namespace(kernel=0) %} 72 | {% for channel in range(program.channel_count) %} 73 | // cks_{{ channel }} 74 | kernels[{{ ctx.kernel }}].setArg(0, sizeof(cl_mem), &routing_table_ck_s_{{ channel }}); 75 | kernels[{{ ctx.kernel }}].setArg(1, sizeof(char), &char_ranks_count); 76 | 77 | // ckr_{{ channel }} 78 | {% set ctx.kernel = ctx.kernel + 1 %} 79 | kernels[{{ ctx.kernel }}].setArg(0, sizeof(cl_mem), &routing_table_ck_r_{{ channel }}); 80 | kernels[{{ ctx.kernel }}].setArg(1, sizeof(char), &char_rank); 81 | {% set ctx.kernel = ctx.kernel + 1 %} 82 | {% endfor %} 83 | 84 | {%- macro setup_collective_kernels(key) %} 85 | {% set ops = program.get_ops_by_type(key) %} 86 | {% for op in ops %} 87 | // {{ key }} {{ op.logical_port }} 88 | kernels[{{ ctx.kernel }}].setArg(0, sizeof(char), &char_ranks_count); 89 | {% set ctx.kernel = ctx.kernel + 1 %} 90 | {% endfor %} 91 | {%- endmacro %} 92 | {{ setup_collective_kernels("broadcast") }} 93 | {{ setup_collective_kernels("reduce") }} 94 | {{ setup_collective_kernels("scatter") }} 95 | {{ setup_collective_kernels("gather") }} 96 | 97 | // move buffers 98 | {% for channel in range(program.channel_count) %} 99 | buffers.push_back(std::move( routing_table_ck_s_{{ channel }})); 100 | buffers.push_back(std::move( routing_table_ck_r_{{ channel }})); 101 | {% endfor %} 102 
| 103 | // start the kernels 104 | const int num_kernels = kernel_names.size(); 105 | for (int i = num_kernels - 1; i >= 0; i--) 106 | { 107 | queues[i].enqueueTask(kernels[i]); 108 | queues[i].flush(); 109 | } 110 | 111 | // return the communicator 112 | SMI_Comm comm{ char_rank, char_ranks_count }; 113 | return comm; 114 | 115 | } 116 | {% endfor %} 117 | -------------------------------------------------------------------------------- /codegen/templates/host_hlslib.cl: -------------------------------------------------------------------------------- 1 | #define __HOST_PROGRAM__ 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | {% for (name, program) in programs -%} 8 | SMI_Comm SmiInit_{{ name }}( 9 | int rank, 10 | int ranks_count, 11 | const char* routing_dir, 12 | hlslib::ocl::Context &context, 13 | hlslib::ocl::Program &program, 14 | std::vector> &buffers) 15 | { 16 | 17 | const int ports = {{ program.logical_port_count }}; 18 | const int cks_table_size = ranks_count; 19 | const int ckr_table_size = ports * 2; 20 | // load routing tables 21 | std::vector> routing_tables_ckr({{ program.channel_count}}, std::vector(ckr_table_size)); 22 | std::vector> routing_tables_cks({{ program.channel_count}}, std::vector(cks_table_size)); 23 | for (int i = 0; i < {{ program.channel_count }}; i++) 24 | { 25 | LoadRoutingTable(rank, i, cks_table_size, routing_dir, "cks", &routing_tables_cks[i][0]); 26 | LoadRoutingTable(rank, i, ckr_table_size, routing_dir, "ckr", &routing_tables_ckr[i][0]); 27 | } 28 | 29 | // create buffers for CKS/CKR and copy routing tables 30 | {% for channel in range(program.channel_count) %} 31 | hlslib::ocl::Buffer routing_table_device_ck_s_{{ channel }} = 32 | context.MakeBuffer( routing_tables_cks[{{channel}}].cbegin(), 33 | routing_tables_cks[{{channel}}].cend()); 34 | 35 | hlslib::ocl::Buffer routing_table_device_ck_r_{{ channel }} = 36 | context.MakeBuffer( routing_tables_ckr[{{channel}}].cbegin(), 37 | 
routing_tables_ckr[{{channel}}].cend()); 38 | {% endfor %} 39 | 40 | 41 | char char_ranks_count=ranks_count; 42 | char char_rank=rank; 43 | 44 | // CK kernels 45 | std::vector comm_kernels; 46 | {% for channel in range(program.channel_count) %} 47 | // cks_{{ channel }} 48 | comm_kernels.emplace_back(program.MakeKernel("smi_kernel_cks_{{ channel }}", routing_table_device_ck_s_{{channel}}, (char)char_ranks_count)); 49 | 50 | // ckr_{{ channel }} 51 | comm_kernels.emplace_back(program.MakeKernel("smi_kernel_ckr_{{ channel }}", routing_table_device_ck_r_{{ channel }}, (char)char_rank)); 52 | 53 | {% endfor %} 54 | 55 | // Collective kernels 56 | std::vector collective_kernels; 57 | {%- macro generate_collective_kernels(key, kernel_name) %} 58 | {% set ops = program.get_ops_by_type(key) %} 59 | {% for op in ops %} 60 | // {{ key }} {{ op.logical_port }} 61 | collective_kernels.emplace_back(program.MakeKernel("{{ kernel_name }}_{{ op.logical_port }}", (char)char_ranks_count)); 62 | {% endfor %} 63 | {%- endmacro %} 64 | 65 | {{ generate_collective_kernels("broadcast", "smi_kernel_bcast") }} 66 | {{ generate_collective_kernels("reduce", "smi_kernel_reduce") }} 67 | {{ generate_collective_kernels("scatter", "smi_kernel_scatter") }} 68 | {{ generate_collective_kernels("gather", "smi_kernel_gather") }} 69 | 70 | // start the kernels 71 | for (auto &k : comm_kernels) { 72 | // Will never terminate, so we don't care about the return value of fork 73 | k.ExecuteTaskFork(); 74 | } 75 | 76 | for (auto &k : collective_kernels) { 77 | // Will never terminate, so we don't care about the return value of fork 78 | k.ExecuteTaskFork(); 79 | } 80 | 81 | //move created buffers to the vector given my the user 82 | {% for channel in range(program.channel_count) %} 83 | buffers.push_back(std::move(routing_table_device_ck_s_{{channel}})); 84 | buffers.push_back(std::move(routing_table_device_ck_r_{{channel}})); 85 | {% endfor %} 86 | 87 | // return the communicator 88 | SMI_Comm comm{ 
{% import 'utils.cl' as utils %}

{%- macro smi_pop_impl(program, op) -%}
// Pops one data element from the receive channel `chan` into `data`.
// A new network packet is read only when the previous one has been fully consumed.
void {{ utils.impl_name_port_type("SMI_Pop", op) }}(SMI_Channel *chan, void *data)
{
    // in this case we have to copy the data into the target variable
    if (chan->packet_element_id == 0)
    {
        // no data to be unpacked...receive from the network
        chan->net = read_channel_intel({{ op.get_channel("ckr_data") }});
    }
    chan->processed_elements++;
    char *data_recvd = chan->net.data;

    // Copy the element at position packet_element_id out of the packet payload.
    // Both loops have compile-time bounds so that they can be fully unrolled.
    #pragma unroll
    for (int ee = 0; ee < {{ op.data_elements_per_packet() }}; ee++)
    {
        if (ee == chan->packet_element_id)
        {
            #pragma unroll
            for (int jj = 0; jj < {{ op.data_size() }}; jj++)
            {
                ((char *)data)[jj] = data_recvd[(ee * {{ op.data_size() }}) + jj];
            }
        }
    }

    // Advance inside the packet; wrap around once every element it carries was consumed.
    chan->packet_element_id++;
    if (chan->packet_element_id == GET_HEADER_NUM_ELEMS(chan->net.header))
    {
        chan->packet_element_id = 0;
    }
    // TODO: This is used to prevent the compiler from re-ordering the two *_channel_intel operations
    // mem_fence(CLK_CHANNEL_MEM_FENCE);
#if defined P2P_RENDEZVOUS
    // exchange tokens
    chan->tokens--;
    if (chan->tokens == 0)
    {
        // At this point, the sender has still max_tokens*7/8 tokens: we have to consider this while we send
        // the new tokens to it
        unsigned int sender = ((int) ((int) chan->message_size - (int) chan->processed_elements - (int) chan->max_tokens * 7 / 8)) < 0 ? 0: chan->message_size - chan->processed_elements - chan -> max_tokens * 7 / 8;
        chan->tokens = (unsigned int) (MIN(chan->max_tokens / 8, sender)); // b/2

        // Send the newly granted tokens back to the sender as a SMI_SYNCH control message.
        SMI_Network_message mess;
        *(unsigned int*) mess.data = chan->tokens;
        SET_HEADER_DST(mess.header, chan->sender_rank);
        SET_HEADER_PORT(mess.header, chan->port);
        SET_HEADER_OP(mess.header, SMI_SYNCH);
        write_channel_intel({{ op.get_channel("cks_control") }}, mess);
    }
#endif
}
{%- endmacro %}

{%- macro smi_pop_channel(program, op) -%}
// Builds the descriptor of a receive channel for `count` elements of `data_type`
// sent by rank `source` on logical port `port`.
SMI_Channel {{ utils.impl_name_port_type("SMI_Open_receive_channel", op) }}(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm)
{
    SMI_Channel chan;
    // setup channel descriptor
    chan.port = (char) port;
    chan.sender_rank = (char) source;
    chan.message_size = (unsigned int) count;
    chan.data_type = data_type;
    chan.op_type = SMI_RECEIVE;
    chan.elements_per_packet = {{ op.data_elements_per_packet() }};
    chan.max_tokens = {{ op.buffer_size * op.data_elements_per_packet() }};

    // With rendezvous, the receiver sends tokens to the sender once every
    // chan.max_tokens/8 received data elements.
#if defined P2P_RENDEZVOUS
    chan.tokens = MIN(chan.max_tokens / ((unsigned int) 8), count); // needed to prevent the compiler to optimize-away channel connections
#else
    chan.tokens = count; // in this way, the last rendezvous is done at the end of the message. This is needed to prevent the compiler to cut-away internal FIFO buffer connections
#endif
    SET_HEADER_NUM_ELEMS(chan.net.header, 0); // at the beginning no data
    chan.packet_element_id = 0; // data per packet
    chan.processed_elements = 0;
    // Only comm[0] is read from the communicator here
    // (presumably the local rank; confirm against smi/communicator.h).
    chan.receiver_rank = comm[0];
    return chan;
}
{%- endmacro -%}
chan->tokens += tokens; // tokens 30 | } 31 | #endif 32 | } 33 | void {{ utils.impl_name_port_type("SMI_Push", op) }}(SMI_Channel *chan, void* data) 34 | { 35 | {{ utils.impl_name_port_type("SMI_Push_flush", op) }}(chan, data, 0); 36 | } 37 | {%- endmacro %} 38 | 39 | {%- macro smi_push_channel(program, op) -%} 40 | SMI_Channel {{ utils.impl_name_port_type("SMI_Open_send_channel", op) }}(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm) 41 | { 42 | SMI_Channel chan; 43 | // setup channel descriptor 44 | chan.port = (char) port; 45 | chan.message_size = (unsigned int) count; 46 | chan.data_type = data_type; 47 | chan.op_type = SMI_SEND; 48 | chan.receiver_rank = (char) destination; 49 | // At the beginning, the sender can sends as many data items as the buffer size 50 | // in the receiver allows 51 | chan.elements_per_packet = {{ op.data_elements_per_packet() }}; 52 | chan.max_tokens = {{ op.buffer_size * op.data_elements_per_packet() }}; 53 | 54 | // setup header for the message 55 | SET_HEADER_DST(chan.net.header, chan.receiver_rank); 56 | SET_HEADER_PORT(chan.net.header, chan.port); 57 | SET_HEADER_OP(chan.net.header, SMI_SEND); 58 | #if defined P2P_RENDEZVOUS 59 | chan.tokens = MIN(chan.max_tokens, count); // needed to prevent the compiler to optimize-away channel connections 60 | #else // eager transmission protocol 61 | chan.tokens = count; // in this way, the last rendezvous is done at the end of the message. 
"""Shared pytest fixtures and helpers for the codegen test-suite."""
import json
import os
import shutil
import subprocess
import sys
from typing import Union, List

import pytest
from networkx import Graph

# The project modules live one directory above this file. Extend sys.path
# *before* importing any of them so the import does not depend on the
# directory pytest happens to be started from.
sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))

from common import RoutingContext
from ops import SmiOperation
from serialization import parse_smi_operation
from program import Channel, ProgramMapping, Program
from routing import create_routing_context

# Absolute paths: the fixtures below change the working directory, so
# relative paths derived from __file__ would stop resolving.
PYTEST_DIR = os.path.dirname(os.path.abspath(__file__))
WORK_DIR = os.path.join(PYTEST_DIR, "work")
DATA_DIR = os.path.join(PYTEST_DIR, "data")

ROOT_DIR = os.path.dirname(os.path.dirname(PYTEST_DIR))
# Directory containing the `rewriter` binary; override with $REWRITER_DIR.
REWRITER_BUILD_DIR = os.environ.get(
    "REWRITER_DIR", os.path.join(ROOT_DIR, "build", "rewriter")
)


def prepare():
    """Prepare the working directory and make it the current directory.

    If the directory exists, it is emptied;
    if it does not exist, it is created.
    """
    if os.path.isdir(WORK_DIR):
        # Make everything accessible first so that read-only leftovers
        # from a previous run can be deleted.
        for root, dirs, files in os.walk(WORK_DIR):
            for name in dirs + files:
                os.chmod(os.path.join(root, name), 0o700)
        for item in os.listdir(WORK_DIR):
            path = os.path.join(WORK_DIR, item)
            if os.path.isfile(path):
                os.unlink(path)
            else:
                shutil.rmtree(path)
    else:
        os.makedirs(WORK_DIR)
    os.chdir(WORK_DIR)


def get_routing_ctx(program: Program, connections) -> RoutingContext:
    """Build a routing context in which every FPGA runs ``program``.

    ``connections`` maps ``(fpga, channel)`` endpoints to their peer
    endpoint. NOTE: the mapping is extended in place with the reverse
    direction of every link before it is passed on.
    """
    fpgas = (tuple(fpga for (fpga, _) in connections.keys())
             + tuple(fpga for (fpga, _) in connections.values()))
    fpga_map = {fpga: program for fpga in fpgas}

    # Iterate over a snapshot because the mapping is mutated in the loop.
    for (src, dst) in dict(connections).items():
        connections[dst] = src

    mapping = ProgramMapping([program], fpga_map)
    return create_routing_context(connections, mapping)


def get_channel(graph: Graph, key: str, index: int) -> Union[Channel, None]:
    """Return the channel node of FPGA ``key`` with the given ``index``.

    Returns ``None`` when the graph contains no such channel.
    """
    for channel in graph.nodes:
        if channel.fpga.key() == key and channel.index == index:
            return channel
    return None


def get_data(path: str) -> str:
    """Return the path of a file inside the test data directory."""
    return os.path.join(DATA_DIR, path)


class FileTester:
    """Compares generated text with a reference file from the data directory."""

    def check(self, path: str, content: str):
        """Assert that ``content`` equals the content of data file ``path``.

        On mismatch, ``content`` is first written to ``<basename>.fail`` in
        the current directory so that it can be inspected afterwards.
        """
        with open(get_data(path)) as f:
            file_content = f.read()

        if file_content != content:
            with open("{}.fail".format(os.path.basename(path)), "w") as f:
                f.write(content)
        assert file_content == content


@pytest.fixture(autouse=True, scope="function")
def file_tester():
    """Provide a :class:`FileTester` running inside a clean work directory."""
    prepare()
    yield FileTester()


class RewriteTester:
    """Runs the source rewriter on a data file and checks its results."""

    def check(self, path: str, operations: List[SmiOperation]):
        """Rewrite ``<path>.cl`` and compare against the expectations.

        The operations printed by the rewriter (one JSON document per
        stdout line) must equal ``operations`` and the rewritten file must
        equal ``<path>-expected.cl``. The work directory is removed on
        success; on failure it is kept and the rewriter's stderr is printed.
        """
        orig_file = get_data("{}.cl".format(path))
        work_file = os.path.join(WORK_DIR, "{}.cl".format(path))
        expected_file = get_data("{}-expected.cl".format(path))

        shutil.copyfile(orig_file, work_file)

        # `result` stays None if the rewriter could not even be started, so
        # the cleanup below does not mask that error with a NameError.
        result = None
        ok = False
        try:
            result = subprocess.run([
                os.path.join(REWRITER_BUILD_DIR, "rewriter"),
                "-extra-arg=-I{}".format(os.path.join(ROOT_DIR, "include")),
                work_file
            ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout = result.stdout.decode()
            parsed_ops = [
                parse_smi_operation(json.loads(line))
                for line in stdout.splitlines() if line
            ]
            assert parsed_ops == operations

            with open(expected_file) as expected:
                with open(work_file) as work:
                    assert work.read() == expected.read()

            ok = True
        finally:
            if ok:
                shutil.rmtree(WORK_DIR, ignore_errors=True)
            elif result is not None:
                print(result.stderr.decode())


@pytest.fixture(autouse=True, scope="function")
def rewrite_tester():
    """Provide a :class:`RewriteTester` running inside a clean work directory."""
    prepare()
    yield RewriteTester()
i++) 9 | { 10 | SMI_Channel chan_send = SMI_Open_send_channel_ad(1, SMI_INT, dst, 0, comm, 128); 11 | SMI_Push(&chan_send, &i); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /codegen/tests/data/complex-expected.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | void SMI_Gather_5_int(SMI_GatherChannel* chan, void* send_data, void* rcv_data); 6 | SMI_GatherChannel SMI_Open_gather_channel_5_int(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm); 7 | void SMI_Scatter_4_short(SMI_ScatterChannel* chan, void* data_snd, void* data_rcv); 8 | SMI_ScatterChannel SMI_Open_scatter_channel_4_short(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm); 9 | void SMI_Reduce_3_float(SMI_RChannel* chan, void* data_snd, void* data_rcv); 10 | SMI_RChannel SMI_Open_reduce_channel_3_float(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm); 11 | SMI_BChannel SMI_Open_bcast_channel_2_int(int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm); 12 | void SMI_Pop_1_double(SMI_Channel* chan, void* data); 13 | SMI_Channel SMI_Open_receive_channel_1_double(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm); 14 | void SMI_Push_0_char(SMI_Channel* chan, void* data); 15 | SMI_Channel SMI_Open_send_channel_0_char(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 16 | __kernel void app_0(const int N, const char dst) 17 | { 18 | SMI_Comm comm; 19 | 20 | #pragma unroll 21 | for (int i = 0; i < N; i++) 22 | { 23 | float16 var1; 24 | uint var2; 25 | // push 26 | SMI_Channel chan_send = SMI_Open_send_channel_0_char(1, SMI_CHAR, dst, 0, comm); 27 | SMI_Push_0_char(&chan_send, &var1); 28 | 29 | // pop 30 | SMI_Channel chan_recv = SMI_Open_receive_channel_1_double(1, 
SMI_DOUBLE, dst, 1, comm); 31 | SMI_Pop_1_double(&chan_recv, &var2); 32 | 33 | // broadcast 34 | SMI_BChannel chan_bcast = SMI_Open_bcast_channel_2_int(1, SMI_INT, 2, 1, comm); 35 | SMI_Bcast(&chan_bcast, &i, &i); 36 | 37 | // reduce 38 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel_3_float(1, SMI_FLOAT, SMI_ADD, 3, 1, comm); 39 | SMI_Reduce_3_float(&chan_reduce, &i, &i); 40 | 41 | // scatter 42 | SMI_ScatterChannel chan_scatter = SMI_Open_scatter_channel_4_short(1, 1, SMI_SHORT, 4, 1, comm); 43 | SMI_Scatter_4_short(&chan_scatter, &i, &i); 44 | 45 | // gather 46 | SMI_GatherChannel chan_gather = SMI_Open_gather_channel_5_int(1, 1, SMI_INT, 5, 1, comm); 47 | SMI_Gather_5_int(&chan_gather, &i, &i); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /codegen/tests/data/complex.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | __kernel void app_0(const int N, const char dst) 6 | { 7 | SMI_Comm comm; 8 | 9 | #pragma unroll 10 | for (int i = 0; i < N; i++) 11 | { 12 | float16 var1; 13 | uint var2; 14 | // push 15 | SMI_Channel chan_send = SMI_Open_send_channel(1, SMI_CHAR, dst, 0, comm); 16 | SMI_Push(&chan_send, &var1); 17 | 18 | // pop 19 | SMI_Channel chan_recv = SMI_Open_receive_channel(1, SMI_DOUBLE, dst, 1, comm); 20 | SMI_Pop(&chan_recv, &var2); 21 | 22 | // broadcast 23 | SMI_BChannel chan_bcast = SMI_Open_bcast_channel(1, SMI_INT, 2, 1, comm); 24 | SMI_Bcast(&chan_bcast, &i, &i); 25 | 26 | // reduce 27 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel(1, SMI_FLOAT, SMI_ADD, 3, 1, comm); 28 | SMI_Reduce(&chan_reduce, &i, &i); 29 | 30 | // scatter 31 | SMI_ScatterChannel chan_scatter = SMI_Open_scatter_channel(1, 1, SMI_SHORT, 4, 1, comm); 32 | SMI_Scatter(&chan_scatter, &i, &i); 33 | 34 | // gather 35 | SMI_GatherChannel chan_gather = SMI_Open_gather_channel(1, 1, SMI_INT, 5, 1, comm); 36 | 
SMI_Gather(&chan_gather, &i, &i); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /codegen/tests/data/constant-variable-expected.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | void SMI_Push_5_int(SMI_Channel* chan, void* data); 6 | SMI_Channel SMI_Open_send_channel_5_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 7 | __kernel void app_0(const int N, const char dst) 8 | { 9 | SMI_Comm comm; 10 | for (int i = 0; i < N; i++) 11 | { 12 | const int port = 5; 13 | SMI_Channel chan_send1 = SMI_Open_send_channel_5_int(1, SMI_INT, dst, port, comm); 14 | SMI_Push_5_int(&chan_send1, &i); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /codegen/tests/data/constant-variable.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | __kernel void app_0(const int N, const char dst) 6 | { 7 | SMI_Comm comm; 8 | for (int i = 0; i < N; i++) 9 | { 10 | const int port = 5; 11 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, port, comm); 12 | SMI_Push(&chan_send1, &i); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /codegen/tests/data/data-type-expected.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | void SMI_Pop_4_int(SMI_Channel* chan, void* data); 6 | SMI_Channel SMI_Open_receive_channel_4_int(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm); 7 | void SMI_Pop_3_float(SMI_Channel* chan, void* data); 8 | SMI_Channel SMI_Open_receive_channel_3_float(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm); 
9 | void SMI_Pop_1_double(SMI_Channel* chan, void* data); 10 | SMI_Channel SMI_Open_receive_channel_1_double(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm); 11 | void SMI_Push_0_char(SMI_Channel* chan, void* data); 12 | SMI_Channel SMI_Open_send_channel_0_char(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 13 | __kernel void app_0(const int N, const char dst) 14 | { 15 | SMI_Comm comm; 16 | for (int i = 0; i < N; i++) 17 | { 18 | SMI_Channel chan_send = SMI_Open_send_channel_0_char(1, SMI_CHAR, dst, 0, comm); 19 | SMI_Push_0_char(&chan_send, &i); 20 | SMI_Channel chan_recv = SMI_Open_receive_channel_1_double(1, SMI_DOUBLE, dst, 1, comm); 21 | SMI_Pop_1_double(&chan_recv, &i); 22 | SMI_Channel chan_recv1 = SMI_Open_receive_channel_3_float(1, SMI_FLOAT, dst, 3, comm); 23 | SMI_Pop_3_float(&chan_recv1, &i); 24 | SMI_Channel chan_recv2 = SMI_Open_receive_channel_4_int(1, SMI_INT, dst, 4, comm); 25 | SMI_Pop_4_int(&chan_recv2, &i); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /codegen/tests/data/data-type.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | __kernel void app_0(const int N, const char dst) 6 | { 7 | SMI_Comm comm; 8 | for (int i = 0; i < N; i++) 9 | { 10 | SMI_Channel chan_send = SMI_Open_send_channel(1, SMI_CHAR, dst, 0, comm); 11 | SMI_Push(&chan_send, &i); 12 | SMI_Channel chan_recv = SMI_Open_receive_channel(1, SMI_DOUBLE, dst, 1, comm); 13 | SMI_Pop(&chan_recv, &i); 14 | SMI_Channel chan_recv1 = SMI_Open_receive_channel(1, SMI_FLOAT, dst, 3, comm); 15 | SMI_Pop(&chan_recv1, &i); 16 | SMI_Channel chan_recv2 = SMI_Open_receive_channel(1, SMI_INT, dst, 4, comm); 17 | SMI_Pop(&chan_recv2, &i); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- 
/codegen/tests/data/kernel-attribute-expected.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | void SMI_Push_0_int(SMI_Channel* chan, void* data); 6 | SMI_Channel SMI_Open_send_channel_0_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 7 | __kernel void app_0(const int N, const char dst) 8 | { 9 | SMI_Comm comm; 10 | for (int i = 0; i < N; i++) 11 | { 12 | SMI_Channel chan_send1 = SMI_Open_send_channel_0_int(1, SMI_INT, dst, 0, comm); 13 | SMI_Push_0_int(&chan_send1, &i); 14 | } 15 | } 16 | 17 | void SMI_Push_1_int(SMI_Channel* chan, void* data); 18 | SMI_Channel SMI_Open_send_channel_1_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 19 | kernel void app_1(const int N, const char dst) 20 | { 21 | SMI_Comm comm; 22 | for (int i = 0; i < N; i++) 23 | { 24 | SMI_Channel chan_send1 = SMI_Open_send_channel_1_int(1, SMI_INT, dst, 1, comm); 25 | SMI_Push_1_int(&chan_send1, &i); 26 | } 27 | } 28 | 29 | kernelx void app_2(const int N, const char dst) 30 | { 31 | SMI_Comm comm; 32 | for (int i = 0; i < N; i++) 33 | { 34 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 2, comm); 35 | SMI_Push(&chan_send1, &i); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /codegen/tests/data/kernel-attribute.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | __kernel void app_0(const int N, const char dst) 6 | { 7 | SMI_Comm comm; 8 | for (int i = 0; i < N; i++) 9 | { 10 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 0, comm); 11 | SMI_Push(&chan_send1, &i); 12 | } 13 | } 14 | 15 | kernel void app_1(const int N, const char dst) 16 | { 17 | SMI_Comm comm; 18 | for (int i = 0; i < N; i++) 19 | { 20 | SMI_Channel 
chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 1, comm); 21 | SMI_Push(&chan_send1, &i); 22 | } 23 | } 24 | 25 | kernelx void app_2(const int N, const char dst) 26 | { 27 | SMI_Comm comm; 28 | for (int i = 0; i < N; i++) 29 | { 30 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 2, comm); 31 | SMI_Push(&chan_send1, &i); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /codegen/tests/data/port-expected.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | void SMI_Pop_3_int(SMI_Channel* chan, void* data); 6 | SMI_Channel SMI_Open_receive_channel_3_int(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm); 7 | void SMI_Push_0_int(SMI_Channel* chan, void* data); 8 | SMI_Channel SMI_Open_send_channel_0_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 9 | __kernel void app_0(const int N, const char dst) 10 | { 11 | SMI_Comm comm; 12 | for (int i = 0; i < N; i++) 13 | { 14 | SMI_Channel chan_send = SMI_Open_send_channel_0_int(1, SMI_INT, dst, 0, comm); 15 | SMI_Push_0_int(&chan_send, &i); 16 | SMI_Channel chan_recv = SMI_Open_receive_channel_3_int(1, SMI_INT, dst, 3, comm); 17 | SMI_Pop_3_int(&chan_recv, &i); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /codegen/tests/data/port.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | __kernel void app_0(const int N, const char dst) 6 | { 7 | SMI_Comm comm; 8 | for (int i = 0; i < N; i++) 9 | { 10 | SMI_Channel chan_send = SMI_Open_send_channel(1, SMI_INT, dst, 0, comm); 11 | SMI_Push(&chan_send, &i); 12 | SMI_Channel chan_recv = SMI_Open_receive_channel(1, SMI_INT, dst, 3, comm); 13 | SMI_Pop(&chan_recv, &i); 14 | } 15 | } 16 | 
-------------------------------------------------------------------------------- /codegen/tests/data/reduce-expected.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | void SMI_Reduce_2_int(SMI_RChannel* chan, void* data_snd, void* data_rcv); 6 | SMI_RChannel SMI_Open_reduce_channel_2_int(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm); 7 | void SMI_Reduce_1_int(SMI_RChannel* chan, void* data_snd, void* data_rcv); 8 | SMI_RChannel SMI_Open_reduce_channel_1_int(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm); 9 | void SMI_Reduce_0_int(SMI_RChannel* chan, void* data_snd, void* data_rcv); 10 | SMI_RChannel SMI_Open_reduce_channel_0_int(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm); 11 | __kernel void app_0(const int N, const char dst) 12 | { 13 | SMI_Comm comm; 14 | for (int i = 0; i < N; i++) 15 | { 16 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel_0_int(1, SMI_INT, SMI_ADD, 0, 1, comm); 17 | SMI_Reduce_0_int(&chan_reduce, &i, &i); 18 | 19 | SMI_RChannel chan_reduce1 = SMI_Open_reduce_channel_1_int(1, SMI_INT, SMI_MIN, 1, 1, comm); 20 | SMI_Reduce_1_int(&chan_reduce1, &i, &i); 21 | 22 | SMI_RChannel chan_reduce2 = SMI_Open_reduce_channel_2_int(1, SMI_INT, SMI_MAX, 2, 1, comm); 23 | SMI_Reduce_2_int(&chan_reduce2, &i, &i); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /codegen/tests/data/reduce.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_intel_channels : enable 2 | 3 | #include 4 | 5 | __kernel void app_0(const int N, const char dst) 6 | { 7 | SMI_Comm comm; 8 | for (int i = 0; i < N; i++) 9 | { 10 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel(1, SMI_INT, SMI_ADD, 0, 1, comm); 11 | SMI_Reduce(&chan_reduce, &i, &i); 12 
def test_codegen_device(file_tester):
    """Generate the device header for the first FPGA and compare it with `smi-device-1.h`.

    One program mixing every operation kind (push, pop, broadcast, reduce,
    scatter, gather) is mapped onto four FPGAs joined by four links.
    """
    operations = [
        Push(0, "short", 8),
        Pop(0),
        Push(1),
        Pop(2, "char", 8),
        Broadcast(3, "float", 64),
        Broadcast(4, "int"),
        Push(5, "double", 32),
        Reduce(6, "float", 16, "add"),
        Scatter(7, "double"),
        Gather(8, "char"),
    ]
    program = Program(operations)

    # Every FPGA runs the same program; the insertion order of the keys is kept.
    fpga_names = ("n1:f1", "n1:f2", "n2:f1", "n3:f1")
    mapping = ProgramMapping([program], {name: program for name in fpga_names})

    # (fpga, channel) -> (fpga, channel) links between the devices.
    connections = {
        ("n1:f1", 0): ("n1:f2", 0),
        ("n1:f2", 1): ("n2:f1", 1),
        ("n1:f2", 2): ("n3:f1", 1),
        ("n2:f1", 0): ("n1:f1", 1),
    }

    ctx = create_routing_context(connections, mapping)
    first_fpga = ctx.fpgas[0]
    generated = generate_program_device(first_fpga, ctx.fpgas, ctx.graph, 4)

    file_tester.check("smi-device-1.h", generated)
-------------------------------------------------------------------------------- 1 | from ops import Push, Pop, Broadcast, Reduce 2 | from serialization import parse_program, parse_routing_file 3 | 4 | 5 | def test_parse_program(): 6 | program = parse_program(""" 7 | { 8 | "consecutive_reads": 16, 9 | "max_ranks": 16, 10 | "p2p_randezvous": false, 11 | "operations": [{ 12 | "port": 0, 13 | "type": "push", 14 | "data_type": "int" 15 | }, { 16 | "port": 1, 17 | "type": "push", 18 | "data_type": "char" 19 | }, { 20 | "port": 2, 21 | "type": "pop" 22 | }, { 23 | "port": 3, 24 | "type": "broadcast", 25 | "data_type": "int" 26 | }, { 27 | "port": 4, 28 | "type": "reduce", 29 | "data_type": "float", 30 | "args": { 31 | "op_type": "add" 32 | } 33 | }, { 34 | "port": 5, 35 | "type": "scatter", 36 | "data_type": "short" 37 | }, { 38 | "port": 6, 39 | "type": "gather", 40 | "data_type": "double", 41 | "buffer_size": 32 42 | }] 43 | } 44 | """) 45 | assert program.consecutive_read_limit == 16 46 | assert program.max_ranks == 16 47 | assert len(program.operations) == 7 48 | assert isinstance(program.operations[0], Push) 49 | assert program.operations[0].logical_port == 0 50 | assert isinstance(program.operations[2], Pop) 51 | assert program.operations[2].logical_port == 2 52 | 53 | assert isinstance(program.operations[3], Broadcast) 54 | assert isinstance(program.operations[4], Reduce) 55 | assert program.operations[4].data_type == "float" 56 | 57 | assert program.operations[6].buffer_size == 32 58 | 59 | 60 | def test_parse_connections(): 61 | (connections, _) = parse_routing_file(""" 62 | { 63 | "fpgas": {}, 64 | "connections": { 65 | "fpga-0015:acl0:ch0": "fpga-0016:acl0:ch0", 66 | "fpga-0015:acl0:ch1": "fpga-0015:acl1:ch1", 67 | "fpga-0015:acl0:ch2": "fpga-0016:acl1:ch2", 68 | "fpga-0015:acl1:ch0": "fpga-0016:acl1:ch0", 69 | "fpga-0015:acl1:ch2": "fpga-0016:acl0:ch2", 70 | "fpga-0016:acl0:ch1": "fpga-0016:acl1:ch1" 71 | } 72 | } 73 | """, ignore_programs=True) 74 | assert 
connections == {('fpga-0015:acl0', 0): ('fpga-0016:acl0', 0), 75 | ('fpga-0015:acl0', 1): ('fpga-0015:acl1', 1), 76 | ('fpga-0015:acl0', 2): ('fpga-0016:acl1', 2), 77 | ('fpga-0015:acl1', 0): ('fpga-0016:acl1', 0), 78 | ('fpga-0015:acl1', 1): ('fpga-0015:acl0', 1), 79 | ('fpga-0015:acl1', 2): ('fpga-0016:acl0', 2), 80 | ('fpga-0016:acl0', 0): ('fpga-0015:acl0', 0), 81 | ('fpga-0016:acl0', 1): ('fpga-0016:acl1', 1), 82 | ('fpga-0016:acl0', 2): ('fpga-0015:acl1', 2), 83 | ('fpga-0016:acl1', 0): ('fpga-0015:acl1', 0), 84 | ('fpga-0016:acl1', 1): ('fpga-0016:acl0', 1), 85 | ('fpga-0016:acl1', 2): ('fpga-0015:acl0', 2)} 86 | -------------------------------------------------------------------------------- /codegen/tests/test_program.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from ops import Push, Pop, Broadcast, KEY_CKS_DATA, KEY_CKS_CONTROL, KEY_CKR_DATA, KEY_BROADCAST 4 | from program import Program, FailedAllocation 5 | 6 | 7 | def test_allocation_fail(): 8 | with pytest.raises(FailedAllocation): 9 | Program([ 10 | Push(0), 11 | Broadcast(0) 12 | ]) 13 | 14 | 15 | def test_allocation_channel_to_ports(): 16 | program = Program([ 17 | Push(0), 18 | Pop(0), 19 | Push(1), 20 | Push(2), 21 | Pop(2) 22 | ]) 23 | 24 | ops = program.operations 25 | assert program.get_channel_allocations(0) == [ 26 | (ops[0], "cks_data"), 27 | (ops[4], "cks_control"), 28 | (ops[1], "ckr_data"), 29 | (ops[3], "ckr_control"), 30 | ] 31 | assert program.get_channel_allocations(1) == [ 32 | (ops[2], "cks_data"), 33 | (ops[4], "ckr_data"), 34 | ] 35 | assert program.get_channel_allocations(2) == [ 36 | (ops[3], "cks_data"), 37 | (ops[0], "ckr_control"), 38 | ] 39 | assert program.get_channel_allocations(3) == [ 40 | (ops[1], "cks_control"), 41 | (ops[2], "ckr_control"), 42 | ] 43 | 44 | 45 | def test_allocation_get_channel(): 46 | program = Program([ 47 | Push(0), 48 | Pop(0), 49 | Push(1), 50 | Push(2), 51 | Pop(2) 52 | ]) 53 | 54 
| assert program.get_channel_for_port_key(0, KEY_CKS_DATA) == 0 55 | assert program.get_channel_for_port_key(0, KEY_CKS_CONTROL) == 3 56 | assert program.get_channel_for_port_key(1, KEY_CKR_DATA) is None 57 | assert program.get_channel_for_port_key(2, KEY_CKS_DATA) == 2 58 | -------------------------------------------------------------------------------- /codegen/tests/test_rewriter.py: -------------------------------------------------------------------------------- 1 | from ops import Push, Pop, Broadcast, Reduce, Scatter, Gather 2 | 3 | 4 | def test_rewriter_port(rewrite_tester): 5 | rewrite_tester.check("port", [ 6 | Push(0, "int"), 7 | Pop(3, "int") 8 | ]) 9 | 10 | 11 | def test_rewriter_kernel_attribute(rewrite_tester): 12 | rewrite_tester.check("kernel-attribute", [ 13 | Push(0, "int"), 14 | Push(1, "int") 15 | ]) 16 | 17 | 18 | def test_rewriter_constant_variable(rewrite_tester): 19 | rewrite_tester.check("constant-variable", [ 20 | Push(5, "int"), 21 | ]) 22 | 23 | 24 | def test_rewriter_data_type(rewrite_tester): 25 | rewrite_tester.check("data-type", [ 26 | Push(0, "char"), 27 | Pop(1, "double"), 28 | Pop(3, "float"), 29 | Pop(4, "int") 30 | ]) 31 | 32 | 33 | def test_rewriter_buffer_size(rewrite_tester): 34 | rewrite_tester.check("buffer-size", [ 35 | Push(0, "int", 128), 36 | ]) 37 | 38 | 39 | def test_rewriter_complex(rewrite_tester): 40 | rewrite_tester.check("complex", [ 41 | Push(0, "char"), 42 | Pop(1, "double"), 43 | Broadcast(2, "int"), 44 | Reduce(3, "float", op_type="add"), 45 | Scatter(4, "short"), 46 | Gather(5, "int"), 47 | ]) 48 | 49 | 50 | def test_rewriter_reduce(rewrite_tester): 51 | rewrite_tester.check("reduce", [ 52 | Reduce(0, op_type="add"), 53 | Reduce(1, op_type="min"), 54 | Reduce(2, op_type="max"), 55 | ]) 56 | -------------------------------------------------------------------------------- /codegen/tests/test_routing.py: -------------------------------------------------------------------------------- 1 | import networkx 2 | 3 | 
def test_routing_context():
    """Check the hop-by-hop route from FPGA 0 / channel 0 to FPGA 3 / channel 3."""
    program = Program([])
    fpga_names = ("n1:f1", "n1:f2", "n2:f1", "n3:f1")
    mapping = ProgramMapping([program], {name: program for name in fpga_names})

    connections = {
        ("n1:f1", 0): ("n1:f2", 0),
        ("n1:f2", 1): ("n2:f1", 1),
        ("n1:f2", 2): ("n3:f1", 1),
        ("n2:f1", 0): ("n1:f1", 1),
    }

    ctx = create_routing_context(connections, mapping)
    fpgas = ctx.fpgas

    # Expected path expressed as (fpga index, channel index) hops.
    expected_hops = [(0, 0), (1, 0), (1, 2), (3, 1), (3, 3)]
    expected_route = [fpgas[fpga].channels[channel] for fpga, channel in expected_hops]

    source = fpgas[0].channels[0]
    target = fpgas[3].channels[3]
    assert ctx.routes[source][target] == expected_route
def test_ckr_no_route():
    """Building a routing table must raise `NoRouteFound` when FPGAs are unreachable.

    The two links form disjoint pairs (N0:F0 <-> N0:F1 and N1:F0 <-> N1:F2).
    NOTE: despite the `ckr` in the test name, the function exercised here is
    `cks_routing_table`.
    """
    connections = {
        ("N0:F0", 0): ("N0:F1", 0),
        ("N1:F0", 0): ("N1:F2", 1),
    }
    ctx = get_routing_ctx(Program([]), connections)

    start_channel = get_channel(ctx.graph, "N0:F0", 0)
    with pytest.raises(NoRouteFound):
        cks_routing_table(ctx.routes, ctx.fpgas, start_channel)
'''
This program generates a dummy topology file. This can be used for testing or as a skeleton for topology files.
It takes as input the number of FPGAs, a list of program names, and the output file.
Programs are assigned to FPGAs round-robin (FPGA i runs program i modulo the number of programs),
and FPGAs are connected in a bus.
'''

import argparse
import json
import sys


def build_topology(n, programs):
    """Return the topology description for `n` FPGAs running `programs`.

    Args:
        n: Number of FPGAs; they are named ``fpga-0:acl0`` .. ``fpga-(n-1):acl0``.
        programs: Non-empty list of program names, assigned round-robin.

    Returns:
        A dict with two keys: ``"fpgas"`` maps each FPGA name to its program,
        ``"connections"`` maps channel 0 of FPGA i to channel 1 of FPGA i+1.

    Raises:
        ValueError: If `programs` is empty or there are fewer FPGAs than programs.
    """
    if not programs:
        raise ValueError("At least one program is required")
    if n < len(programs):
        raise ValueError("The number of FPGAs must be greater or equal than the number of programs")

    # FPGAs are numbered from 0 to n-1
    programs_to_fpga = {
        "fpga-{}:acl0".format(i): programs[i % len(programs)]
        for i in range(n)
    }

    # create a bus topology: channel 0 is connected to channel 1 of the next FPGA
    fpga_topology = {
        "fpga-{}:acl0:ch0".format(i): "fpga-{}:acl0:ch1".format(i + 1)
        for i in range(n - 1)
    }
    return {"fpgas": programs_to_fpga, "connections": fpga_topology}


def main():
    """Parse the command line and write the generated topology as JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("-n", type=int, help=' Number of FPGAs', required=True)
    parser.add_argument('-p', nargs='+', help=' List of programs', required=True)
    parser.add_argument("-f", type=str, help=' Output file', required=True)
    args = vars(parser.parse_args())

    try:
        data = build_topology(args["n"], args["p"])
    except ValueError as err:
        # sys.exit (not the interactive-only `exit` helper) keeps the original
        # exit status; the diagnostic goes to stderr so stdout stays clean.
        print(err, file=sys.stderr)
        sys.exit(-1)

    with open(args["f"], 'w') as f:
        json.dump(data, f, indent=4, separators=(',', ': '))


if __name__ == "__main__":
    main()
// Loads a binary routing table from
// "<routing_directory>/<prefix>-rank<rank>-channel<channel>" into `table`.
//
// `table` must point to storage for at least `num_entries` elements of type
// DataSize; the file content is copied into it byte for byte.
//
// Throws std::invalid_argument if `num_entries` is negative, and
// std::runtime_error if the file cannot be opened or holds fewer than
// `num_entries * sizeof(DataSize)` bytes.
template <typename DataSize>
void LoadRoutingTable(int rank, int channel, int num_entries,
                      const std::string& routing_directory,
                      const std::string& prefix, DataSize* table) {
  if (num_entries < 0) {
    throw std::invalid_argument(
        "Routing table entry count must be non-negative.");
  }

  std::stringstream path;
  path << routing_directory << "/" << prefix << "-rank" << rank << "-channel"
       << channel;
  const std::string file_name = path.str();

  std::ifstream file(file_name, std::ios::binary);
  if (!file) {
    throw std::runtime_error("Routing table " + file_name + " not found.");
  }

  const auto byte_size =
      static_cast<std::streamsize>(num_entries * sizeof(DataSize));
  // istream::read takes a char*; the cast lets the template be instantiated
  // for element types other than char.
  file.read(reinterpret_cast<char*>(table), byte_size);
  if (file.gcount() != byte_size) {
    // A short read would otherwise leave the tail of `table` uninitialized.
    throw std::runtime_error("Routing table " + file_name + " is truncated.");
  }
}
SMI_DEVICES_PER_NODE ${SMI_DEVICES_PER_NODE} 20 | -------------------------------------------------------------------------------- /examples/include/stencil.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #pragma OPENCL EXTENSION cl_intel_channels : enable 4 | 5 | // Constants (for now) 6 | #define HALO_X 1 7 | #define HALO_Y 1 8 | #define BOUNDARY_VALUE 1 9 | 10 | // CMake config 11 | #define X ${SMI_STENCIL_SIZE_X} 12 | #define Y ${SMI_STENCIL_SIZE_Y} 13 | #define DTYPE ${SMI_DATATYPE} 14 | #define SMI_TYPE ${SMI_COMM_DATATYPE} 15 | #define W ${SMI_VECTORIZATION_WIDTH} 16 | #define B 4 // Number of memory banks 17 | #if W > 1 18 | #define VTYPE ${SMI_DATATYPE}${SMI_VECTORIZATION_WIDTH} 19 | #else 20 | #define VTYPE DTYPE 21 | #endif 22 | #if HALO_Y > 1 23 | #define HTYPE_PASTE(a, b) a ## b 24 | #define HTYPE_EVAL(a, b) HTYPE_PASTE(a, b) 25 | #define HTYPE HTYPE_EVAL(${SMI_DATATYPE}, HALO_Y) 26 | #else 27 | #define HTYPE ${SMI_DATATYPE} 28 | #endif 29 | #if W < HALO_Y 30 | #error "Vectorization width must be greater than or equal to the horizontal halo size." 
31 | #endif 32 | #define PX ${SMI_STENCIL_NUM_PROCS_X} 33 | #define PY ${SMI_STENCIL_NUM_PROCS_Y} 34 | #define SMI_DEVICES_PER_NODE ${SMI_DEVICES_PER_NODE} 35 | 36 | // Derived numbers 37 | #define X_LOCAL (X / PX) 38 | #define Y_LOCAL (Y / PY) 39 | -------------------------------------------------------------------------------- /examples/kernels/gesummv_smi.json: -------------------------------------------------------------------------------- 1 | { 2 | "fpgas": { 3 | "fpga-0006:acl0": "gesummv_rank0", 4 | "fpga-0006:acl1": "gesummv_rank1" 5 | }, 6 | "connections": { 7 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3", 8 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /examples/kernels/kmeans_smi.json: -------------------------------------------------------------------------------- 1 | { 2 | "fpgas": { 3 | "fpga-0006:acl0": "kmeans_smi", 4 | "fpga-0006:acl1": "kmeans_smi", 5 | "fpga-0007:acl0": "kmeans_smi", 6 | "fpga-0007:acl1": "kmeans_smi", 7 | "fpga-0008:acl0": "kmeans_smi", 8 | "fpga-0008:acl1": "kmeans_smi", 9 | "fpga-0009:acl0": "kmeans_smi", 10 | "fpga-0009:acl1": "kmeans_smi" 11 | }, 12 | "connections": { 13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3", 14 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2", 15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3", 16 | "fpga-0007:acl0:ch3": "fpga-0007:acl1:ch2", 17 | "fpga-0006:acl0:ch1": "fpga-0007:acl0:ch0", 18 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0", 19 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0", 20 | "fpga-0007:acl1:ch1": "fpga-0008:acl1:ch0", 21 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3", 22 | "fpga-0008:acl0:ch3": "fpga-0008:acl1:ch2", 23 | "fpga-0009:acl0:ch2": "fpga-0009:acl1:ch3", 24 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2", 25 | "fpga-0008:acl0:ch1": "fpga-0009:acl0:ch0", 26 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0", 27 | "fpga-0006:acl0:ch0": "fpga-0009:acl0:ch1", 28 | "fpga-0006:acl1:ch0": 
"fpga-0009:acl1:ch1" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /examples/kernels/stencil_onchip.cl.in: -------------------------------------------------------------------------------- 1 | #include "stencil.h" 2 | 3 | channel VTYPE read_stream[PX*PY] __attribute__((depth((Y/W)/PY))); 4 | channel VTYPE write_stream[PX*PY] __attribute__((depth((Y/W)/PY))); 5 | channel VTYPE vert_up[(PX - 1)*PY] __attribute__((depth((Y/W)/PY))); 6 | channel VTYPE vert_down[(PX - 1)*PY] __attribute__((depth((Y/W)/PY))); 7 | channel HTYPE hori_left[PX*(PY - 1)] __attribute__((depth(X/PX))); 8 | channel HTYPE hori_right[PX*(PY - 1)] __attribute__((depth(X/PX))); 9 | 10 | ${code} 11 | -------------------------------------------------------------------------------- /examples/kernels/stencil_onchip.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import string 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("px", type=int) 7 | parser.add_argument("py", type=int) 8 | parser.add_argument("source_dir", type=str) 9 | parser.add_argument("binary_dir", type=str) 10 | args = parser.parse_args() 11 | 12 | with open(os.path.join(args.source_dir, "stencil_onchip.cl.in"), 13 | "r") as in_file: 14 | tmpl_main = string.Template(in_file.read()) 15 | with open(os.path.join(args.source_dir, "stencil_onchip_pe.cl.in"), 16 | "r") as in_file: 17 | tmpl_pe = string.Template(in_file.read()) 18 | 19 | pe_code = [] 20 | for i_px in range(args.px): 21 | for i_py in range(args.py): 22 | pe_code.append( 23 | tmpl_pe.substitute( 24 | i_px=i_px, i_py=i_py, suffix="_{}_{}".format(i_px, i_py))) 25 | 26 | with open(os.path.join(args.binary_dir, "stencil_onchip.cl"), 27 | "w") as out_file: 28 | out_file.write(tmpl_main.substitute(code="\n\n".join(pe_code))) 29 | -------------------------------------------------------------------------------- 
/examples/kernels/stencil_smi.json: -------------------------------------------------------------------------------- 1 | { 2 | "fpgas": { 3 | "fpga-0006:acl0": "stencil_smi", 4 | "fpga-0006:acl1": "stencil_smi", 5 | "fpga-0007:acl0": "stencil_smi", 6 | "fpga-0007:acl1": "stencil_smi", 7 | "fpga-0008:acl0": "stencil_smi", 8 | "fpga-0008:acl1": "stencil_smi", 9 | "fpga-0009:acl0": "stencil_smi", 10 | "fpga-0009:acl1": "stencil_smi" 11 | }, 12 | "connections": { 13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3", 14 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2", 15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3", 16 | "fpga-0007:acl0:ch3": "fpga-0007:acl1:ch2", 17 | "fpga-0006:acl0:ch1": "fpga-0007:acl0:ch0", 18 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0", 19 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0", 20 | "fpga-0007:acl1:ch1": "fpga-0008:acl1:ch0", 21 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3", 22 | "fpga-0008:acl0:ch3": "fpga-0008:acl1:ch2", 23 | "fpga-0009:acl0:ch2": "fpga-0009:acl1:ch3", 24 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2", 25 | "fpga-0008:acl0:ch1": "fpga-0009:acl0:ch0", 26 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0", 27 | "fpga-0006:acl0:ch0": "fpga-0009:acl0:ch1", 28 | "fpga-0006:acl1:ch0": "fpga-0009:acl1:ch1" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /include/smi.h: -------------------------------------------------------------------------------- 1 | #ifndef SMI_H 2 | #define SMI_H 3 | 4 | #ifdef SMI_REWRITER 5 | #include "opencl-shim.h" 6 | #endif 7 | 8 | #include "smi/header_message.h" 9 | #include "smi/network_message.h" 10 | #include "smi/data_types.h" 11 | #include "smi/channel_descriptor.h" 12 | #include "smi/operation_type.h" 13 | #include "smi/reduce_operations.h" 14 | #include "smi/communicator.h" 15 | #include "smi/push.h" 16 | #include "smi/pop.h" 17 | #include "smi/bcast.h" 18 | #include "smi/reduce.h" 19 | #include "smi/gather.h" 20 | #include "smi/scatter.h" 21 | #endif // 
SMI_H 22 | -------------------------------------------------------------------------------- /include/smi/bcast.h: -------------------------------------------------------------------------------- 1 | #ifndef BCAST_H 2 | #define BCAST_H 3 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 4 | 5 | /** 6 | @file bcast.h 7 | This file contains the definition of channel descriptor, 8 | open channel and communication primitive for Broadcast. 9 | */ 10 | 11 | #include "data_types.h" 12 | #include "header_message.h" 13 | #include "operation_type.h" 14 | #include "network_message.h" 15 | #include "communicator.h" 16 | 17 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){ 18 | SMI_Network_message net; //buffered network message 19 | char root_rank; 20 | char my_rank; //These two are essentially the Communicator 21 | char num_rank; 22 | char port; //Port number 23 | unsigned int message_size; //given in number of data elements 24 | unsigned int processed_elements; //how many data elements we have sent/received 25 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet) 26 | SMI_Datatype data_type; //type of message 27 | SMI_Network_message net_2; //buffered network message: used for the receiving side 28 | char size_of_type; //size of data type 29 | char elements_per_packet; //number of data elements per packet 30 | char packet_element_id_rcv; //used by the receivers 31 | bool init; //true at the beginning, used by the receivers for synchronization 32 | }SMI_BChannel; 33 | 34 | /** 35 | * @brief SMI_Open_bcast_channel opens a broadcast channel 36 | * @param count number of data elements to broadcast 37 | * @param data_type type of the channel 38 | * @param port port number 39 | * @param root rank of the root 40 | * @param comm communicator 41 | * @return the channel descriptor 42 | */ 43 | SMI_BChannel SMI_Open_bcast_channel(int count, SMI_Datatype data_type, int port, int root, 
SMI_Comm comm); 44 | 45 | /** 46 | * @brief SMI_Open_bcast_channel_ad opens a broadcast channel with a given asynchronicity degree 47 | * @param count number of data elements to broadcast 48 | * @param data_type type of the channel 49 | * @param port port number 50 | * @param root rank of the root 51 | * @param comm communicator 52 | * @param asynch_degree the asynchronicity degree in number of data elements 53 | * @return the channel descriptor 54 | */ 55 | SMI_BChannel SMI_Open_bcast_channel_ad(int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm, int asynch_degree); 56 | 57 | /** 58 | * @brief SMI_Bcast 59 | * @param chan pointer to the broadcast channel descriptor 60 | * @param data pointer to the data element: on the root rank is the element that will be transmitted, 61 | on the non-root rank will be the received element 62 | */ 63 | void SMI_Bcast(SMI_BChannel *chan, void* data); 64 | #endif // BCAST_H 65 | -------------------------------------------------------------------------------- /include/smi/channel_descriptor.h: -------------------------------------------------------------------------------- 1 | #ifndef CHANNEL_DESCRIPTOR_H 2 | #define CHANNEL_DESCRIPTOR_H 3 | /** 4 | @file channel_descriptor.h 5 | Point-to-point transient channel descriptor. 
6 | It maintains all the information that is necessary for performing a point-to-point communication (Push/Pop) 7 | */ 8 | 9 | #include "network_message.h" 10 | #include "operation_type.h" 11 | #include "data_types.h" 12 | #include "communicator.h" 13 | #define MIN(a,b) (((a)<(b))?(a):(b)) 14 | #define MAX(a,b) (((a)>(b))?(a):(b)) 15 | 16 | 17 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){ 18 | SMI_Network_message net; //buffered network message 19 | char sender_rank; //rank of the sender 20 | char receiver_rank; //rank of the receiver 21 | char port; //channel port 22 | unsigned int message_size; //given in number of data elements 23 | unsigned int processed_elements; //how many data elements we have sent/received so far 24 | unsigned int packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet) 25 | SMI_Datatype data_type; //type of message 26 | char op_type; //type of operation 27 | char size_of_type; //size of data type 28 | char elements_per_packet; //number of data elements per packet 29 | volatile unsigned int tokens; //current number of tokens (one token allows the sender to transmit one data element) 30 | unsigned int max_tokens; //max tokens on the sender side 31 | }SMI_Channel; 32 | 33 | #endif 34 | -------------------------------------------------------------------------------- /include/smi/communicator.h: -------------------------------------------------------------------------------- 1 | #ifndef COMMUNICATOR_H 2 | #define COMMUNICATOR_H 3 | /** 4 | @file communicator.h 5 | Describes a basic communicator. 
6 | */ 7 | 8 | //Note: Since the Intel compiler fails in compiling the emulation if you pass a user-defined 9 | //data type, we had to define it by resorting to OpenCL data types: the first element 10 | //will be "my_rank" and the second the number of ranks 11 | #if defined __HOST_PROGRAM__ 12 | typedef cl_char2 SMI_Comm; 13 | #else 14 | typedef char2 SMI_Comm; 15 | /** 16 | * @brief SMI_Comm_size returns the communicator size 17 | * @param comm 18 | * @return the communicator size 19 | */ 20 | inline int SMI_Comm_size(SMI_Comm comm){ 21 | return comm[1]; 22 | } 23 | 24 | /** 25 | * @brief SMI_Comm_rank determines the rank of the caller 26 | * @param comm 27 | * @return rank of the caller 28 | */ 29 | inline int SMI_Comm_rank(SMI_Comm comm){ 30 | return comm[0]; 31 | } 32 | #endif 33 | #endif 34 | -------------------------------------------------------------------------------- /include/smi/data_types.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef DATA_TYPES_H 3 | #define DATA_TYPES_H 4 | 5 | /** 6 | @file data_types.h 7 | Supported message data types 8 | */ 9 | 10 | typedef enum{ 11 | SMI_INT = 1, 12 | SMI_FLOAT = 2, 13 | SMI_DOUBLE = 3, 14 | SMI_CHAR = 4, 15 | SMI_SHORT = 5 16 | }SMI_Datatype; 17 | 18 | #endif 19 | -------------------------------------------------------------------------------- /include/smi/gather.h: -------------------------------------------------------------------------------- 1 | #ifndef GATHER_H 2 | #define GATHER_H 3 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 4 | 5 | /** 6 | @file gather.h 7 | This file contains the channel descriptor, open channel 8 | and communication primitive for gather 9 | */ 10 | 11 | 12 | #include "data_types.h" 13 | #include "header_message.h" 14 | #include "operation_type.h" 15 | #include "network_message.h" 16 | #include "communicator.h" 17 | 18 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){ 19 | SMI_Network_message net; //buffered 
network message 20 | int recv_count; //number of data elements that will be received by the root 21 | char port; 22 | int processed_elements_root; //number of elements processed by the root 23 | char packet_element_id_rcv; //used by the receivers 24 | char next_contrib; //the rank of the next contributor 25 | char my_rank; 26 | char num_rank; 27 | char root_rank; 28 | SMI_Network_message net_2; //buffered network message, used by root rank to send synchronization messages 29 | int send_count; //number of elements sent by each non-root ranks 30 | int processed_elements; //how many data elements we have sent (non-root) 31 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet) 32 | char data_type; //type of message 33 | char size_of_type; //size of data type 34 | char elements_per_packet; //number of data elements per packet 35 | }SMI_GatherChannel; 36 | 37 | /** 38 | * @brief SMI_Open_gather_channel opens a gather channel 39 | * @param send_count number of data elements transmitted by each rank 40 | * @param recv_count number of data elements received by root rank (i.e. num_ranks*send_count) 41 | * @param data_type type of the channel 42 | * @param port port number 43 | * @param root rank of the root 44 | * @param comm communicator 45 | * @return the channel descriptor 46 | */ 47 | SMI_GatherChannel SMI_Open_gather_channel(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm); 48 | 49 | /** 50 | * @brief SMI_Open_gather_channel_ad opens a gather channel with a given asynchronicity degree 51 | * @param send_count number of data elements transmitted by each rank 52 | * @param recv_count number of data elements received by root rank (i.e. 
num_ranks*send_count) 53 | * @param data_type type of the channel 54 | * @param port port number 55 | * @param root rank of the root 56 | * @param comm communicator 57 | * @param asynch_degree the asynchronicity degree expressed in number of data elements 58 | * @return the channel descriptor 59 | */ 60 | SMI_GatherChannel SMI_Open_gather_channel_ad(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm, int asynch_degree); 61 | 62 | /** 63 | * @brief SMI_Gather 64 | * @param chan pointer to the gather channel descriptor 65 | * @param send_data pointer to the data element that must be sent 66 | * @param rcv_data pointer to the receiving data element (significant on the root rank only) 67 | */ 68 | void SMI_Gather(SMI_GatherChannel *chan, void* send_data, void* rcv_data); 69 | 70 | #endif // GATHER_H 71 | -------------------------------------------------------------------------------- /include/smi/header_message.h: -------------------------------------------------------------------------------- 1 | /** 2 | Message header definition with macros for accessing it 3 | (for now they just take the corresponding field) 4 | */ 5 | #ifndef HEADER_MESSAGE_H 6 | #define HEADER_MESSAGE_H 7 | 8 | #define GET_HEADER_SRC(H) (H.src) 9 | #define GET_HEADER_DST(H) (H.dst) 10 | #define GET_HEADER_PORT(H) (H.port) 11 | #define GET_HEADER_OP(H) ((char)H.elems_and_op & (char)7) 12 | #define GET_HEADER_NUM_ELEMS(H) ((char)H.elems_and_op >> ((char)3)) //returns the number of valid data elements in the packet 13 | #define SET_HEADER_SRC(H,S) (H.src=S) 14 | #define SET_HEADER_DST(H,D) (H.dst=D) 15 | #define SET_HEADER_PORT(H,P) (H.port=P) 16 | #define SET_HEADER_OP(H,O) (H.elems_and_op=((H.elems_and_op & 248) | O & 7)) 17 | #define SET_HEADER_NUM_ELEMS(H,N) (H.elems_and_op=((H.elems_and_op &7) | (N << 3))) //By assumption N < 32 18 | 19 | 20 | typedef struct __attribute__((packed)) { 21 | char src; 22 | char dst; 23 | char port; 24 | char elems_and_op; 
//upper 5 bits contain the number of valid data elements in the packet 25 | //lower 3 bits contain the type of operation 26 | 27 | }SMI_Message_header; 28 | 29 | #endif //ifndef HEADER_MESSAGE_H 30 | -------------------------------------------------------------------------------- /include/smi/operation_type.h: -------------------------------------------------------------------------------- 1 | /** 2 | Definition of the supported communication operations 3 | */ 4 | 5 | #ifndef OPERATION_TYPE_H 6 | #define OPERATION_TYPE_H 7 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 8 | /** 9 | Type of operation performed 10 | */ 11 | typedef enum{ 12 | SMI_SEND = 0, 13 | SMI_RECEIVE = 1, 14 | SMI_BROADCAST = 2, 15 | SMI_SYNCH=3, //special operation type used for synchronization/rendezvous 16 | SMI_SCATTER=4, 17 | SMI_REDUCE=5, 18 | SMI_GATHER=6 19 | }SMI_Operationtype; 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /include/smi/pop.h: -------------------------------------------------------------------------------- 1 | /** 2 | Pop from channel 3 | */ 4 | 5 | #ifndef POP_H 6 | #define POP_H 7 | #include "channel_descriptor.h" 8 | #include "communicator.h" 9 | 10 | 11 | /** 12 | * @brief SMI_Open_receive_channel opens a receive transient channel 13 | * @param count number of data elements to receive 14 | * @param data_type data type of the data elements 15 | * @param source rank of the sender 16 | * @param port port number 17 | * @param comm communicator 18 | * @return channel descriptor 19 | */ 20 | SMI_Channel SMI_Open_receive_channel(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm); 21 | 22 | /** 23 | * @brief SMI_Open_receive_channel_ad opens a receive transient channel with a given asynchronicity degree 24 | * @param count number of data elements to receive 25 | * @param data_type data type of the data elements 26 | * @param source rank of the sender 27 | * @param port port number 28 | * @param comm 
communicator 29 | * @param asynch_degree the asynchronicity degree expressed in number of data elements 30 | * @return channel descriptor 31 | */ 32 | SMI_Channel SMI_Open_receive_channel_ad(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm, int asynch_degree); 33 | 34 | /** 35 | * @brief SMI_Pop: receive a data element. Returns only when data arrives 36 | * @param chan pointer to the transient channel descriptor 37 | * @param data pointer to the target variable that, on return, will contain the data element 38 | */ 39 | void SMI_Pop(SMI_Channel *chan, void *data); 40 | 41 | #endif //ifndef POP_H 42 | -------------------------------------------------------------------------------- /include/smi/push.h: -------------------------------------------------------------------------------- 1 | /** 2 | Push to channel 3 | */ 4 | 5 | #ifndef PUSH_H 6 | #define PUSH_H 7 | #include "channel_descriptor.h" 8 | #include "communicator.h" 9 | 10 | /** 11 | * @brief SMI_Open_send_channel opens a sending transient channel 12 | * @param count number of data elements to send 13 | * @param data_type type of the data element 14 | * @param destination rank of the destination 15 | * @param port port number 16 | * @param comm communicator 17 | * @return channel descriptor 18 | */ 19 | SMI_Channel SMI_Open_send_channel(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm); 20 | 21 | 22 | /** 23 | * @brief SMI_Open_send_channel_ad opens a sending transient channel with a given asynchronicity degree 24 | * @param count number of data elements to send 25 | * @param data_type type of the data element 26 | * @param destination rank of the destination 27 | * @param port port number 28 | * @param comm communicator 29 | * @param asynch_degree the asynchronicity degree expressed in number of data elements 30 | * @return channel descriptor 31 | */ 32 | SMI_Channel SMI_Open_send_channel_ad(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm, 
int asynch_degree); 33 | 34 | /** 35 | * @brief private function SMI_Push_flush pushes a data element into the transient channel. Data transfer can be delayed 36 | * @param chan 37 | * @param data 38 | * @param immediate: if true the data is immediately sent, without waiting for the completion of the network packet. 39 | * In general, the user should use the other Push definition 40 | */ 41 | void SMI_Push_flush(SMI_Channel *chan, void* data, int immediate); 42 | 43 | /** 44 | * @brief SMI_Push pushes a data element into the transient channel. The actual data transfer can be delayed 45 | * @param chan pointer to the channel descriptor of the transient channel 46 | * @param data pointer to the data that can be sent 47 | */ 48 | void SMI_Push(SMI_Channel *chan, void* data); 49 | 50 | #endif //ifndef PUSH_H 51 | -------------------------------------------------------------------------------- /include/smi/reduce.h: -------------------------------------------------------------------------------- 1 | #ifndef REDUCE_H 2 | #define REDUCE_H 3 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 4 | 5 | /** 6 | @file reduce.h 7 | This file contains the channel descriptor, open channel, operation types, 8 | and communication primitive for reduce 9 | */ 10 | 11 | 12 | #include "data_types.h" 13 | #include "header_message.h" 14 | #include "network_message.h" 15 | #include "operation_type.h" 16 | #include "communicator.h" 17 | 18 | typedef enum{ 19 | SMI_ADD = 0, 20 | SMI_MAX = 1, 21 | SMI_MIN = 2 22 | }SMI_Op; 23 | 24 | /** 25 | Channel descriptor for reduce 26 | */ 27 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){ 28 | SMI_Network_message net; //buffered network message 29 | char port; //Output channel for the bcast, used by the root 30 | char root_rank; 31 | char my_rank; //communicator infos 32 | char num_rank; 33 | unsigned int message_size; //given in number of data elements 34 | unsigned int processed_elements; //how many data elements we have sent/received 
35 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet) 36 | SMI_Datatype data_type; //type of message 37 | char size_of_type; //size of data type 38 | char elements_per_packet; //number of data elements per packet 39 | SMI_Network_message net_2; //buffered network message (we need two of them to remove aliasing) 40 | char packet_element_id_rcv; //used by the receivers 41 | char reduce_op; //applied reduce operation 42 | }SMI_RChannel; 43 | 44 | 45 | /** 46 | * @brief SMI_Open_reduce_channel opens a transient reduce channel 47 | * @param count number of data elements to reduce 48 | * @param data_type type of the channel 49 | * @param op applied reduce operation 50 | * @param port port number 51 | * @param root rank of the root 52 | * @param comm communicator 53 | * @return the channel descriptor 54 | */ 55 | SMI_RChannel SMI_Open_reduce_channel(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm); 56 | 57 | /** 58 | * @brief SMI_Open_reduce_channel_ad opens a transient reduce channel with a given asynchronicity degree 59 | * @param count number of data elements to reduce 60 | * @param data_type type of the channel 61 | * @param op applied reduce operation 62 | * @param port port number 63 | * @param root rank of the root 64 | * @param comm communicator 65 | * @param asynch_degree the asynchronicity degree expressed in number of data elements 66 | * @return the channel descriptor 67 | */ 68 | SMI_RChannel SMI_Open_reduce_channel_ad(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm, int asynch_degree); 69 | 70 | /** 71 | * @brief SMI_Reduce 72 | * @param chan pointer to the reduce channel descriptor 73 | * @param data_snd pointer to the data element that must be reduced 74 | * @param data_rcv pointer to the receiving data element (root only) 75 | */ 76 | void SMI_Reduce(SMI_RChannel *chan, void* data_snd, void* data_rcv); 
77 | 78 | #endif // REDUCE_H 79 | -------------------------------------------------------------------------------- /include/smi/reduce_operations.h: -------------------------------------------------------------------------------- 1 | #ifndef REDUCE_OPERATIONS_H 2 | #define REDUCE_OPERATIONS_H 3 | 4 | #define SMI_OP_ADD(A,B) ((A)+(B)) 5 | #define SMI_OP_MIN(A,B) (((A)<(B))?(A):(B)) 6 | #define SMI_OP_MAX(A,B) (((A)>(B))?(A):(B)) 7 | 8 | #endif // REDUCE_OPERATIONS_H 9 | -------------------------------------------------------------------------------- /include/smi/scatter.h: -------------------------------------------------------------------------------- 1 | #ifndef SCATTER_H 2 | #define SCATTER_H 3 | 4 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 5 | 6 | /** 7 | @file scatter.h 8 | This file contains the definition of channel descriptor, 9 | open channel and communication primitive for Scatter. 10 | */ 11 | 12 | #include "data_types.h" 13 | #include "header_message.h" 14 | #include "operation_type.h" 15 | #include "network_message.h" 16 | #include "communicator.h" 17 | 18 | 19 | 20 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){ 21 | SMI_Network_message net; //buffered network message 22 | char port; //port 23 | char root_rank; 24 | char my_rank; //rank of the caller 25 | char num_ranks; //total number of ranks 26 | unsigned int send_count; //given in number of data elements 27 | unsigned int recv_count; //given in number of data elements 28 | unsigned int processed_elements; //how many data elements we have sent/received 29 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet) 30 | char data_type; //type of message 31 | SMI_Network_message net_2; //buffered network message (used by non root ranks) 32 | char size_of_type; //size of data type 33 | char elements_per_packet; //number of data elements per packet 34 | char packet_element_id_rcv; //used by the 
receivers 35 | char next_rcv; //the rank of the next receiver 36 | bool init; //true when the channel is opened, false when synchronization message has been sent 37 | }SMI_ScatterChannel; 38 | 39 | /** 40 | * @brief SMI_Open_scatter_channel opens a transient scatter channel 41 | * @param send_count number of data elements transmitted by root to each rank 42 | * @param recv_count number of data elements received by each rank 43 | * @param data_type type of the channel 44 | * @param port port number 45 | * @param root rank of the root 46 | * @param comm communicator 47 | * @return the channel descriptor 48 | */ 49 | SMI_ScatterChannel SMI_Open_scatter_channel(int send_count, int recv_count, 50 | SMI_Datatype data_type, int port, int root, SMI_Comm comm); 51 | 52 | /** 53 | * @brief SMI_Open_scatter_channel_ad opens a transient scatter channel with a given asynchronicity degree 54 | * @param send_count number of data elements transmitted by root to each rank 55 | * @param recv_count number of data elements received by each rank 56 | * @param data_type type of the channel 57 | * @param port port number 58 | * @param root rank of the root 59 | * @param comm communicator 60 | * @param asynch_degree the asynchronicity degree expressed in number of data elements 61 | * @return the channel descriptor 62 | */ 63 | SMI_ScatterChannel SMI_Open_scatter_channel_ad(int send_count, int recv_count, 64 | SMI_Datatype data_type, int port, int root, SMI_Comm comm, int asynch_degree); 65 | 66 | /** 67 | * @brief SMI_Scatter 68 | * @param chan pointer to the scatter channel descriptor 69 | * @param data_snd pointer to the data element that must be sent (root only) 70 | * @param data_rcv pointer to the receiving data element 71 | */ 72 | void SMI_Scatter(SMI_ScatterChannel *chan, void* data_snd, void* data_rcv); 73 | 74 | #endif // SCATTER_H 75 | -------------------------------------------------------------------------------- /include/utils/smi_utils.hpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | void checkMpiCall(int code, const char* location, int line) 10 | { 11 | if (code != MPI_SUCCESS) 12 | { 13 | char error[256]; 14 | int length; 15 | MPI_Error_string(code, error, &length); 16 | std::cerr << "MPI error at " << location << ":" << line << ": " << error << std::endl; 17 | } 18 | } 19 | 20 | #define CHECK_MPI(err) checkMpiCall((err), __FILE__, __LINE__); 21 | 22 | constexpr int kChannelsPerRank = 4; 23 | 24 | template 25 | void LoadRoutingTable(int rank, int channel, int num_entries, 26 | const std::string& routing_directory, 27 | const std::string& prefix, DataSize* table) { 28 | std::stringstream path; 29 | path << routing_directory << "/" << prefix << "-rank" << rank << "-channel" 30 | << channel; 31 | 32 | std::ifstream file(path.str(), std::ios::binary); 33 | if (!file) { 34 | throw std::runtime_error("Routing table " + path.str() + " not found."); 35 | } 36 | 37 | auto byte_size = num_entries * sizeof(DataSize); 38 | file.read(table, byte_size); 39 | } 40 | 41 | std::string replace(std::string source, const std::string& pattern, const std::string& replacement) 42 | { 43 | auto pos = source.find(pattern); 44 | if (pos != std::string::npos) 45 | { 46 | return source.substr(0, pos) + replacement + source.substr(pos + pattern.length()); 47 | } 48 | return source; 49 | } 50 | -------------------------------------------------------------------------------- /include/utils/utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_HPP 2 | #define UTILS_HPP 3 | 4 | #include 5 | /** 6 | Timing functions 7 | */ 8 | typedef long unsigned int timestamp_t; 9 | 10 | inline timestamp_t current_time_usecs() __attribute__((always_inline)); 11 | inline timestamp_t current_time_usecs(){ 12 | struct timeval t; 13 | gettimeofday(&t, NULL); 14 
| return (t.tv_sec)*1000000L + t.tv_usec; 15 | 16 | } 17 | 18 | inline long current_time_nsecs() __attribute__((always_inline)); 19 | inline long current_time_nsecs(){ 20 | struct timespec t; 21 | clock_gettime(CLOCK_REALTIME, &t); 22 | return (t.tv_sec)*1000000000L + t.tv_nsec; 23 | } 24 | #endif // UTILS_HPP 25 | -------------------------------------------------------------------------------- /microbenchmarks/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 4 | 5 | find_package(PythonInterp 3) 6 | 7 | if(PythonInterp_FOUND) 8 | 9 | #SPMD 10 | smi_target(broadcast "${CMAKE_CURRENT_SOURCE_DIR}/kernels/broadcast.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/broadcast_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/broadcast.cl" 8) 11 | smi_target(reduce "${CMAKE_CURRENT_SOURCE_DIR}/kernels/reduce.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/reduce_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/reduce.cl" 8) 12 | smi_target(scatter "${CMAKE_CURRENT_SOURCE_DIR}/kernels/scatter.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/scatter_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/scatter.cl" 8) 13 | smi_target(gather "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gather.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/gather_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gather.cl" 8) 14 | smi_target(multi_collectives "${CMAKE_CURRENT_SOURCE_DIR}/kernels/multi_collectives.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/multi_collectives_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/multi_collectives.cl" 8) 15 | 16 | #MPMD 17 | smi_target(bandwidth "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/bandwidth_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_1.cl" 8) 18 | smi_target(injection "${CMAKE_CURRENT_SOURCE_DIR}/kernels/injection_rate.json" 
"${CMAKE_CURRENT_SOURCE_DIR}/host/injection_rate_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/injection_rate_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/injection_rate_1.cl" 8) 19 | smi_target(latency "${CMAKE_CURRENT_SOURCE_DIR}/kernels/latency.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/latency_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/latency_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/latency_1.cl" 8) 20 | 21 | #Eager evaluation 22 | smi_target(bandwidth_eager "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/bandwidth_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_1.cl" 8 8 8 OFF) 23 | 24 | endif() 25 | 26 | -------------------------------------------------------------------------------- /microbenchmarks/kernels/bandwidth.json: -------------------------------------------------------------------------------- 1 | { 2 | "fpgas": { 3 | "fpga-0006:acl0": "bandwidth_0", 4 | "fpga-0006:acl1": "bandwidth_1", 5 | "fpga-0007:acl0": "bandwidth_1", 6 | "fpga-0007:acl1": "bandwidth_1", 7 | "fpga-0008:acl0": "bandwidth_1", 8 | "fpga-0008:acl1": "bandwidth_1", 9 | "fpga-0009:acl0": "bandwidth_1", 10 | "fpga-0009:acl1": "bandwidth_1" 11 | }, 12 | "connections": { 13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3", 14 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0", 15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3", 16 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0", 17 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3", 18 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0", 19 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /microbenchmarks/kernels/bandwidth_0.cl: -------------------------------------------------------------------------------- 1 | /** 2 | Scaling benchmark: we want to evaluate the bandwdith 3 | achieved between two ranks. 
The FPGA are connected in a chain 4 | so we can decide the distance at which they are 5 | 6 | RANK 0 is the source of the data 7 | */ 8 | 9 | #include 10 | 11 | __kernel void app(const int N, const char dest_rank, const SMI_Comm comm) 12 | { 13 | 14 | SMI_Channel chan=SMI_Open_send_channel_ad(N, SMI_DOUBLE, dest_rank, 0, comm, 2048); 15 | const double start=0.1f; 16 | for(int i=0;i 11 | 12 | __kernel void app(__global char *mem, const int N, SMI_Comm comm) 13 | { 14 | SMI_Channel chan=SMI_Open_receive_channel_ad(N, SMI_DOUBLE, 0, 0, comm, 2048); 15 | const double start=0.1f; 16 | char check=1; 17 | for(int i=0;i 8 | 9 | __kernel void app(__global char* mem, const int N, char root,SMI_Comm comm) 10 | { 11 | char check=1; 12 | SMI_BChannel __attribute__((register)) chan= SMI_Open_bcast_channel(N, SMI_FLOAT,0, root,comm); 13 | //SMI_BChannel chan= SMI_Open_bcast_channel(N, SMI_FLOAT,0, root,comm); 14 | for(int i=0;i 7 | 8 | 9 | __kernel void app(const int N, char root, __global char *mem, SMI_Comm comm) 10 | { 11 | SMI_GatherChannel __attribute__((register)) chan= SMI_Open_gather_channel(N,N, SMI_INT,0, root,comm); 12 | int my_rank=SMI_Comm_rank(comm); 13 | int num_ranks=SMI_Comm_size(comm); 14 | const int loop_bound=(my_rank==root)?N*num_ranks:N; 15 | int to_send=(my_rank==root)?0:my_rank*N; //starting number 16 | char check=1; 17 | for(int i=0;i 9 | 10 | 11 | 12 | __kernel void app(const int N, const char dst, SMI_Comm comm) 13 | { 14 | for(int i=0;i 9 | 10 | 11 | __kernel void app(const int N,SMI_Comm comm) 12 | { 13 | int rcv; 14 | for(int i=0;i 17 | 18 | 19 | 20 | __kernel void app(const int N, char dest_rank,SMI_Comm comm) 21 | { 22 | int to_send; 23 | for(int i=0;i 18 | 19 | 20 | __kernel void app(const int N, SMI_Comm comm) 21 | { 22 | int to_send; 23 | for(int i=0;i 16 | 17 | 18 | __kernel void sequential_collectives(const int N, char root, __global volatile char *mem, SMI_Comm comm) 19 | { 20 | unsigned int my_rank=SMI_Comm_rank(comm); 21 | unsigned int 
num_ranks=SMI_Comm_size(comm); 22 | float start_float=1.1f; 23 | int start_int=1; 24 | char check=1; 25 | //first execute the broadcast 26 | SMI_BChannel __attribute__((register)) bchan_float= SMI_Open_bcast_channel(N, SMI_FLOAT,0, root,comm); 27 | for(int i=0;i 8 | 9 | __kernel void app(const int N, char root, __global volatile char *mem, SMI_Comm comm) 10 | { 11 | unsigned int my_rank=SMI_Comm_rank(comm); 12 | unsigned int num_ranks=SMI_Comm_size(comm); 13 | float exp=(num_ranks*(num_ranks+1))/2; 14 | char check=1; 15 | 16 | SMI_RChannel __attribute__((register)) rchan_float= SMI_Open_reduce_channel(N, SMI_FLOAT, SMI_ADD, 0,root,comm); 17 | for(int i=0;i 8 | 9 | 10 | 11 | __kernel void app(const int N, char root,__global char* mem, SMI_Comm comm) 12 | { 13 | 14 | SMI_ScatterChannel __attribute__((register)) chan= SMI_Open_scatter_channel(N,N, SMI_INT, 0,root,comm); 15 | char check=1; 16 | int num_ranks=SMI_Comm_size(comm); 17 | int my_rank=SMI_Comm_rank(comm); 18 | const int loop_bound=(my_rank==root)?N*num_ranks:N; 19 | const int to_rcv_start=my_rank*N; 20 | for(int i=0;i 4 | #include 5 | 6 | #include 7 | #include 8 | 9 | using namespace clang; 10 | using namespace llvm; 11 | 12 | bool SpecializeCallsConsumer::HandleTopLevelDecl(DeclGroupRef group) 13 | { 14 | for (auto& decl: group) 15 | { 16 | this->visitor.TraverseDecl(decl); 17 | } 18 | return true; 19 | } 20 | 21 | bool SpecializeCallsAction::PrepareToExecuteAction(CompilerInstance& compiler) 22 | { 23 | compiler.getPreprocessorOpts().addMacroDef("SMI_REWRITER"); 24 | compiler.getDiagnostics().setErrorLimit(9999); 25 | return true; 26 | } 27 | std::unique_ptr SpecializeCallsAction::CreateASTConsumer( 28 | CompilerInstance& compiler, 29 | StringRef file) 30 | { 31 | this->rewriter.setSourceMgr(compiler.getSourceManager(), compiler.getLangOpts()); 32 | return std::make_unique(this->rewriter); 33 | } 34 | void SpecializeCallsAction::EndSourceFileAction() 35 | { 36 | this->rewriter.overwriteChangedFiles(); 37 | } 
38 | -------------------------------------------------------------------------------- /source-rewriter/src/action.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "rewrite.h" 4 | 5 | #include 6 | #include 7 | 8 | class SpecializeCallsConsumer: public clang::ASTConsumer 9 | { 10 | public: 11 | explicit SpecializeCallsConsumer(clang::Rewriter& rewriter) 12 | : visitor(rewriter) 13 | { 14 | 15 | } 16 | 17 | bool HandleTopLevelDecl(clang::DeclGroupRef group) override; 18 | 19 | private: 20 | RewriteKernelsVisitor visitor; 21 | }; 22 | 23 | class SpecializeCallsAction: public clang::ASTFrontendAction 24 | { 25 | public: 26 | bool PrepareToExecuteAction(clang::CompilerInstance& compiler); 27 | 28 | std::unique_ptr CreateASTConsumer( 29 | clang::CompilerInstance& compiler, 30 | llvm::StringRef file) override; 31 | void EndSourceFileAction() override; 32 | 33 | private: 34 | clang::Rewriter rewriter; 35 | }; 36 | -------------------------------------------------------------------------------- /source-rewriter/src/main.cpp: -------------------------------------------------------------------------------- 1 | #include "action.h" 2 | 3 | #include 4 | #include 5 | 6 | using namespace clang::tooling; 7 | 8 | static llvm::cl::OptionCategory Opt("SMI kernel rewriter"); 9 | 10 | int main(int argc, const char** argv) 11 | { 12 | CommonOptionsParser op(argc, argv, Opt); 13 | ClangTool Tool(op.getCompilations(), op.getSourcePathList()); 14 | 15 | return Tool.run(clang::tooling::newFrontendActionFactory().get()); 16 | } 17 | 18 | // TODO: matchers 19 | // TODO: run action manually 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/broadcast.cpp: -------------------------------------------------------------------------------- 1 | #include "broadcast.h" 2 | #include "utils.h" 3 | 4 | using namespace clang; 5 | 6 | static OperationMetadata 
extractBroadcast(CallExpr* channelDecl) 7 | { 8 | return OperationMetadata("broadcast", 9 | extractIntArg(channelDecl, 2), 10 | extractDataType(channelDecl, 1), 11 | extractBufferSize(channelDecl, 5) 12 | ); 13 | } 14 | 15 | OperationMetadata BroadcastExtractor::GetOperationMetadata(CallExpr* callExpr) 16 | { 17 | return extractBroadcast(extractChannelDecl(callExpr)); 18 | } 19 | std::string BroadcastExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 20 | { 21 | return "void " + this->RenameCall(callName, metadata) + "(SMI_BChannel* chan, void* data);"; 22 | } 23 | std::vector BroadcastExtractor::GetFunctionNames() 24 | { 25 | return {"SMI_Bcast"}; 26 | } 27 | 28 | OperationMetadata BroadcastChannelExtractor::GetOperationMetadata(CallExpr* callExpr) 29 | { 30 | return extractBroadcast(callExpr); 31 | } 32 | std::string BroadcastChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 33 | { 34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_BChannel", "int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm"); 35 | } 36 | std::string BroadcastChannelExtractor::GetChannelFunctionName() 37 | { 38 | return "SMI_Open_bcast_channel"; 39 | } 40 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/broadcast.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ops.h" 4 | 5 | class BroadcastExtractor: public OperationExtractor 6 | { 7 | public: 8 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 9 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 10 | std::vector GetFunctionNames() override; 11 | }; 12 | 13 | class BroadcastChannelExtractor: public ChannelExtractor 14 | { 15 | public: 16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) 
override; 17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 18 | std::string GetChannelFunctionName() override; 19 | }; 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/gather.cpp: -------------------------------------------------------------------------------- 1 | #include "gather.h" 2 | #include "utils.h" 3 | 4 | using namespace clang; 5 | 6 | static OperationMetadata extractGather(CallExpr* channelDecl) 7 | { 8 | return OperationMetadata("gather", 9 | extractIntArg(channelDecl, 3), 10 | extractDataType(channelDecl, 2), 11 | extractBufferSize(channelDecl, 6) 12 | ); 13 | } 14 | 15 | OperationMetadata GatherExtractor::GetOperationMetadata(CallExpr* callExpr) 16 | { 17 | return extractGather(extractChannelDecl(callExpr)); 18 | } 19 | std::string GatherExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 20 | { 21 | return "void " + this->RenameCall(callName, metadata) + "(SMI_GatherChannel* chan, void* send_data, void* rcv_data);"; 22 | } 23 | std::vector GatherExtractor::GetFunctionNames() 24 | { 25 | return {"SMI_Gather"}; 26 | } 27 | 28 | OperationMetadata GatherChannelExtractor::GetOperationMetadata(CallExpr* callExpr) 29 | { 30 | return extractGather(callExpr); 31 | } 32 | std::string GatherChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 33 | { 34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_GatherChannel", "int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm"); 35 | } 36 | std::string GatherChannelExtractor::GetChannelFunctionName() 37 | { 38 | return "SMI_Open_gather_channel"; 39 | } 40 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/gather.h: -------------------------------------------------------------------------------- 1 | 
#pragma once 2 | 3 | #include "ops.h" 4 | 5 | class GatherExtractor: public OperationExtractor 6 | { 7 | public: 8 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 9 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 10 | std::vector GetFunctionNames() override; 11 | }; 12 | 13 | class GatherChannelExtractor: public ChannelExtractor 14 | { 15 | public: 16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 18 | std::string GetChannelFunctionName() override; 19 | }; 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/ops.cpp: -------------------------------------------------------------------------------- 1 | #include "ops.h" 2 | #include "utils.h" 3 | 4 | #include "../third-party/json.hpp" 5 | 6 | #include 7 | 8 | using json = nlohmann::json; 9 | using namespace clang; 10 | 11 | std::string OperationExtractor::RenameCall(const std::string& callName, const OperationMetadata& metadata) 12 | { 13 | return renamePortDataType(callName, metadata); 14 | } 15 | 16 | OperationMetadata OperationExtractor::ModifyCall(clang::Rewriter& rewriter, clang::CallExpr& callExpr, const std::string& callName) 17 | { 18 | auto metadata = this->GetOperationMetadata(&callExpr); 19 | auto renamed = this->RenameCall(callName, metadata); 20 | rewriter.ReplaceText(callExpr.getBeginLoc(), renamed); 21 | return metadata; 22 | } 23 | 24 | void ChannelExtractor::OutputMetadata(const OperationMetadata& metadata, std::ostream& os) 25 | { 26 | json obj; 27 | obj["type"] = metadata.operation; 28 | obj["port"] = metadata.port; 29 | obj["data_type"] = formatDataType(metadata.dataType); 30 | 31 | if (metadata.isBufferSizeSet()) 32 | { 33 | obj["buffer_size"] = metadata.bufferSize; 34 | } 35 | else obj["buffer_size"] = nullptr; 36 | 
37 | obj["args"] = metadata.args; 38 | 39 | os << obj.dump() << std::endl; 40 | } 41 | 42 | // https://stackoverflow.com/a/874160/1107768 43 | static bool ends(const std::string& str, const std::string& end) 44 | { 45 | if (str.length() >= end.length()) 46 | { 47 | return (0 == str.compare(str.length() - end.length(), end.length(), end)); 48 | } 49 | else return false; 50 | } 51 | static bool isExtendedChannelOpen(const std::string& callName) 52 | { 53 | return ends(callName, "_ad"); 54 | } 55 | 56 | std::string ChannelExtractor::CreateChannelDeclaration(const std::string& callName, const OperationMetadata& metadata, 57 | const std::string& returnType, const std::string& parameters) 58 | { 59 | std::stringstream ss; 60 | ss << returnType << " " << this->RenameCall(callName, metadata) << "(" << parameters << ");"; 61 | 62 | return ss.str(); 63 | } 64 | 65 | std::string ChannelExtractor::RenameCall(const std::string& callName, const OperationMetadata& metadata) 66 | { 67 | auto name = callName; 68 | if (isExtendedChannelOpen(name)) 69 | { 70 | name.resize(name.size() - 3); 71 | } 72 | return OperationExtractor::RenameCall(name, metadata); 73 | } 74 | 75 | std::vector ChannelExtractor::GetFunctionNames() 76 | { 77 | auto name = this->GetChannelFunctionName(); 78 | return {name, name + "_ad"}; 79 | } 80 | 81 | OperationMetadata ChannelExtractor::ModifyCall(Rewriter& rewriter, CallExpr& callExpr, const std::string& callName) 82 | { 83 | auto metadata = OperationExtractor::ModifyCall(rewriter, callExpr, callName); 84 | if (isExtendedChannelOpen(callName) && callExpr.getNumArgs() >= 2) 85 | { 86 | auto lastArg = callExpr.getArgs()[callExpr.getNumArgs() - 1]; 87 | auto previousArg = callExpr.getArgs()[callExpr.getNumArgs() - 2]; 88 | auto end0 = Lexer::getLocForEndOfToken(previousArg->getEndLoc(), 0, rewriter.getSourceMgr(), rewriter.getLangOpts()); 89 | auto end1 = Lexer::getLocForEndOfToken(lastArg->getEndLoc(), 0, rewriter.getSourceMgr(), rewriter.getLangOpts()); 90 | 
rewriter.RemoveText(CharSourceRange::getCharRange(end0, end1)); 91 | } 92 | return metadata; 93 | } 94 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | enum class DataType { 12 | Char, 13 | Short, 14 | Int, 15 | Float, 16 | Double 17 | }; 18 | 19 | class OperationMetadata 20 | { 21 | public: 22 | OperationMetadata(std::string operation, 23 | size_t port, 24 | DataType dataType = DataType::Int, 25 | int bufferSize = -1, 26 | std::unordered_map args = {}) 27 | : operation(std::move(operation)), port(port), dataType(dataType), bufferSize(bufferSize), args(std::move(args)) 28 | { 29 | 30 | } 31 | 32 | bool isBufferSizeSet() const 33 | { 34 | return this->bufferSize != -1; 35 | } 36 | 37 | std::string operation; 38 | size_t port; 39 | DataType dataType; 40 | int bufferSize; 41 | std::unordered_map args; 42 | }; 43 | 44 | class OperationExtractor 45 | { 46 | public: 47 | virtual ~OperationExtractor() = default; 48 | 49 | /** 50 | * Extract metadata from a call expression. 51 | * The metadata should contain the name of the operation, the used logical port and potentially other arguments 52 | * like data type or reduce operation. 53 | */ 54 | virtual OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) = 0; 55 | 56 | /** 57 | * Rename a function call given the extracted metadata. 58 | * For example for callName="SMI_Push" and Metadata with port 0, it should return "SMI_Push_0". 59 | */ 60 | virtual std::string RenameCall(const std::string& callName, const OperationMetadata& metadata); 61 | 62 | /** 63 | * Forward declare a renamed function call. 
64 | * For example if RenameCall returned "SMI_Push_0" for a given metadata, this function should return 65 | * "void SMI_Push_0(SMI_Channel *chan, void* data);" for the same metadata. 66 | */ 67 | virtual std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) = 0; 68 | 69 | /** 70 | * Outputs the serialized metadata to the given stream. 71 | */ 72 | virtual void OutputMetadata(const OperationMetadata& metadata, std::ostream& os) 73 | { 74 | 75 | } 76 | 77 | /** 78 | * Returns the function names that should invoke this extractor. 79 | */ 80 | virtual std::vector GetFunctionNames() = 0; 81 | 82 | /** 83 | * Rewrite the function call if necessary. 84 | */ 85 | virtual OperationMetadata ModifyCall(clang::Rewriter& rewriter, clang::CallExpr& callExpr, const std::string& callName); 86 | }; 87 | 88 | class ChannelExtractor: public OperationExtractor 89 | { 90 | public: 91 | std::string RenameCall(const std::string& callName, const OperationMetadata& metadata) override; 92 | 93 | /** 94 | * Outputs the serialized metadata to the given stream. 
95 | */ 96 | void OutputMetadata(const OperationMetadata& metadata, std::ostream& os) override; 97 | std::vector GetFunctionNames() final; 98 | 99 | OperationMetadata ModifyCall(clang::Rewriter& rewriter, clang::CallExpr& callExpr, const std::string& callName) override; 100 | 101 | virtual std::string GetChannelFunctionName() = 0; 102 | 103 | protected: 104 | std::string CreateChannelDeclaration( 105 | const std::string& callName, 106 | const OperationMetadata& metadata, 107 | const std::string& returnType, 108 | const std::string& parameters); 109 | }; 110 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/pop.cpp: -------------------------------------------------------------------------------- 1 | #include "pop.h" 2 | #include "utils.h" 3 | 4 | using namespace clang; 5 | 6 | static OperationMetadata extractPop(CallExpr* channelDecl) 7 | { 8 | return OperationMetadata("pop", 9 | extractIntArg(channelDecl, 3), 10 | extractDataType(channelDecl, 1), 11 | extractBufferSize(channelDecl, 5) 12 | ); 13 | } 14 | 15 | OperationMetadata PopExtractor::GetOperationMetadata(CallExpr* callExpr) 16 | { 17 | return extractPop(extractChannelDecl(callExpr)); 18 | } 19 | std::string PopExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 20 | { 21 | return "void " + this->RenameCall(callName, metadata) + "(SMI_Channel* chan, void* data);"; 22 | } 23 | std::vector PopExtractor::GetFunctionNames() 24 | { 25 | return {"SMI_Pop"}; 26 | } 27 | 28 | OperationMetadata PopChannelExtractor::GetOperationMetadata(CallExpr* callExpr) 29 | { 30 | return extractPop(callExpr); 31 | } 32 | std::string PopChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 33 | { 34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_Channel", "int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm"); 35 | } 36 | std::string 
PopChannelExtractor::GetChannelFunctionName() 37 | { 38 | return "SMI_Open_receive_channel"; 39 | } 40 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/pop.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ops.h" 4 | 5 | class PopExtractor: public OperationExtractor 6 | { 7 | public: 8 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 9 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 10 | std::vector GetFunctionNames() override; 11 | }; 12 | 13 | class PopChannelExtractor: public ChannelExtractor 14 | { 15 | public: 16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 18 | std::string GetChannelFunctionName() override; 19 | }; 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/push.cpp: -------------------------------------------------------------------------------- 1 | #include "push.h" 2 | #include "utils.h" 3 | 4 | using namespace clang; 5 | 6 | static OperationMetadata extractPush(CallExpr* channelDecl) 7 | { 8 | return OperationMetadata("push", 9 | extractIntArg(channelDecl, 3), 10 | extractDataType(channelDecl, 1), 11 | extractBufferSize(channelDecl, 5) 12 | ); 13 | } 14 | 15 | OperationMetadata PushExtractor::GetOperationMetadata(CallExpr* callExpr) 16 | { 17 | return extractPush(extractChannelDecl(callExpr)); 18 | } 19 | std::string PushExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 20 | { 21 | std::string args = "(SMI_Channel* chan, void* data"; 22 | if (callName == "SMI_Push_flush") 23 | { 24 | args += ", int immediate"; 25 | } 26 | 27 | return "void " + this->RenameCall(callName, metadata) + args + ");"; 
28 | } 29 | std::vector PushExtractor::GetFunctionNames() 30 | { 31 | return {"SMI_Push"}; 32 | } 33 | 34 | OperationMetadata PushChannelExtractor::GetOperationMetadata(CallExpr* callExpr) 35 | { 36 | return extractPush(callExpr); 37 | } 38 | std::string PushChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 39 | { 40 | return this->CreateChannelDeclaration(callName, metadata, "SMI_Channel", "int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm"); 41 | } 42 | std::string PushChannelExtractor::GetChannelFunctionName() 43 | { 44 | return "SMI_Open_send_channel"; 45 | } 46 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/push.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ops.h" 4 | 5 | class PushExtractor: public OperationExtractor 6 | { 7 | public: 8 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 9 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 10 | std::vector GetFunctionNames() override; 11 | }; 12 | 13 | class PushChannelExtractor: public ChannelExtractor 14 | { 15 | public: 16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 18 | std::string GetChannelFunctionName() override; 19 | }; 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/reduce.cpp: -------------------------------------------------------------------------------- 1 | #include "reduce.h" 2 | #include "utils.h" 3 | 4 | using namespace clang; 5 | 6 | static std::string formatReduceOp(int op) 7 | { 8 | switch (op) 9 | { 10 | case 0: return "add"; 11 | case 1: return "max"; 12 | case 2: return "min"; 13 | } 14 | 15 | 
assert(false); 16 | return ""; 17 | } 18 | 19 | static OperationMetadata extractReduce(CallExpr* channelDecl) 20 | { 21 | return OperationMetadata("reduce", 22 | extractIntArg(channelDecl, 3), 23 | extractDataType(channelDecl, 1), 24 | extractBufferSize(channelDecl, 6), 25 | { {"op_type", formatReduceOp(extractIntArg(channelDecl, 2))} } 26 | ); 27 | } 28 | 29 | OperationMetadata ReduceExtractor::GetOperationMetadata(CallExpr* callExpr) 30 | { 31 | return extractReduce(extractChannelDecl(callExpr)); 32 | } 33 | std::string ReduceExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 34 | { 35 | return "void " + this->RenameCall(callName, metadata) + "(SMI_RChannel* chan, void* data_snd, void* data_rcv);"; 36 | } 37 | std::vector ReduceExtractor::GetFunctionNames() 38 | { 39 | return {"SMI_Reduce"}; 40 | } 41 | 42 | OperationMetadata ReduceChannelExtractor::GetOperationMetadata(CallExpr* callExpr) 43 | { 44 | return extractReduce(callExpr); 45 | } 46 | std::string ReduceChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 47 | { 48 | return this->CreateChannelDeclaration(callName, metadata, "SMI_RChannel", "int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm"); 49 | } 50 | std::string ReduceChannelExtractor::GetChannelFunctionName() 51 | { 52 | return "SMI_Open_reduce_channel"; 53 | } 54 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/reduce.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ops.h" 4 | 5 | class ReduceExtractor: public OperationExtractor 6 | { 7 | public: 8 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 9 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 10 | std::vector GetFunctionNames() override; 11 | }; 12 | 13 | class 
ReduceChannelExtractor: public ChannelExtractor 14 | { 15 | public: 16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 18 | std::string GetChannelFunctionName() override; 19 | }; 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/scatter.cpp: -------------------------------------------------------------------------------- 1 | #include "scatter.h" 2 | #include "utils.h" 3 | 4 | using namespace clang; 5 | 6 | static OperationMetadata extractScatter(CallExpr* channelDecl) 7 | { 8 | return OperationMetadata("scatter", 9 | extractIntArg(channelDecl, 3), 10 | extractDataType(channelDecl, 2), 11 | extractBufferSize(channelDecl, 6) 12 | ); 13 | } 14 | 15 | OperationMetadata ScatterExtractor::GetOperationMetadata(CallExpr* callExpr) 16 | { 17 | return extractScatter(extractChannelDecl(callExpr)); 18 | } 19 | std::string ScatterExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 20 | { 21 | return "void " + this->RenameCall(callName, metadata) + "(SMI_ScatterChannel* chan, void* data_snd, void* data_rcv);"; 22 | } 23 | std::vector ScatterExtractor::GetFunctionNames() 24 | { 25 | return {"SMI_Scatter"}; 26 | } 27 | 28 | OperationMetadata ScatterChannelExtractor::GetOperationMetadata(CallExpr* callExpr) 29 | { 30 | return extractScatter(callExpr); 31 | } 32 | std::string ScatterChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) 33 | { 34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_ScatterChannel", "int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm"); 35 | } 36 | std::string ScatterChannelExtractor::GetChannelFunctionName() 37 | { 38 | return "SMI_Open_scatter_channel"; 39 | } 40 | 
-------------------------------------------------------------------------------- /source-rewriter/src/ops/scatter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ops.h" 4 | 5 | class ScatterExtractor: public OperationExtractor 6 | { 7 | public: 8 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 9 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 10 | std::vector GetFunctionNames() override; 11 | }; 12 | 13 | class ScatterChannelExtractor: public ChannelExtractor 14 | { 15 | public: 16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; 17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; 18 | std::string GetChannelFunctionName() override; 19 | }; 20 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | using namespace clang; 4 | 5 | bool FindIntegerLiteral::VisitIntegerLiteral(IntegerLiteral* literal) 6 | { 7 | this->setValue(literal->getValue().getZExtValue()); 8 | return false; 9 | } 10 | bool FindIntegerLiteral::VisitDeclRefExpr(DeclRefExpr* expr) 11 | { 12 | auto decl = expr->getDecl(); 13 | if (auto enumeration = dyn_cast(decl)) 14 | { 15 | this->setValue(enumeration->getInitVal().getZExtValue()); 16 | } 17 | else if (auto varDecl = dyn_cast(decl)) 18 | { 19 | FindIntegerLiteral visitor; 20 | visitor.TraverseDecl(varDecl); 21 | if (visitor.valueFound) 22 | { 23 | this->value = visitor.value; 24 | this->valueFound = true; 25 | } 26 | } 27 | return false; 28 | } 29 | size_t FindIntegerLiteral::getValue() const 30 | { 31 | assert(this->valueFound); 32 | return this->value; 33 | } 34 | void FindIntegerLiteral::setValue(size_t value) 35 | { 36 | 
assert(!this->valueFound); 37 | this->valueFound = true; 38 | this->value = value; 39 | } 40 | 41 | size_t extractIntArg(CallExpr* expr, int argumentIndex) 42 | { 43 | auto arg = expr->getArgs()[argumentIndex]; 44 | 45 | FindIntegerLiteral visitor; 46 | visitor.TraverseStmt(arg); 47 | return visitor.getValue(); 48 | } 49 | 50 | std::string formatDataType(DataType dataType) 51 | { 52 | switch (dataType) 53 | { 54 | case DataType::Char: return "char"; 55 | case DataType::Short: return "short"; 56 | case DataType::Int: return "int"; 57 | case DataType::Float: return "float"; 58 | case DataType::Double: return "double"; 59 | } 60 | 61 | assert(false); 62 | return ""; 63 | } 64 | 65 | std::string renamePortDataType(const std::string& callName, const OperationMetadata& metadata) 66 | { 67 | auto call = callName + "_" + std::to_string(metadata.port); 68 | return call + "_" + formatDataType(metadata.dataType); 69 | } 70 | 71 | DataType extractDataType(CallExpr* expr, int argumentIndex) 72 | { 73 | size_t arg = extractIntArg(expr, argumentIndex); 74 | assert(arg >= 1 && arg <= 5); 75 | 76 | switch (arg) 77 | { 78 | case 1: return DataType::Int; 79 | case 2: return DataType::Float; 80 | case 3: return DataType::Double; 81 | case 4: return DataType::Char; 82 | case 5: return DataType::Short; 83 | default: 84 | assert(false); 85 | } 86 | return DataType::Int; 87 | } 88 | 89 | class FindVarDecl: public RecursiveASTVisitor 90 | { 91 | public: 92 | bool VisitVarDecl(VarDecl* decl) 93 | { 94 | assert(!this->decl); 95 | this->decl = decl; 96 | 97 | return false; 98 | } 99 | bool VisitDeclRefExpr(DeclRefExpr* expr) 100 | { 101 | this->TraverseDecl(expr->getDecl()); 102 | return true; 103 | } 104 | 105 | VarDecl* decl = nullptr; 106 | }; 107 | 108 | CallExpr* extractChannelDecl(CallExpr* expr) 109 | { 110 | assert(expr->getNumArgs() > 0); 111 | auto& arg = expr->getArgs()[0]; 112 | FindVarDecl visitor; 113 | visitor.TraverseStmt(arg); 114 | assert(visitor.decl); 115 | auto 
channelDecl = visitor.decl; 116 | auto callExpr = dyn_cast(channelDecl->getInit()); 117 | assert(callExpr); 118 | return callExpr; 119 | } 120 | 121 | size_t extractBufferSize(CallExpr* callExpr, int argumentIndex) 122 | { 123 | if (argumentIndex >= callExpr->getNumArgs()) return -1; 124 | return extractIntArg(callExpr, argumentIndex); 125 | } 126 | -------------------------------------------------------------------------------- /source-rewriter/src/ops/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #include "ops.h" 7 | 8 | class FindIntegerLiteral: public clang::RecursiveASTVisitor 9 | { 10 | public: 11 | bool VisitIntegerLiteral(clang::IntegerLiteral* literal); 12 | bool VisitDeclRefExpr(clang::DeclRefExpr* expr); 13 | 14 | size_t getValue() const; 15 | 16 | private: 17 | void setValue(size_t value); 18 | 19 | bool valueFound = false; 20 | size_t value; 21 | }; 22 | 23 | std::string formatDataType(DataType dataType); 24 | 25 | std::string renamePortDataType(const std::string& callName, const OperationMetadata& metadata); 26 | 27 | size_t extractIntArg(clang::CallExpr* expr, int argumentIndex); 28 | size_t extractBufferSize(clang::CallExpr* expr, int argumentIndex); 29 | DataType extractDataType(clang::CallExpr* expr, int argumentIndex); 30 | clang::CallExpr* extractChannelDecl(clang::CallExpr* expr); 31 | -------------------------------------------------------------------------------- /source-rewriter/src/rewrite.cpp: -------------------------------------------------------------------------------- 1 | #include "rewrite.h" 2 | #include "utils.h" 3 | 4 | #include "ops/ops.h" 5 | #include "ops/push.h" 6 | #include "ops/pop.h" 7 | #include "ops/broadcast.h" 8 | #include "ops/scatter.h" 9 | #include "ops/gather.h" 10 | #include "ops/reduce.h" 11 | 12 | #include 13 | 14 | using namespace clang; 15 | 16 | struct Rewrite 17 | { 18 | public: 19 | Rewrite(OperationMetadata 
metadata, std::string callName, OperationExtractor* extractor) 20 | : metadata(std::move(metadata)), callName(std::move(callName)), extractor(extractor) 21 | { 22 | 23 | } 24 | 25 | OperationMetadata metadata; 26 | std::string callName; 27 | OperationExtractor* extractor; 28 | }; 29 | 30 | class RewriteOpsVisitor: public RecursiveASTVisitor 31 | { 32 | public: 33 | explicit RewriteOpsVisitor(Rewriter& rewriter) : rewriter(rewriter) 34 | { 35 | this->extractors.push_back(std::make_unique()); 36 | this->extractors.push_back(std::make_unique()); 37 | this->extractors.push_back(std::make_unique()); 38 | this->extractors.push_back(std::make_unique()); 39 | this->extractors.push_back(std::make_unique()); 40 | this->extractors.push_back(std::make_unique()); 41 | this->extractors.push_back(std::make_unique()); 42 | this->extractors.push_back(std::make_unique()); 43 | this->extractors.push_back(std::make_unique()); 44 | this->extractors.push_back(std::make_unique()); 45 | this->extractors.push_back(std::make_unique()); 46 | this->extractors.push_back(std::make_unique()); 47 | 48 | for (auto& extractor: this->extractors) 49 | { 50 | for (auto& fn: extractor->GetFunctionNames()) 51 | { 52 | this->callMap[fn] = extractor.get(); 53 | } 54 | } 55 | } 56 | 57 | bool VisitCallExpr(CallExpr* expr) 58 | { 59 | auto callee = expr->getDirectCallee(); 60 | if (callee) 61 | { 62 | auto name = callee->getName().str(); 63 | auto it = this->callMap.find(name); 64 | if (it != this->callMap.end()) 65 | { 66 | auto& extractor = it->second; 67 | 68 | auto metadata = extractor->ModifyCall(this->rewriter, *expr, name); 69 | this->rewrites.emplace_back(metadata, name, extractor); 70 | } 71 | } 72 | 73 | return true; 74 | } 75 | 76 | const std::vector& getRewrites() const 77 | { 78 | return this->rewrites; 79 | } 80 | 81 | private: 82 | Rewriter& rewriter; 83 | 84 | std::vector> extractors; 85 | std::unordered_map callMap; 86 | std::vector rewrites; 87 | }; 88 | 89 | /** 90 | * Only visit 
functions that are marked as user device kernels. 91 | */ 92 | bool RewriteKernelsVisitor::VisitFunctionDecl(FunctionDecl* f) 93 | { 94 | bool isKernel = isKernelFunction(f); 95 | if (isKernel) 96 | { 97 | std::cerr << "SMI: rewriting function " << f->getName().str() << std::endl; 98 | 99 | RewriteOpsVisitor visitor(this->rewriter); 100 | visitor.TraverseFunctionDecl(f); 101 | 102 | for (auto& rewrite: visitor.getRewrites()) 103 | { 104 | rewrite.extractor->OutputMetadata(rewrite.metadata, std::cout); 105 | std::cerr << "SMI: rewrote "; 106 | rewrite.extractor->OutputMetadata(rewrite.metadata, std::cerr); 107 | this->rewriter.InsertTextBefore(f->getBeginLoc(), rewrite.extractor->CreateDeclaration(rewrite.callName, 108 | rewrite.metadata) + "\n"); 109 | } 110 | } 111 | 112 | return false; 113 | } 114 | -------------------------------------------------------------------------------- /source-rewriter/src/rewrite.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | class RewriteKernelsVisitor: public clang::RecursiveASTVisitor 7 | { 8 | public: 9 | explicit RewriteKernelsVisitor(clang::Rewriter& rewriter) : rewriter(rewriter) { } 10 | 11 | bool VisitFunctionDecl(clang::FunctionDecl *f); 12 | 13 | private: 14 | clang::Rewriter& rewriter; 15 | }; 16 | -------------------------------------------------------------------------------- /source-rewriter/src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | 5 | using namespace clang; 6 | 7 | bool isKernelFunction(FunctionDecl* decl) 8 | { 9 | if (!decl) return false; 10 | if (!decl->hasBody()) return false; 11 | 12 | std::unordered_set validAttrs = { 13 | "kernel", 14 | "__kernel" 15 | }; 16 | 17 | for (auto& attr: decl->attrs()) 18 | { 19 | auto spelling = std::string(attr->getSpelling()); 20 | if (validAttrs.find(spelling) != validAttrs.end()) 21 | { 22 | 
# Download and unpack googletest at configure time

configure_file(${PROJECT_SOURCE_DIR}/CMakeGTEST.txt.in googletest-download/CMakeLists.txt)


execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
        RESULT_VARIABLE result
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
    message(FATAL_ERROR "CMake step for googletest failed: ${result}")
endif()
execute_process(COMMAND ${CMAKE_COMMAND} --build .
        RESULT_VARIABLE result
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
if(result)
    message(FATAL_ERROR "Build step for googletest failed: ${result}")
endif()

# Prevent overriding the parent project's compiler/linker
# settings on Windows
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)

# Add googletest directly to our build. This defines
# the gtest and gtest_main targets.
add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
        ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
        EXCLUDE_FROM_ALL)

# The gtest/gtest_main targets carry header search path
# dependencies automatically when using CMake 2.8.11 or
# later. Otherwise we have to add them here ourselves.
if (CMAKE_VERSION VERSION_LESS 2.8.11)
    include_directories("${gtest_SOURCE_DIR}/include")
endif()

# Now simply link against gtest or gtest_main as needed.
include_directories(${CMAKE_CURRENT_BINARY_DIR})
include_directories("${gtest_SOURCE_DIR}/include")
# NOTE(review): -lgtest is a linker flag placed in the compiler flags; every
# test host below already links gtest through target_link_libraries(), so this
# looks redundant. Kept unchanged to preserve the existing build behaviour.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgtest")

# Value used for the emulator device count, for `mpirun -np`, and as the last
# argument of smi_target() (every test previously hard-coded 8 in all three
# places).
set(SMI_TEST_RANKS 8)

# Every test lives in its own directory <name>/ containing <name>.json,
# test_<name>.cpp and the kernel source(s). All tests use a single <name>.cl
# kernel file except p2p, which has one kernel file for rank 0 and one for the
# other ranks.
foreach(_test p2p broadcast reduce scatter gather mixed)
    set(_dir "${CMAKE_CURRENT_SOURCE_DIR}/${_test}")

    if("${_test}" STREQUAL "p2p")
        set(_kernels "${_dir}/p2p_rank0.cl;${_dir}/p2p_rank1.cl")
    else()
        set(_kernels "${_dir}/${_test}.cl")
    endif()

    smi_target(test_${_test} "${_dir}/${_test}.json" "${_dir}/test_${_test}.cpp" "${_kernels}" ${SMI_TEST_RANKS})
    target_link_libraries(test_${_test}_host gtest)

    add_test(
        NAME ${_test}
        COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=${SMI_TEST_RANKS} mpirun -np ${SMI_TEST_RANKS} test_${_test}_host
        WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_${_test}/"
    )
endforeach()
21 | 22 | 23 | All the tests have a timeout to ensure that a deadlock does not stall the testing procedure. 24 | However, note that emulation can be slow: if a test times out, try re-running it 25 | or increasing the timeout. 26 | For each test, the timeout value is defined as a macro at the beginning of the respective .cpp file. 27 | 28 | 29 | To test a primitive, in the `test` folder of the CMake build folder: 30 | 31 | 1. compile the emulated bitstream 32 | 33 | `make test_<name>_emulator` 34 | 35 | 2. compile the test program 36 | 37 | `make test_<name>_host` 38 | 39 | 3. execute the test program from the respective working directory `test_<name>/` 40 | 41 | `env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test_<name>_host` 42 | 43 | or simply use the integration with `ctest` 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /test/broadcast/broadcast.cl: -------------------------------------------------------------------------------- 1 | /** 2 | Broadcast test. A sequence of numbers is broadcast.
3 | Non-root ranks check whether the received number is correct 4 | */ 5 | 6 | #include 7 | 8 | 9 | __kernel void test_int(__global char* mem, const int N, char root,SMI_Comm comm) 10 | { 11 | char check=1; 12 | SMI_BChannel __attribute__((register)) chan= SMI_Open_bcast_channel(N, SMI_INT,0, root,comm); 13 | for(int i=0;i 7 | 8 | __kernel void test_char(const int N, char root, __global char *mem, SMI_Comm comm) 9 | { 10 | SMI_GatherChannel __attribute__((register)) chan= SMI_Open_gather_channel(N,N, SMI_CHAR,0, root,comm); 11 | int my_rank=SMI_Comm_rank(comm); 12 | int num_ranks=SMI_Comm_size(comm); 13 | const int loop_bound=(my_rank==root)?N*num_ranks:N; 14 | char to_send=my_rank; //starting point 15 | char exp=0; 16 | char check=1; 17 | int rcv=0; 18 | for(int i=0;i 11 | __kernel void test_int(int start, const SMI_Comm comm,__global int *mem) 12 | { 13 | unsigned int my_rank=SMI_Comm_rank(comm); 14 | unsigned int num_ranks=SMI_Comm_size(comm); 15 | SMI_Channel chans=SMI_Open_send_channel(1,SMI_INT,my_rank+1,0,comm); 16 | SMI_Channel chanr=SMI_Open_receive_channel(1,SMI_INT,my_rank-1,0,comm); 17 | int data; 18 | if(my_rank>0) 19 | { 20 | SMI_Pop(&chanr, &data); 21 | data++; 22 | } 23 | else 24 | data=start; 25 | 26 | if(my_rank.aocx" 7 | */ 8 | 9 | 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include "smi_generated_host.c" 24 | #define ROUTING_DIR "smi-routes/" 25 | using namespace std; 26 | std::string program_path; 27 | int rank_count, my_rank; 28 | hlslib::ocl::Context *context; 29 | 30 | SMI_Comm comm; 31 | //https://github.com/google/googletest/issues/348#issuecomment-492785854 32 | #define ASSERT_DURATION_LE(secs, stmt) { \ 33 | std::promise completed; \ 34 | auto stmt_future = completed.get_future(); \ 35 | std::thread([&](std::promise& completed) { \ 36 | stmt; \ 37 | completed.set_value(true); \ 38 | }, 
// Runs `stmt` on a detached helper thread and reports a fatal GoogleTest
// failure if it has not completed within `secs` seconds.
// Adapted from https://github.com/google/googletest/issues/348#issuecomment-492785854
//
// GTEST_FATAL_FAILURE_ returns from the enclosing function, so no statement
// placed after it in the timeout branch can ever run. The MPI_Finalize() call
// that used to follow it was therefore dead code and has been removed; MPI is
// finalized once, in main(), after RUN_ALL_TESTS().
//
// NOTE(review): on timeout the helper thread is still running `stmt` and still
// refers to `completed` and to the caller's locals by reference, although the
// enclosing scope has already been left. Treat a timeout as fatal for the
// whole test process.
#define ASSERT_DURATION_LE(secs, stmt) { \
    std::promise<bool> completed; \
    auto stmt_future = completed.get_future(); \
    std::thread([&](std::promise<bool>& completed) { \
        stmt; \
        completed.set_value(true); \
    }, std::ref(completed)).detach(); \
    if(stmt_future.wait_for(std::chrono::seconds(secs)) == std::future_status::timeout){ \
        GTEST_FATAL_FAILURE_(" timed out (> " #secs \
        " seconds). Check code for infinite loops"); \
    } \
}
program_path = "emulator_/mixed.aocx"; 116 | ::testing::TestEventListeners& listeners = 117 | ::testing::UnitTest::GetInstance()->listeners(); 118 | CHECK_MPI(MPI_Init(&argc, &argv)); 119 | 120 | CHECK_MPI(MPI_Comm_size(MPI_COMM_WORLD, &rank_count)); 121 | CHECK_MPI(MPI_Comm_rank(MPI_COMM_WORLD, &my_rank)); 122 | if (my_rank!= 0) { 123 | delete listeners.Release(listeners.default_result_printer()); 124 | } 125 | 126 | //create environemnt 127 | int fpga=my_rank%2; 128 | program_path = replace(program_path, "", std::to_string(my_rank)); 129 | context = new hlslib::ocl::Context(); 130 | auto program = context->MakeProgram(program_path); 131 | std::vector> buffers; 132 | comm=SmiInit_mixed(my_rank, rank_count, ROUTING_DIR, *context, program, buffers); 133 | 134 | 135 | result = RUN_ALL_TESTS(); 136 | MPI_Finalize(); 137 | 138 | return result; 139 | 140 | } 141 | -------------------------------------------------------------------------------- /test/p2p/p2p.json: -------------------------------------------------------------------------------- 1 | { 2 | "fpgas": { 3 | "fpga-0001:acl0": "p2p_rank0", 4 | "fpga-0001:acl1": "p2p_rank1", 5 | "fpga-0002:acl0": "p2p_rank1", 6 | "fpga-0002:acl1": "p2p_rank1", 7 | "fpga-0003:acl0": "p2p_rank1", 8 | "fpga-0003:acl1": "p2p_rank1", 9 | "fpga-0004:acl0": "p2p_rank1", 10 | "fpga-0004:acl1": "p2p_rank1" 11 | }, 12 | "connections": { 13 | "fpga-0001:acl0:ch2": "fpga-0001:acl1:ch3", 14 | "fpga-0001:acl0:ch3": "fpga-0001:acl1:ch2", 15 | "fpga-0002:acl0:ch2": "fpga-0002:acl1:ch3", 16 | "fpga-0002:acl0:ch3": "fpga-0002:acl1:ch2", 17 | "fpga-0001:acl0:ch1": "fpga-0002:acl0:ch0", 18 | "fpga-0001:acl1:ch1": "fpga-0002:acl1:ch0", 19 | "fpga-0002:acl0:ch1": "fpga-0003:acl0:ch0", 20 | "fpga-0002:acl1:ch1": "fpga-0003:acl1:ch0", 21 | "fpga-0003:acl0:ch2": "fpga-0003:acl1:ch3", 22 | "fpga-0003:acl0:ch3": "fpga-0003:acl1:ch2", 23 | "fpga-0004:acl0:ch2": "fpga-0004:acl1:ch3", 24 | "fpga-0004:acl0:ch3": "fpga-0004:acl1:ch2", 25 | "fpga-0003:acl0:ch1": 
"fpga-0004:acl0:ch0", 26 | "fpga-0003:acl1:ch1": "fpga-0004:acl1:ch0", 27 | "fpga-0001:acl0:ch0": "fpga-0004:acl0:ch1", 28 | "fpga-0001:acl1:ch0": "fpga-0004:acl1:ch1" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /test/p2p/p2p_rank0.cl: -------------------------------------------------------------------------------- 1 | /** 2 | P2P test. Rank 0 sends a stream of data 3 | */ 4 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable 5 | 6 | #include 7 | 8 | __kernel void test_char(const int N, const char dest_rank, const SMI_Comm comm) 9 | { 10 | SMI_Channel chan=SMI_Open_send_channel(N,SMI_CHAR,dest_rank,1,comm); 11 | for(int i=0;i 8 | 9 | __kernel void test_short(__global char *mem, const int N, SMI_Comm comm) 10 | { 11 | SMI_Channel chan=SMI_Open_receive_channel(N,SMI_SHORT,0,0,comm); 12 | char check=1; 13 | const short expected=1001; 14 | for(int i=0;i 7 | 8 | 9 | 10 | __kernel void test_int(const int N, char root,__global char* mem, SMI_Comm comm) 11 | { 12 | 13 | SMI_ScatterChannel __attribute__((register)) chan= SMI_Open_scatter_channel(N,N, SMI_INT, 0,root,comm); 14 | char check=1; 15 | int num_ranks=SMI_Comm_size(comm); 16 | int my_rank=SMI_Comm_rank(comm); 17 | const int loop_bound=(my_rank==root)?N*num_ranks:N; 18 | const int to_rcv_start=my_rank*N; 19 | for(int i=0;i