├── .gitignore
├── .gitmodules
├── CMakeGTEST.txt.in
├── CMakeLists.txt
├── LICENSE
├── README.md
├── codegen
├── codegen.py
├── common.py
├── fpga-06-07-08-09.json
├── main.py
├── notes.txt
├── ops.py
├── program.py
├── requirements.txt
├── rewrite.py
├── routing.py
├── routing_table.py
├── serialization.py
├── templates
│ ├── bcast.cl
│ ├── ckr.cl
│ ├── cks.cl
│ ├── device.cl
│ ├── gather.cl
│ ├── host.cl
│ ├── host_hlslib.cl
│ ├── pop.cl
│ ├── push.cl
│ ├── reduce.cl
│ ├── scatter.cl
│ └── utils.cl
├── tests
│ ├── conftest.py
│ ├── data
│ │ ├── buffer-size-expected.cl
│ │ ├── buffer-size.cl
│ │ ├── complex-expected.cl
│ │ ├── complex.cl
│ │ ├── constant-variable-expected.cl
│ │ ├── constant-variable.cl
│ │ ├── data-type-expected.cl
│ │ ├── data-type.cl
│ │ ├── kernel-attribute-expected.cl
│ │ ├── kernel-attribute.cl
│ │ ├── port-expected.cl
│ │ ├── port.cl
│ │ ├── reduce-expected.cl
│ │ ├── reduce.cl
│ │ ├── smi-device-1.h
│ │ └── smi-host-1.h
│ ├── test_codegen.py
│ ├── test_parse.py
│ ├── test_program.py
│ ├── test_rewriter.py
│ ├── test_routing.py
│ ├── test_routing_table.py
│ └── test_utils.py
├── topology_file_generator.py
└── utils.py
├── examples
├── CMakeLists.txt
├── host
│ ├── gesummv_onchip.cpp
│ ├── gesummv_smi.cpp
│ ├── kmeans_smi.cpp
│ ├── stencil_onchip.cpp
│ └── stencil_smi.cpp
├── include
│ ├── common.h
│ ├── fblas.h
│ ├── kmeans.h.in
│ └── stencil.h.in
└── kernels
│ ├── gesummv_onchip.cl
│ ├── gesummv_rank0.cl
│ ├── gesummv_rank1.cl
│ ├── gesummv_smi.json
│ ├── kmeans_smi.cl
│ ├── kmeans_smi.json
│ ├── stencil_onchip.cl.in
│ ├── stencil_onchip.py
│ ├── stencil_onchip_pe.cl.in
│ ├── stencil_smi.cl
│ └── stencil_smi.json
├── include
├── opencl-shim.h
├── smi.h
├── smi
│ ├── bcast.h
│ ├── channel_descriptor.h
│ ├── communicator.h
│ ├── data_types.h
│ ├── gather.h
│ ├── header_message.h
│ ├── network_message.h
│ ├── operation_type.h
│ ├── pop.h
│ ├── push.h
│ ├── reduce.h
│ ├── reduce_operations.h
│ └── scatter.h
└── utils
│ ├── ocl_utils.hpp
│ ├── smi_utils.hpp
│ └── utils.hpp
├── microbenchmarks
├── CMakeLists.txt
├── host
│ ├── bandwidth_benchmark.cpp
│ ├── broadcast_benchmark.cpp
│ ├── gather_benchmark.cpp
│ ├── injection_rate_benchmark.cpp
│ ├── latency_benchmark.cpp
│ ├── multi_collectives_benchmark.cpp
│ ├── reduce_benchmark.cpp
│ └── scatter_benchmark.cpp
└── kernels
│ ├── bandwidth.json
│ ├── bandwidth_0.cl
│ ├── bandwidth_1.cl
│ ├── broadcast.cl
│ ├── broadcast.json
│ ├── gather.cl
│ ├── gather.json
│ ├── injection_rate.json
│ ├── injection_rate_0.cl
│ ├── injection_rate_1.cl
│ ├── latency.json
│ ├── latency_0.cl
│ ├── latency_1.cl
│ ├── multi_collectives.cl
│ ├── multi_collectives.json
│ ├── reduce.cl
│ ├── reduce.json
│ ├── scatter.cl
│ └── scatter.json
├── misc
└── smi.png
├── source-rewriter
├── CMakeLists.txt
└── src
│ ├── action.cpp
│ ├── action.h
│ ├── main.cpp
│ ├── ops
│ ├── broadcast.cpp
│ ├── broadcast.h
│ ├── gather.cpp
│ ├── gather.h
│ ├── ops.cpp
│ ├── ops.h
│ ├── pop.cpp
│ ├── pop.h
│ ├── push.cpp
│ ├── push.h
│ ├── reduce.cpp
│ ├── reduce.h
│ ├── scatter.cpp
│ ├── scatter.h
│ ├── utils.cpp
│ └── utils.h
│ ├── rewrite.cpp
│ ├── rewrite.h
│ ├── third-party
│ └── json.hpp
│ ├── utils.cpp
│ └── utils.h
└── test
├── CMakeLists.txt
├── README.md
├── broadcast
├── broadcast.cl
├── broadcast.json
└── test_broadcast.cpp
├── gather
├── gather.cl
├── gather.json
└── test_gather.cpp
├── mixed
├── mixed.cl
├── mixed.json
└── test_mixed.cpp
├── p2p
├── p2p.json
├── p2p_rank0.cl
├── p2p_rank1.cl
└── test_p2p.cpp
├── reduce
├── reduce.cl
├── reduce.json
└── test_reduce.cpp
└── scatter
├── scatter.cl
├── scatter.json
└── test_scatter.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | .ycm_extra_conf.py
2 | build*
3 | __pycache__
4 |
5 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "hlslib"]
2 | path = hlslib
3 | url = https://github.com/definelicht/hlslib.git
4 |
--------------------------------------------------------------------------------
/CMakeGTEST.txt.in:
--------------------------------------------------------------------------------
# Helper project that only downloads the googletest sources at configure time.
# All build/install/test steps are disabled on purpose: the parent project is
# expected to add the downloaded source directory to its own build.
cmake_minimum_required(VERSION 3.0)

project(googletest-download NONE)

include(ExternalProject)
ExternalProject_Add(googletest
  GIT_REPOSITORY    https://github.com/google/googletest.git
  # Pin a release tag instead of tracking a moving branch: upstream renamed its
  # default branch from `master` to `main`, and a fixed tag keeps builds reproducible.
  GIT_TAG           release-1.10.0
  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | BSD 3-Clause License
2 |
3 | Copyright (c) 2019, SPCL - ETH Zurich
4 | All rights reserved.
5 |
6 | Redistribution and use in source and binary forms, with or without
7 | modification, are permitted provided that the following conditions are met:
8 |
9 | 1. Redistributions of source code must retain the above copyright notice, this
10 | list of conditions and the following disclaimer.
11 |
12 | 2. Redistributions in binary form must reproduce the above copyright notice,
13 | this list of conditions and the following disclaimer in the documentation
14 | and/or other materials provided with the distribution.
15 |
16 | 3. Neither the name of the copyright holder nor the names of its
17 | contributors may be used to endorse or promote products derived from
18 | this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Streaming Message Interface
4 |
5 | **Streaming Message Interface** is a distributed memory HLS programming model for **FPGAs** that provides
6 | the convenience of message passing for HLS-programmed hardware devices. Instead of the bulk transmission typical of the message passing model,
7 | with SMI, messages are **streamed** across the network during computation, allowing communication to be seamlessly integrated into pipelined designs.
8 |
9 | This repository contains a high-level synthesis implementation of SMI targeting OpenCL and Intel FPGAs, and all the
10 | applications used for the evaluation performed in the paper: *"Streaming Message Interface: High-Performance Distributed Memory
11 | Programming on Reconfigurable Hardware"*, Tiziano De Matteis, Johannes de Fine Licht, Jakub Beránek, and Torsten Hoefler. In Proceedings of the International Conference for High Performance Computing, Networking, Storage, and Analysis, 2019 (SC 2019).
12 |
13 |
14 | Please refer to the [wiki](https://github.com/spcl/SMI/wiki) and to the paper for a reference on how to use SMI for your own distributed FPGA programs.
15 |
16 |
17 | ## Reproducing the paper experiments
18 |
19 | All the tests and evaluations reported in the paper have been performed on a set of Bittware 520N cards (Stratix 10),
20 | each of them equipped with 4 network connections (QSFP modules) operating at 40Gbit/s.
21 |
22 | ### Requirements
23 |
24 | The library depends on:
25 |
26 | * CMake for configuration
27 | * Intel FPGA SDK for OpenCL pro, version 18.1 ([http://fpgasoftware.intel.com/opencl/](http://fpgasoftware.intel.com/opencl/)). *Experimental: support for v19+*
28 | * GCC (version 5+)
29 | * An MPI implementation (e.g. OpenMPI)
30 | * Python (version 3+)
31 | * Clang (version 8+)
32 |
33 | ### Compilation
34 |
35 | After cloning this repository, make sure you clone the submodule dependency, by executing the following command:
36 |
37 | ```
38 | git submodule update --init
39 | ```
40 |
41 | The project uses CMake for configuration. To configure the project and build the bitstreams and executables:
42 |
43 | ```bash
44 | mkdir build
45 | cd build
46 | cmake ..
47 | ```
48 | The experiments shown in the paper are organized in two subdirectories of the CMake folder, `microbenchmarks` and `examples`.
49 |
50 | For each of them the following targets are offered:
51 |
52 | - `make <application>_emulator` builds the emulation version of the FPGA program;
53 | - `make <application>_host` builds the host program;
54 | - `make <application>_<program>_aoc_report` generates the report;
55 | - `make <application>_<program>_aoc_build` builds the hardware (can take several hours).
56 |
57 | The applications present in the repository are the following. For the details, please refer to the paper:
58 |
59 | **Microbenchmarks**
60 |
61 | - `bandwidth`: bandwidth microbenchmark: an MPMD application composed of two programs, namely `bandwidth_0` (sender) and `bandwidth_1` (receiver);
62 | - `latency`: latency microbenchmark: an MPMD application composed of two programs, namely `latency_0` (source) and `latency_1` (destination);
63 | - `injection`: injection microbenchmark: an MPMD application composed of two programs, namely `injection_0` (sender) and `injection_1` (receiver);
64 | - `broadcast`: broadcast microbenchmark: an SPMD application (`broadcast`)
65 | - `reduce`: reduce microbenchmark: an SPMD application (`reduce`)
66 | - `scatter`: scatter microbenchmark (not included in the paper): an SPMD application (`scatter`)
67 | - `gather`: gather microbenchmark (not included in the paper): an SPMD application (`gather`)
68 |
69 | **Application examples**
70 |
71 | - `stencil_smi`: stencil application, SMI implementation. It is composed of a single program (`stencil_smi`);
72 | - `stencil_onchip`: on-chip version of the stencil application;
73 | - `gesummv_smi`: gesummv, SMI implementation: composed of two programs (`gesummv_rank0` and `gesummv_rank1`);
74 | - `gesummv_onchip`: on chip version of the gesummv application.
75 |
76 | **Unit tests**
77 |
78 | To enable unit tests, please execute `cmake` with the following flag `-DENABLE_TESTS=ON`
79 |
80 |
81 | **Please Note**: all the host programs have been written by considering the target architecture used in the paper, which is characterized by a set of nodes each one having 2 FPGAs.
82 | If you are using a different setup, please adjust the host programs.
83 |
84 | ### Example
85 |
86 | Suppose that the user wants to execute the `stencil_smi` application in emulation.
87 | The following steps must be performed:
88 |
89 | ```bash
90 | cd examples
91 | # Compile the emulation version
92 | make stencil_smi_emulator -j
93 | # Compile the host program
94 | make stencil_smi_host
95 | cd stencil_smi
96 | # Execute the program
97 | env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./stencil_smi_host emulator
98 | ```
99 |
100 | To generate the report, from the `examples` directory in the CMake folder, the user must execute:
101 | ```bash
102 | make stencil_smi_stencil_smi_aoc_report
103 | ```
104 |
105 | The report will be stored under `examples/stencil_smi/stencil_smi`.
106 |
107 |
108 |
109 | #### Stencil parameters
110 |
111 | For the stencil application, the stencil sizes and number of ranks in either dimension are configured using CMake parameters:
112 |
113 | ```bash
114 | cmake .. -DSMI_STENCIL_SIZE_X=8192 -DSMI_STENCIL_SIZE_Y=8192 -DSMI_STENCIL_NUM_PROCS_X=2 -DSMI_STENCIL_NUM_PROCS_Y=2
115 | ```
116 |
117 | Other parameters include `SMI_VECTORIZATION_WIDTH`, `SMI_DATATYPE`, `SMI_FMAX`, and `SMI_ROUTING_FILE`.
118 |
119 |
120 |
--------------------------------------------------------------------------------
/codegen/codegen.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | from typing import List, Tuple
4 |
5 | import jinja2
6 | from networkx import Graph
7 |
8 | from program import Channel, target_index, FPGA, Program
9 |
10 |
def read_template_file(path):
    """
    Loads the Jinja2 template `path` from the `templates` directory located next to this module.

    Undefined template variables are reported through the standard `logging` module.
    Block tags have their leading whitespace and trailing newline stripped
    (`lstrip_blocks` / `trim_blocks`).
    """
    logging.basicConfig()
    undefined_reporter = jinja2.make_logging_undefined(logger=logging.getLogger('logger'),
                                                       base=jinja2.Undefined)

    template_dir = os.path.join(os.path.dirname(__file__), "templates")
    environment = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=template_dir),
                                     undefined=undefined_reporter)
    environment.lstrip_blocks = True
    environment.trim_blocks = True
    return environment.get_template(path)
23 |
24 |
def channel_name(src: Channel, out: bool, graph: Graph) -> str:
    """
    Builds the name of the link attached to the channel `src`.

    Each endpoint is rendered as `r{rank}c{index}`; the remote endpoint is the neighbour of
    `src` in `graph` that lives on a different FPGA, or `unconnected` when there is none.
    For an outgoing link (`out` is true) the local endpoint comes first, otherwise the
    remote endpoint comes first.
    """
    remote = None
    for (_, neighbour) in graph.edges(src):
        if neighbour.fpga != src.fpga:
            # no early exit: the last matching neighbour wins
            remote = neighbour

    remote_name = "r{}c{}".format(remote.fpga.rank, remote.index) if remote else "unconnected"
    local_name = "r{}c{}".format(src.fpga.rank, src.index)

    first, second = (local_name, remote_name) if out else (remote_name, local_name)
    return "{}_{}".format(first, second)
43 |
44 |
def generate_program_host(programs: List[Tuple[str, Program]]) -> str:
    """
    Renders the host source code from the `host_hlslib.cl` template for the given programs.
    """
    return read_template_file("host_hlslib.cl").render(programs=programs)
48 |
49 |
def generate_program_device(fpga: FPGA, fpgas: List[FPGA], graph: Graph, channels_per_fpga: int) -> str:
    """
    Renders the device source code for `fpga` from the `device.cl` template.

    The template receives the channels and program of the FPGA, the list of all FPGAs,
    the `target_index` helper and a `channel_name(channel, out)` callable bound to `graph`.
    """
    template = read_template_file("device.cl")

    def name_for(channel, out):
        return channel_name(channel, out, graph)

    context = dict(
        channels=fpga.channels,
        channels_per_fpga=channels_per_fpga,
        target_index=target_index,
        program=fpga.program,
        fpgas=fpgas,
        channel_name=name_for,
    )
    return template.render(**context)
58 |
--------------------------------------------------------------------------------
/codegen/common.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | from networkx import Graph
4 |
5 | from program import FPGA
6 |
7 |
class RoutingContext:
    """
    Plain container bundling the connection graph, the routes computed on it
    and the list of FPGAs taking part in the routing.
    """

    def __init__(self, graph: Graph, routes, fpgas: List[FPGA]):
        self.graph, self.routes, self.fpgas = graph, routes, fpgas
13 |
14 |
def write_nodefile(fpgas: List[FPGA], stream):
    """
    Writes one line per FPGA to `stream`, ordered by ascending rank.

    Each line has the form `{node} # {name}, rank{position}`, where the position is the
    index of the FPGA in the sorted order (not the value of its `rank` attribute).
    The input list is not modified.
    """
    for position, fpga in enumerate(sorted(fpgas, key=lambda item: item.rank)):
        line = "{} # {}, rank{}\n".format(fpga.node, fpga.name, position)
        stream.write(line)
20 |
--------------------------------------------------------------------------------
/codegen/fpga-06-07-08-09.json:
--------------------------------------------------------------------------------
1 | {
2 | "fpgas": {
3 | "fpga-0006:acl0": "pg1.json",
4 | "fpga-0006:acl1": "pg1.json",
5 | "fpga-0007:acl0": "pg1.json",
6 | "fpga-0007:acl1": "pg1.json",
7 | "fpga-0008:acl0": "pg1.json",
8 | "fpga-0008:acl1": "pg1.json",
9 | "fpga-0009:acl0": "pg1.json",
10 | "fpga-0009:acl1": "pg1.json"
11 | },
12 | "connections": {
13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3",
14 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2",
15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3",
16 | "fpga-0007:acl0:ch3": "fpga-0007:acl1:ch2",
17 | "fpga-0006:acl0:ch1": "fpga-0007:acl0:ch0",
18 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0",
19 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0",
20 | "fpga-0007:acl1:ch1": "fpga-0008:acl1:ch0",
21 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3",
22 | "fpga-0008:acl0:ch3": "fpga-0008:acl1:ch2",
23 | "fpga-0009:acl0:ch2": "fpga-0009:acl1:ch3",
24 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2",
25 | "fpga-0008:acl0:ch1": "fpga-0009:acl0:ch0",
26 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0",
27 | "fpga-0006:acl0:ch0": "fpga-0009:acl0:ch1",
28 | "fpga-0006:acl1:ch0": "fpga-0009:acl1:ch1"
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/codegen/notes.txt:
--------------------------------------------------------------------------------
1 | # Nomenclature
2 | Logical port = number that the user enters into the SMI API (e.g. SMI_Open_send_channel(..., 5))
3 | Channel = Intel channel used to write and read data
4 |
5 | # Routing tables
6 | There are 2 routing tables for each rank.
7 |
8 | ## CKS routing table
9 | - size is the number of Logical ports
10 | - maps Logical port to a number:
11 | 0 => send to connected QSFP
12 | 1 => send to connected CKR
13 | 2 => send to first neighbour CKS
14 | => send to ... neighbour CKS
15 |
16 | - neighbours are always counted from the first kernel
17 | Kernel 0
18 | Kernel 1
19 | Kernel 2
20 |
21 | K0 - first neighbour is K1, second neighbour is K2
22 | K1 - first neighbour is K0, second neighbour is K2
23 | K2 - first neighbour is K0, second neighbour is K1
24 |
25 | ## CKR routing table
26 | - size is the number of Logical ports * 2
27 | - contains entries for both data and control ports
28 | Logical port X: data is at rt[x * 2], control is at rt[x * 2 + 1]
29 |
30 | - maps Logical port and data/control to a number:
31 | 0 => invalid (the program does not expect to receive this combination)
32 | - in the current implementation, this will result in sending the packet to CK_S
33 | 1 => send to first neighbour CK_R
34 | => send to ... neighbour CK_R
35 | => first channel assigned to the given CKR
36 | => ... channel assigned to the given CKR
37 |
38 | - neighbours are counted in the same way as for the CKS routing table
39 |
40 | # Channel distribution amongst kernels
41 | Channels are assigned to CKS/CKR in a round-robin fashion.
42 | Data and control hardware ports are combined (in this order) and then distributed.
43 | Example:
44 | cks_data_channels = [0, 1, 2]
45 | cks_control_channels = [0, 1]
46 | cks_hw_ports = [("data", 0), ("data", 1), ("data", 2), ("control", 0), ("control", 1)]
47 | number of CKS = 3
48 | CKS_0 => [("data", 0), ("control", 0)]
49 | CKS_1 => [("data", 1), ("control", 1)]
50 | CKS_2 => [("data", 2)]
51 |
--------------------------------------------------------------------------------
/codegen/requirements.txt:
--------------------------------------------------------------------------------
1 | bitstring
2 | click
3 | jinja2
4 | networkx
5 | pytest
6 |
--------------------------------------------------------------------------------
/codegen/rewrite.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 | import subprocess
5 |
6 | import math
7 |
8 | from ops import SmiOperation
9 | from serialization import parse_smi_operation
10 |
11 |
def copy_files(src_dir, dest_dir, files):
    """
    Copies device source files from the source directory to the output directory.

    Every entry of `files` is a path relative to `src_dir`; it is copied to the same
    relative path below `dest_dir`, creating missing parent directories on the way.

    This is a generator: a file is copied only when the corresponding item is consumed.
    Yields tuples (src, dest) with the full source and destination path of each file.
    """
    for file in files:
        src_path = os.path.join(src_dir, file)
        dest_path = os.path.join(dest_dir, file)
        # Keep the parent directory in its own variable. Overwriting `dest_dir` here would
        # redirect every following file into the subdirectory of the previous one.
        parent_dir = os.path.dirname(dest_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        shutil.copyfile(src_path, dest_path, follow_symlinks=True)
        yield (src_path, dest_path)
24 |
25 |
def transform_buffer_size(data, op: SmiOperation):
    """
    Translates the buffer size of `op` from a number of elements (as given by the user)
    into a number of messages, rounded up to a multiple of 8.

    Nothing is changed when `data` has no "buffer_size" entry (or it is None).
    A buffer size below 1 is treated as 1 element.
    """
    if data.get("buffer_size") is None:
        return

    elements = max(1, op.buffer_size)
    messages = elements / op.data_elements_per_packet()
    op.buffer_size = math.ceil(messages / 8) * 8
34 |
35 |
def rewrite(rewriter, file, include_dirs, log):
    """
    Runs the `rewriter` executable on `file` and collects the SMI operations it reports.

    Every entry of `include_dirs` is forwarded as `-extra-arg=-I{dir}`. The standard output
    and standard error of the process are written to `log`. Each non-empty line of the
    standard output is parsed as a JSON document describing one operation; the buffer size
    of the parsed operation is then translated by `transform_buffer_size`.

    The exit status of the process is not inspected. Returns the list of parsed operations.
    """
    log.write("Rewriting {}".format(file))

    command = [rewriter, file] + ["-extra-arg=-I{}".format(include) for include in include_dirs]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout = result.stdout.decode()

    log.write("STDOUT\n{}".format(stdout))
    log.write("STDERR\n{}".format(result.stderr.decode()))

    operations = []
    for record in filter(None, stdout.splitlines()):
        parsed = json.loads(record)
        operation = parse_smi_operation(parsed)
        transform_buffer_size(parsed, operation)
        operations.append(operation)

    return operations
58 |
--------------------------------------------------------------------------------
/codegen/routing.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Dict
2 |
3 | import networkx
4 | from networkx import Graph
5 |
6 | from common import RoutingContext
7 | from program import COST_INTRA_FPGA, COST_INTER_FPGA, FPGA, ProgramMapping
8 |
9 | """
10 | Each CK_R/CK_S separate QSFP
11 | CK_S <-> CK_R interconnected
12 | CK_R/CK_S connected only to a single neighbour CKR/CK_S
13 |
14 | fpga-0014:acl0:ch0 - fpga-0014:acl1:ch0
15 | """
16 |
17 |
def create_routing_context(fpga_connections: Dict[Tuple[str, int], Tuple[str, int]], program_mapping: ProgramMapping):
    """
    Builds the complete routing context for the given FPGA connections.

    The inter-FPGA connections are embedded into a fresh graph, the channels of each FPGA
    are interconnected, shortest paths are computed and finally the FPGAs receive their ranks.
    """
    topology = networkx.Graph()
    discovered = load_inter_fpga_connections(topology, fpga_connections, program_mapping)
    add_intra_fpga_connections(topology, discovered)
    all_routes = shortest_paths(topology)
    ranked = create_ranks_for_fpgas(discovered)
    return RoutingContext(topology, all_routes, ranked)
25 |
26 |
def load_inter_fpga_connections(graph: networkx.Graph,
                                fpga_connections: Dict[Tuple[str, int], Tuple[str, int]],
                                program_mapping: ProgramMapping) -> List[FPGA]:
    """
    Embeds the FPGA connections into `graph` and returns the FPGAs found along the way.

    Each connection endpoint is a pair (FPGA key, channel index), where the key has the
    form `node:fpga_name`. An FPGA object is created the first time its key is seen, using
    the program assigned to that key in `program_mapping.fpga_map`. Every connection
    becomes an edge between the two channels, weighted with COST_INTER_FPGA.
    """
    known = {}

    def channel_of(endpoint):
        fpga_key, channel_index = endpoint[0], endpoint[1]
        if fpga_key not in known:
            node, fpga_name = fpga_key.split(":")
            known[fpga_key] = FPGA(node, fpga_name, program_mapping.fpga_map[fpga_key])
        return known[fpga_key].channels[channel_index]

    for (source, destination) in fpga_connections.items():
        src_channel = channel_of(source)
        dst_channel = channel_of(destination)
        edge_label = "{}-{}".format(src_channel, dst_channel)
        graph.add_edge(src_channel, dst_channel, weight=COST_INTER_FPGA, label=edge_label)

    return list(known.values())
47 |
48 |
def add_intra_fpga_connections(graph: Graph, fpgas: List[FPGA]):
    """
    Connects every pair of distinct channels inside each FPGA with an edge
    weighted with COST_INTRA_FPGA.
    """
    for fpga in fpgas:
        pairs = ((first, second)
                 for first in fpga.channels
                 for second in fpga.channels
                 if first is not second)
        for (first, second) in pairs:
            graph.add_edge(first, second, weight=COST_INTRA_FPGA)
55 |
56 |
def shortest_paths(graph):
    """
    Computes the shortest paths between all pairs of nodes in `graph`,
    using the "weight" edge attribute as the cost.
    """
    all_pairs = networkx.shortest_path(graph, source=None, target=None, weight="weight")
    return all_pairs
59 |
60 |
def create_ranks_for_fpgas(fpgas: List[FPGA]) -> List[FPGA]:
    """
    Assigns ranks to the FPGAs in the order given by their `key()`.

    Returns a new list sorted by key; the `rank` attribute of every FPGA is set to its
    position in that list. The input list itself is left in its original order.
    """
    ordered = list(fpgas)
    ordered.sort(key=lambda item: item.key())
    for position in range(len(ordered)):
        ordered[position].rank = position
    return ordered
70 |
--------------------------------------------------------------------------------
/codegen/routing_table.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import bitstring
4 |
5 | from ops import KEY_CKR_DATA, KEY_CKR_CONTROL
6 | from program import Channel, FPGA, Program
7 |
8 | CKS_TARGET_QSFP = 0
9 | CKS_TARGET_CKR = 1
10 |
11 |
class NoRouteFound(Exception):
    """
    Raised when no route exists between a channel and a target FPGA.

    Derives from `Exception` rather than `BaseException`, so that generic
    `except Exception` handlers see it like any other application error.
    """
    pass
14 |
15 |
def closest_path_to_fpga(paths, channel: Channel, target: FPGA):
    """
    Returns the shortest path (by number of hops) from `channel` to any channel that
    belongs to the FPGA `target`.

    `paths[channel]` maps each reachable destination channel to the path leading to it.
    When several paths share the minimal length, the first one encountered is returned.
    Raises NoRouteFound when no destination on `target` is reachable.
    """
    routes = paths[channel]
    candidates = [routes[destination] for destination in routes if destination.fpga == target]

    if len(candidates) == 0:
        raise NoRouteFound("No route found from {} to {}".format(channel, target))
    return min(candidates, key=len)
26 |
27 |
def get_output_target(paths, channel: Channel, target: FPGA):
    """
    Computes the CK_S routing table entry used by `channel` to reach the FPGA `target`:

    0 -> local QSFP
    1 -> CK_R
    2 -> first neighbour
    3 -> second neighbour
    4 -> ...
    """
    if target == channel.fpga:
        return CKS_TARGET_CKR

    # element 0 of the path is the channel itself, element 1 is the next hop
    next_hop = closest_path_to_fpga(paths, channel, target)[1]
    stays_on_fpga = next_hop.fpga == channel.fpga
    if stays_on_fpga:
        return 2 + channel.target_index(next_hop.index)
    return CKS_TARGET_QSFP
44 |
45 |
def cks_routing_table(paths, fpgas: List[FPGA], channel: Channel) -> List[int]:
    """
    Builds the CK_S routing table of `channel`: one output target per FPGA,
    in the order of `fpgas`.
    """
    return [get_output_target(paths, channel, fpga) for fpga in fpgas]
52 |
53 |
def get_input_target(channel: Channel, logical_port: int, program: Program,
                     channels_per_fpga: int, key) -> int:
    """
    Computes the CK_R routing table entry of `channel` for the pair (`logical_port`, `key`):

    0 -> the program has no channel for this pair
    1 .. channels_per_fpga - 1 -> neighbouring CK_R that owns the pair
    channels_per_fpga -> first hardware port assigned to the given channel
    channels_per_fpga + 1 -> second hardware port assigned to the given channel
    ...
    """
    owner_index = program.get_channel_for_port_key(logical_port, key)
    if owner_index is None:
        return 0
    if owner_index != channel.index:
        return 1 + channel.target_index(owner_index)

    # the pair is handled by this channel: find its position among the local "ckr" allocations
    local_ports = tuple(
        (op.logical_port, op_key)
        for (op, op_key) in program.get_channel_allocations_with_prefix(channel.index, "ckr")
    )
    return channels_per_fpga + local_ports.index((logical_port, key))
75 |
76 |
def ckr_routing_table(channel: Channel, channels_per_fpga: int, program: Program) -> List[int]:
    """
    Builds the CK_R routing table of `channel`. For every logical port of `program`
    the table holds two consecutive entries: first the data target, then the control target.
    """
    table = []
    for port in range(program.logical_port_count):
        for key in (KEY_CKR_DATA, KEY_CKR_CONTROL):
            table.append(get_input_target(channel, port, program, channels_per_fpga, key))
    return table
83 |
84 |
def serialize_to_array(table: List[int], bytes=1):
    """
    Serializes `table` into a byte string; every entry is encoded as an unsigned
    little-endian integer that is `bytes` bytes wide.
    """
    width_in_bits = bytes * 8
    encoded = bitstring.BitStream()
    for entry in table:
        encoded.append("uintle:{}={}".format(width_in_bits, entry))
    return encoded.bytes
91 |
--------------------------------------------------------------------------------
/codegen/serialization.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import os
4 | from typing import List, Tuple, Dict
5 |
6 | from ops import Broadcast, Push, Pop, Reduce, Scatter, Gather
7 | from program import Program, SmiOperation, ProgramMapping
8 |
9 | SMI_OP_KEYS = {
10 | "push": Push,
11 | "pop": Pop,
12 | "broadcast": Broadcast,
13 | "reduce": Reduce,
14 | "scatter": Scatter,
15 | "gather": Gather
16 | }
17 |
18 |
def parse_smi_operation(obj) -> SmiOperation:
    """
    Creates an SMI operation from its dictionary description.

    "type" and "port" are mandatory. "data_type" defaults to "int", "buffer_size" to None
    and "args" to no extra arguments. The type has to be one of the keys of SMI_OP_KEYS.
    """
    op_type = obj["type"]
    port = obj["port"]
    data_type = obj.get("data_type", "int")
    buffer_size = obj.get("buffer_size")
    extra_args = obj.get("args", {})

    assert op_type in SMI_OP_KEYS
    constructor = SMI_OP_KEYS[op_type]
    return constructor(port, data_type, buffer_size, **extra_args)
28 |
29 |
def serialize_smi_operation(op: SmiOperation):
    """
    Converts an SMI operation into a dictionary with the keys
    "type", "port", "data_type", "buffer_size" and "args".
    """
    # reverse lookup: operation class -> type name
    type_names = {cls: name for name, cls in SMI_OP_KEYS.items()}

    serialized = {}
    serialized["type"] = type_names[op.__class__]
    serialized["port"] = op.logical_port
    serialized["data_type"] = op.data_type
    serialized["buffer_size"] = op.buffer_size
    serialized["args"] = op.serialize_args()
    return serialized
40 |
41 |
def parse_operations(operations) -> List[SmiOperation]:
    """
    Parses every operation description in `operations`, preserving their order.
    """
    return list(map(parse_smi_operation, operations))
44 |
45 |
def parse_program(input: str) -> Program:
    """
    Creates a Program from its JSON description.

    "operations" is mandatory; "consecutive_reads" and "max_ranks" are optional and are
    passed as None when missing.
    """
    description = json.loads(input)
    operations = parse_operations(description["operations"])
    consecutive_reads = description.get("consecutive_reads")
    max_ranks = description.get("max_ranks")
    # NOTE(review): the fourth argument is a (truthy) string literal, not a lookup of
    # "p2p_rendezvous" in the description. Kept as is; the original marks it as a TODO.
    rendezvous_placeholder = """prog.get("p2p_rendezvous") TODO: fix"""
    return Program(operations, consecutive_reads, max_ranks, rendezvous_placeholder)
54 |
55 |
def serialize_program(program: Program) -> str:
    """
    Serializes the operations of `program` into a JSON document with a single
    "operations" key.
    """
    serialized_ops = [serialize_smi_operation(op) for op in program.operations]
    return json.dumps({"operations": serialized_ops})
60 |
61 |
def parse_routing_file(data: str, metadata_paths=None, ignore_programs=False) -> Tuple[Dict[Tuple[str, int], Tuple[str, int]], ProgramMapping]:
    """
    Parses a JSON routing (topology) description.

    :param data: JSON text with two optional objects: "fpgas", mapping `node:fpga` keys to
        program names, and "connections", mapping `node:fpga:channel` endpoints to each other.
    :param metadata_paths: paths of program metadata files; a program name from "fpgas" is
        resolved to the path whose file name without extension equals that name.
    :param ignore_programs: when true, no metadata file is read and every FPGA is mapped
        to None.
    :return: tuple (connections, mapping). `connections` maps (`node:fpga`, channel index)
        pairs to their peer and contains each link in both directions; `mapping` is the
        ProgramMapping built from the loaded programs and the FPGA -> program assignment.
    :raises ValueError: when an endpoint is not of the form `node:fpga:channel` or its
        channel name does not end with a number.
    """
    if metadata_paths is None:
        metadata_paths = []

    path_index = {os.path.splitext(os.path.basename(path))[0]: path for path in metadata_paths}

    data = json.loads(data)
    program_cache = {}
    fpga_map = {}
    for (fpga, program_path) in data.get("fpgas", {}).items():
        if program_path not in program_cache:
            if ignore_programs:
                program_cache[program_path] = None
            else:
                real_path = path_index[program_path]
                with open(real_path) as pf:
                    program_cache[program_path] = parse_program(pf.read())

        fpga_map[fpga] = program_cache[program_path]

    mapping = ProgramMapping(list(program_cache.values()), fpga_map)

    # Capture the whole trailing number. A greedy `.*` prefix would swallow all but the
    # last digit, turning e.g. "ch10" into channel 0.
    channel_regex = re.compile(r"(\d+)$")
    connections = {}

    def parse_endpoint(endpoint):
        # Splits "node:fpga:channel" into the FPGA key and the numeric channel index.
        node, fpga_name, channel = endpoint.split(":")
        match = channel_regex.search(channel)
        if match is None:
            raise ValueError("Channel name '{}' in '{}' does not end with a number".format(channel, endpoint))
        return ("{}:{}".format(node, fpga_name), int(match.group(1)))

    for (src, dst) in data.get("connections", {}).items():
        src, dst = parse_endpoint(src), parse_endpoint(dst)
        # every endpoint may take part in a single link only
        assert src not in connections
        assert dst not in connections
        connections[src] = dst
        connections[dst] = src

    return (connections, mapping)
107 |
--------------------------------------------------------------------------------
/codegen/templates/ckr.cl:
--------------------------------------------------------------------------------
1 | {% import 'utils.cl' as utils %}
2 |
{#
  smi_ckr(program, channel, channel_count, target_index)

  Emits the OpenCL kernel `smi_kernel_ckr_{channel.index}` (the CK_R kernel of one channel).

  Generated kernel, step by step:
    1. Copies the routing table `rt` into a local [logical_port_count][2] array; for every
       logical port, column 0 is the data entry and column 1 the control entry
       (column 1 is selected when the header operation is SMI_SYNCH).
    2. Loops forever, polling a single sender per iteration with a non-blocking read:
         sender 0                      -> QSFP input `io_in_{channel.index}`
         sender 1 .. (one per result of channel.neighbours()) -> neighbouring CK_R kernels
         sender channel_count          -> the CK_S kernel with the same index
       NOTE(review): the case numbering assumes channel.neighbours() yields
       channel_count - 1 entries; confirm against the Channel implementation.
    3. A valid message whose header destination differs from `rank` is handed to CK_S
       (target 0). Otherwise the routing table entry selects the target: 0 -> CK_S,
       1 .. -> neighbouring CK_R, channel_count .. -> the channels of the operations
       allocated to this CK_R ("ckr" prefix), in allocation order.
    4. The next sender is selected when the read returned no data or after READS_LIMIT
       contiguous valid reads from the same sender.
#}
{%- macro smi_ckr(program, channel, channel_count, target_index) -%}
__kernel void smi_kernel_ckr_{{ channel.index }}(__global volatile char *restrict rt, const char rank)
{
    // rt contains intertwined (dp0, cp0, dp1, cp1, ...)
    {% set logical_ports = program.logical_port_count %}
    char external_routing_table[{{ logical_ports }} /* logical port count */][2];
    for (int i = 0; i < {{ logical_ports }}; i++)
    {
        for (int j = 0; j < 2; j++)
        {
            external_routing_table[i][j] = rt[i * 2 + j];
        }
    }

    // QSFP + number of CK_Rs - 1 + CK_S
    const char num_sender = {{ channel_count + 1 }};
    char sender_id = 0;
    SMI_Network_message message;

    char contiguous_reads = 0;
    while (1)
    {
        bool valid = false;
        switch (sender_id)
        {
            case 0:
                // QSFP
                message = read_channel_nb_intel(io_in_{{ channel.index }}, &valid);
                break;
            {% for ck_r in channel.neighbours() %}
            case {{ loop.index0 + 1 }}:
                // receive from CK_R_{{ ck_r }}
                message = read_channel_nb_intel(channels_interconnect_ck_r[{{ (channel_count - 1) * channel.index + loop.index0 }}], &valid);
                break;
            {% endfor %}
            case {{ channel_count }}:
                // receive from CK_S_{{ channel.index }}
                message = read_channel_nb_intel(channels_interconnect_ck_s_to_ck_r[{{ channel.index }}], &valid);
                break;
        }

        if (valid)
        {
            contiguous_reads++;
            char dest;
            if (GET_HEADER_DST(message.header) != rank)
            {
                dest = 0;
            }
            else dest = external_routing_table[GET_HEADER_PORT(message.header)][GET_HEADER_OP(message.header) == SMI_SYNCH];

            switch (dest)
            {
                case 0:
                    // send to CK_S_{{ channel.index }}
                    write_channel_intel(channels_interconnect_ck_r_to_ck_s[{{ channel.index }}], message);
                    break;
                {% for ck_r in channel.neighbours() %}
                case {{ loop.index0 + 1 }}:
                    // send to CK_R_{{ ck_r }}
                    write_channel_intel(channels_interconnect_ck_r[{{ (channel_count - 1) * ck_r + target_index(ck_r, channel.index) }}], message);
                    break;
                {% endfor %}
                {% for (op, key) in program.get_channel_allocations_with_prefix(channel.index, "ckr") %}
                case {{ channel_count + loop.index0 }}:
                    // send to {{ op }}
                    write_channel_intel({{ op.get_channel(key) }}, message);
                    break;
                {% endfor %}
            }
        }

        if (!valid || contiguous_reads == READS_LIMIT)
        {
            contiguous_reads = 0;
            sender_id++;
            if (sender_id == num_sender)
            {
                sender_id = 0;
            }
        }
    }
}
{%- endmacro %}
87 |
--------------------------------------------------------------------------------
/codegen/templates/cks.cl:
--------------------------------------------------------------------------------
1 | {% import 'utils.cl' as utils %}
2 |
3 | {%- macro smi_cks(program, channel, channel_count, target_index) -%}
4 | __kernel void smi_kernel_cks_{{ channel.index }}(__global volatile char *restrict rt, const char num_ranks)
5 | {
6 | char external_routing_table[MAX_RANKS];
7 | for (int i = 0; i < MAX_RANKS; i++)
8 | {
9 | if (i < num_ranks)
10 | {
11 | external_routing_table[i] = rt[i];
12 | }
13 | }
14 |
15 | {% set allocations = program.get_channel_allocations_with_prefix(channel.index, "cks") %}
16 | // number of CK_S - 1 + CK_R + {{ allocations|length }} CKS hardware ports
17 | const char num_sender = {{ channel_count + allocations|length }};
18 | char sender_id = 0;
19 | SMI_Network_message message;
20 |
21 | char contiguous_reads = 0;
22 |
23 | while (1)
24 | {
25 | bool valid = false;
26 | switch (sender_id)
27 | {
28 | {% for ck_s in channel.neighbours() %}
29 | case {{ loop.index0 }}:
30 | // receive from CK_S_{{ ck_s }}
31 | message = read_channel_nb_intel(channels_interconnect_ck_s[{{ (channel_count - 1) * channel.index + loop.index0 }}], &valid);
32 | break;
33 | {% endfor %}
34 | case {{ channel_count - 1 }}:
35 | // receive from CK_R_{{ channel.index }}
36 | message = read_channel_nb_intel(channels_interconnect_ck_r_to_ck_s[{{ channel.index }}], &valid);
37 | break;
38 | {% for (op, key) in allocations %}
39 | case {{ channel_count + loop.index0 }}:
40 | // receive from {{ op }}
41 | message = read_channel_nb_intel({{ op.get_channel(key) }}, &valid);
42 | break;
43 | {% endfor %}
44 | }
45 |
46 | if (valid)
47 | {
48 | contiguous_reads++;
49 | char idx = external_routing_table[GET_HEADER_DST(message.header)];
50 | switch (idx)
51 | {
52 | case 0:
53 | // send to QSFP
54 | write_channel_intel(io_out_{{ channel.index }}, message);
55 | break;
56 | case 1:
57 | // send to CK_R_{{ channel.index }}
58 | write_channel_intel(channels_interconnect_ck_s_to_ck_r[{{ channel.index }}], message);
59 | break;
60 | {% for ck_s in channel.neighbours() %}
61 | case {{ 2 + loop.index0 }}:
62 | // send to CK_S_{{ ck_s }}
63 | write_channel_intel(channels_interconnect_ck_s[{{ (channel_count - 1) * ck_s + target_index(ck_s, channel.index) }}], message);
64 | break;
65 | {% endfor %}
66 | }
67 | }
68 | if (!valid || contiguous_reads == READS_LIMIT)
69 | {
70 | contiguous_reads = 0;
71 | sender_id++;
72 | if (sender_id == num_sender)
73 | {
74 | sender_id = 0;
75 | }
76 | }
77 | }
78 | }
79 | {%- endmacro %}
80 |
--------------------------------------------------------------------------------
/codegen/templates/device.cl:
--------------------------------------------------------------------------------
1 | #include "smi/network_message.h"
2 | {% import 'utils.cl' as utils %}
3 | {% import 'ckr.cl' as smi_ckr %}
4 | {% import 'cks.cl' as smi_cks %}
5 |
6 | {% import 'push.cl' as smi_push %}
7 | {% import 'pop.cl' as smi_pop %}
8 | {% import 'bcast.cl' as smi_bcast %}
9 | {% import 'reduce.cl' as smi_reduce %}
10 | {% import 'scatter.cl' as smi_scatter %}
11 | {% import 'gather.cl' as smi_gather %}
12 |
13 | // the maximum number of consecutive reads that each CKs/CKr can do from the same channel
14 | #define READS_LIMIT {{ program.consecutive_read_limit }}
15 | // maximum number of ranks in the cluster
16 | #define MAX_RANKS {{ program.max_ranks }}
17 | {% if program.p2p_rendezvous %}
18 | //P2P communications use synchronization
19 | #define P2P_RENDEZVOUS
20 | {% else %}
21 | //P2P communications use eager transmission protocol
22 | {% endif %}
23 |
24 | // QSFP channels
25 | #ifndef SMI_EMULATION_RANK
26 | {% for channel in channels %}
27 | channel SMI_Network_message io_out_{{ channel.index }} __attribute__((depth(16))) __attribute__((io("kernel_output_ch{{ channel.index }}")));
28 | channel SMI_Network_message io_in_{{ channel.index }} __attribute__((depth(16))) __attribute__((io("kernel_input_ch{{ channel.index }}")));
29 | {% endfor %}
30 | #else
31 | {% for fpga in fpgas %}
32 | #if SMI_EMULATION_RANK == {{ fpga.rank }}
33 | {% for channel in range(channels_per_fpga) %}
34 | channel SMI_Network_message io_out_{{ channel }} __attribute__((depth(16))) __attribute__((io("emulated_channel_{{ channel_name(fpga.channels[channel], true) }}")));
35 | channel SMI_Network_message io_in_{{ channel }} __attribute__((depth(16))) __attribute__((io("emulated_channel_{{ channel_name(fpga.channels[channel], false) }}")));
36 | {% endfor %}
37 | #endif
38 | {% endfor %}
39 | #endif
40 |
41 | {% for op in program.operations %}
42 | // {{ op }}
43 | {% for (channel, depth) in op.get_channel_defs(program.p2p_rendezvous) %}
44 | channel SMI_Network_message {{ channel }} __attribute__((depth({{ depth }})));
45 | {% endfor %}
46 | {% endfor %}
47 |
48 | __constant char QSFP_COUNT = {{ channels_per_fpga }};
49 |
50 | // connect all CK_S together
51 | channel SMI_Network_message channels_interconnect_ck_s[QSFP_COUNT*(QSFP_COUNT-1)] __attribute__((depth(16)));
52 |
53 | // connect all CK_R together
54 | channel SMI_Network_message channels_interconnect_ck_r[QSFP_COUNT*(QSFP_COUNT-1)] __attribute__((depth(16)));
55 |
56 | // connect corresponding CK_S/CK_R pairs
57 | channel SMI_Network_message channels_interconnect_ck_s_to_ck_r[QSFP_COUNT] __attribute__((depth(16)));
58 |
59 | // connect corresponding CK_R/CK_S pairs
60 | channel SMI_Network_message channels_interconnect_ck_r_to_ck_s[QSFP_COUNT] __attribute__((depth(16)));
61 |
62 | #include "smi/pop.h"
63 | #include "smi/push.h"
64 | #include "smi/bcast.h"
65 | #include "smi/reduce.h"
66 | #include "smi/scatter.h"
67 | #include "smi/gather.h"
68 | #include "smi/communicator.h"
69 |
70 | {% for channel in channels %}
71 | {{ smi_cks.smi_cks(program, channel, channels|length, target_index) }}
72 | {{ smi_ckr.smi_ckr(program, channel, channels|length, target_index) }}
73 | {% endfor %}
74 |
75 | {%- macro generate_op_impl(key, fn) %}
76 | {% for op in program.get_ops_by_type(key) %}
77 | {{ fn(program, op) }}
78 | {% endfor %}
79 | {%- endmacro %}
80 |
81 | // Push
82 | {{ generate_op_impl("push", smi_push.smi_push_channel) }}
83 | {{ generate_op_impl("push", smi_push.smi_push_impl) }}
84 | // Pop
85 | {{ generate_op_impl("pop", smi_pop.smi_pop_channel) }}
86 | {{ generate_op_impl("pop", smi_pop.smi_pop_impl) }}
87 | // Broadcast
88 | {{ generate_op_impl("broadcast", smi_bcast.smi_bcast_kernel) }}
89 | {{ generate_op_impl("broadcast", smi_bcast.smi_bcast_channel) }}
90 | {{ generate_op_impl("broadcast", smi_bcast.smi_bcast_impl) }}
91 | // Scatter
92 | {{ generate_op_impl("scatter", smi_scatter.smi_scatter_kernel) }}
93 | {{ generate_op_impl("scatter", smi_scatter.smi_scatter_channel) }}
94 | {{ generate_op_impl("scatter", smi_scatter.smi_scatter_impl) }}
95 | // Gather
96 | {{ generate_op_impl("gather", smi_gather.smi_gather_kernel) }}
97 | {{ generate_op_impl("gather", smi_gather.smi_gather_channel) }}
98 | {{ generate_op_impl("gather", smi_gather.smi_gather_impl) }}
99 | // Reduce
100 | {{ generate_op_impl("reduce", smi_reduce.smi_reduce_kernel) }}
101 | {{ generate_op_impl("reduce", smi_reduce.smi_reduce_channel) }}
102 | {{ generate_op_impl("reduce", smi_reduce.smi_reduce_impl) }}
103 |
--------------------------------------------------------------------------------
/codegen/templates/host.cl:
--------------------------------------------------------------------------------
1 | #define __HOST_PROGRAM__
2 | #include
3 | #include
4 | #include
5 |
6 | {% for (name, program) in programs -%}
7 | SMI_Comm SmiInit_{{ name }}(
8 | int rank,
9 | int ranks_count,
10 | const char* program_path,
11 | const char* routing_dir,
12 | cl::Platform &platform,
13 | cl::Device &device,
14 | cl::Context &context,
15 | cl::Program &program,
16 | int fpga,
17 | std::vector &buffers)
18 | {
19 | std::vector kernels;
20 | std::vector queues;
21 | std::vector kernel_names;
22 |
23 | // channel kernels
24 | {% for channel in range(program.channel_count) %}
25 | kernel_names.push_back("smi_kernel_cks_{{ channel }}");
26 | kernel_names.push_back("smi_kernel_ckr_{{ channel }}");
27 | {% endfor %}
28 | {%- macro generate_collective_kernels(key, kernel_name) %}
29 | {% set ops = program.get_ops_by_type(key) %}
30 | // {{ key }} kernels
31 | {% for op in ops %}
32 | kernel_names.push_back("{{ kernel_name }}_{{ op.logical_port }}");
33 | {% endfor %}
34 | {%- endmacro %}
35 |
36 | {{ generate_collective_kernels("broadcast", "smi_kernel_bcast") }}
37 | {{ generate_collective_kernels("reduce", "smi_kernel_reduce") }}
38 | {{ generate_collective_kernels("scatter", "smi_kernel_scatter") }}
39 | {{ generate_collective_kernels("gather", "smi_kernel_gather") }}
40 |
41 | IntelFPGAOCLUtils::initEnvironment(
42 | platform, device, fpga, context,
43 | program, program_path, kernel_names, kernels, queues
44 | );
45 |
46 | // create buffers for CKS/CKR
47 | const int ports = {{ program.logical_port_count }};
48 | const int cks_table_size = ranks_count;
49 | const int ckr_table_size = ports * 2;
50 | {% for channel in range(program.channel_count) %}
51 | cl::Buffer routing_table_ck_s_{{ channel }}(context, CL_MEM_READ_ONLY, cks_table_size);
52 | cl::Buffer routing_table_ck_r_{{ channel }}(context, CL_MEM_READ_ONLY, ckr_table_size);
53 | {% endfor %}
54 |
55 | // load routing tables
56 | char routing_tables_cks[{{ program.channel_count}}][cks_table_size];
57 | char routing_tables_ckr[{{ program.channel_count}}][ckr_table_size];
58 | for (int i = 0; i < {{ program.channel_count }}; i++)
59 | {
60 | LoadRoutingTable(rank, i, cks_table_size, routing_dir, "cks", &routing_tables_cks[i][0]);
61 | LoadRoutingTable(rank, i, ckr_table_size, routing_dir, "ckr", &routing_tables_ckr[i][0]);
62 | }
63 |
64 | {% for channel in range(program.channel_count) %}
65 | queues[0].enqueueWriteBuffer(routing_table_ck_s_{{ channel }}, CL_TRUE, 0, cks_table_size, &routing_tables_cks[{{ channel }}][0]);
66 | queues[0].enqueueWriteBuffer(routing_table_ck_r_{{ channel }}, CL_TRUE, 0, ckr_table_size, &routing_tables_ckr[{{ channel }}][0]);
67 | {% endfor %}
68 |
69 | char char_ranks_count=ranks_count;
70 | char char_rank=rank;
71 | {% set ctx = namespace(kernel=0) %}
72 | {% for channel in range(program.channel_count) %}
73 | // cks_{{ channel }}
74 | kernels[{{ ctx.kernel }}].setArg(0, sizeof(cl_mem), &routing_table_ck_s_{{ channel }});
75 | kernels[{{ ctx.kernel }}].setArg(1, sizeof(char), &char_ranks_count);
76 |
77 | // ckr_{{ channel }}
78 | {% set ctx.kernel = ctx.kernel + 1 %}
79 | kernels[{{ ctx.kernel }}].setArg(0, sizeof(cl_mem), &routing_table_ck_r_{{ channel }});
80 | kernels[{{ ctx.kernel }}].setArg(1, sizeof(char), &char_rank);
81 | {% set ctx.kernel = ctx.kernel + 1 %}
82 | {% endfor %}
83 |
84 | {%- macro setup_collective_kernels(key) %}
85 | {% set ops = program.get_ops_by_type(key) %}
86 | {% for op in ops %}
87 | // {{ key }} {{ op.logical_port }}
88 | kernels[{{ ctx.kernel }}].setArg(0, sizeof(char), &char_ranks_count);
89 | {% set ctx.kernel = ctx.kernel + 1 %}
90 | {% endfor %}
91 | {%- endmacro %}
92 | {{ setup_collective_kernels("broadcast") }}
93 | {{ setup_collective_kernels("reduce") }}
94 | {{ setup_collective_kernels("scatter") }}
95 | {{ setup_collective_kernels("gather") }}
96 |
97 | // move buffers
98 | {% for channel in range(program.channel_count) %}
99 | buffers.push_back(std::move( routing_table_ck_s_{{ channel }}));
100 | buffers.push_back(std::move( routing_table_ck_r_{{ channel }}));
101 | {% endfor %}
102 |
103 | // start the kernels
104 | const int num_kernels = kernel_names.size();
105 | for (int i = num_kernels - 1; i >= 0; i--)
106 | {
107 | queues[i].enqueueTask(kernels[i]);
108 | queues[i].flush();
109 | }
110 |
111 | // return the communicator
112 | SMI_Comm comm{ char_rank, char_ranks_count };
113 | return comm;
114 |
115 | }
116 | {% endfor %}
117 |
--------------------------------------------------------------------------------
/codegen/templates/host_hlslib.cl:
--------------------------------------------------------------------------------
1 | #define __HOST_PROGRAM__
2 | #include
3 | #include
4 | #include
5 | #include
6 |
7 | {% for (name, program) in programs -%}
8 | SMI_Comm SmiInit_{{ name }}(
9 | int rank,
10 | int ranks_count,
11 | const char* routing_dir,
12 | hlslib::ocl::Context &context,
13 | hlslib::ocl::Program &program,
14 | std::vector> &buffers)
15 | {
16 |
17 | const int ports = {{ program.logical_port_count }};
18 | const int cks_table_size = ranks_count;
19 | const int ckr_table_size = ports * 2;
20 | // load routing tables
21 | std::vector> routing_tables_ckr({{ program.channel_count}}, std::vector(ckr_table_size));
22 | std::vector> routing_tables_cks({{ program.channel_count}}, std::vector(cks_table_size));
23 | for (int i = 0; i < {{ program.channel_count }}; i++)
24 | {
25 | LoadRoutingTable(rank, i, cks_table_size, routing_dir, "cks", &routing_tables_cks[i][0]);
26 | LoadRoutingTable(rank, i, ckr_table_size, routing_dir, "ckr", &routing_tables_ckr[i][0]);
27 | }
28 |
29 | // create buffers for CKS/CKR and copy routing tables
30 | {% for channel in range(program.channel_count) %}
31 | hlslib::ocl::Buffer routing_table_device_ck_s_{{ channel }} =
32 | context.MakeBuffer( routing_tables_cks[{{channel}}].cbegin(),
33 | routing_tables_cks[{{channel}}].cend());
34 |
35 | hlslib::ocl::Buffer routing_table_device_ck_r_{{ channel }} =
36 | context.MakeBuffer( routing_tables_ckr[{{channel}}].cbegin(),
37 | routing_tables_ckr[{{channel}}].cend());
38 | {% endfor %}
39 |
40 |
41 | char char_ranks_count=ranks_count;
42 | char char_rank=rank;
43 |
44 | // CK kernels
45 | std::vector comm_kernels;
46 | {% for channel in range(program.channel_count) %}
47 | // cks_{{ channel }}
48 | comm_kernels.emplace_back(program.MakeKernel("smi_kernel_cks_{{ channel }}", routing_table_device_ck_s_{{channel}}, (char)char_ranks_count));
49 |
50 | // ckr_{{ channel }}
51 | comm_kernels.emplace_back(program.MakeKernel("smi_kernel_ckr_{{ channel }}", routing_table_device_ck_r_{{ channel }}, (char)char_rank));
52 |
53 | {% endfor %}
54 |
55 | // Collective kernels
56 | std::vector collective_kernels;
57 | {%- macro generate_collective_kernels(key, kernel_name) %}
58 | {% set ops = program.get_ops_by_type(key) %}
59 | {% for op in ops %}
60 | // {{ key }} {{ op.logical_port }}
61 | collective_kernels.emplace_back(program.MakeKernel("{{ kernel_name }}_{{ op.logical_port }}", (char)char_ranks_count));
62 | {% endfor %}
63 | {%- endmacro %}
64 |
65 | {{ generate_collective_kernels("broadcast", "smi_kernel_bcast") }}
66 | {{ generate_collective_kernels("reduce", "smi_kernel_reduce") }}
67 | {{ generate_collective_kernels("scatter", "smi_kernel_scatter") }}
68 | {{ generate_collective_kernels("gather", "smi_kernel_gather") }}
69 |
70 | // start the kernels
71 | for (auto &k : comm_kernels) {
72 | // Will never terminate, so we don't care about the return value of fork
73 | k.ExecuteTaskFork();
74 | }
75 |
76 | for (auto &k : collective_kernels) {
77 | // Will never terminate, so we don't care about the return value of fork
78 | k.ExecuteTaskFork();
79 | }
80 |
81 | //move created buffers to the vector given my the user
82 | {% for channel in range(program.channel_count) %}
83 | buffers.push_back(std::move(routing_table_device_ck_s_{{channel}}));
84 | buffers.push_back(std::move(routing_table_device_ck_r_{{channel}}));
85 | {% endfor %}
86 |
87 | // return the communicator
88 | SMI_Comm comm{ char_rank, char_ranks_count };
89 | return comm;
90 |
91 | }
92 | {% endfor %}
93 |
--------------------------------------------------------------------------------
/codegen/templates/pop.cl:
--------------------------------------------------------------------------------
1 | {% import 'utils.cl' as utils %}
2 |
3 | {%- macro smi_pop_impl(program, op) -%}
4 | void {{ utils.impl_name_port_type("SMI_Pop", op) }}(SMI_Channel *chan, void *data)
5 | {
6 | // in this case we have to copy the data into the target variable
7 | if (chan->packet_element_id == 0)
8 | {
9 | // no data to be unpacked...receive from the network
10 | chan->net = read_channel_intel({{ op.get_channel("ckr_data") }});
11 | }
12 | chan->processed_elements++;
13 | char *data_recvd = chan->net.data;
14 |
15 | #pragma unroll
16 | for (int ee = 0; ee < {{ op.data_elements_per_packet() }}; ee++)
17 | {
18 | if (ee == chan->packet_element_id)
19 | {
20 | #pragma unroll
21 | for (int jj = 0; jj < {{ op.data_size() }}; jj++)
22 | {
23 | ((char *)data)[jj] = data_recvd[(ee * {{ op.data_size() }}) + jj];
24 | }
25 | }
26 | }
27 |
28 | chan->packet_element_id++;
29 | if (chan->packet_element_id == GET_HEADER_NUM_ELEMS(chan->net.header))
30 | {
31 | chan->packet_element_id = 0;
32 | }
33 | // TODO: This is used to prevent this funny compiler to re-oder the two *_channel_intel operations
34 | // mem_fence(CLK_CHANNEL_MEM_FENCE);
35 | #if defined P2P_RENDEZVOUS
36 | //echange tokens
37 | chan->tokens--;
38 | if (chan->tokens == 0)
39 | {
40 | // At this point, the sender has still max_tokens*7/8 tokens: we have to consider this while we send
41 | // the new tokens to it
42 | unsigned int sender = ((int) ((int) chan->message_size - (int) chan->processed_elements - (int) chan->max_tokens * 7 / 8)) < 0 ? 0: chan->message_size - chan->processed_elements - chan -> max_tokens * 7 / 8;
43 | chan->tokens = (unsigned int) (MIN(chan->max_tokens / 8, sender)); // b/2
44 | SMI_Network_message mess;
45 | *(unsigned int*) mess.data = chan->tokens;
46 | SET_HEADER_DST(mess.header, chan->sender_rank);
47 | SET_HEADER_PORT(mess.header, chan->port);
48 | SET_HEADER_OP(mess.header, SMI_SYNCH);
49 | write_channel_intel({{ op.get_channel("cks_control") }}, mess);
50 | }
51 | #endif
52 | }
53 | {%- endmacro %}
54 |
55 | {%- macro smi_pop_channel(program, op) -%}{#- Emits SMI_Open_receive_channel_<port>_<type>: builds the SMI_Channel descriptor used to receive `count` elements from rank `source` on `port`. -#}
56 | SMI_Channel {{ utils.impl_name_port_type("SMI_Open_receive_channel", op) }}(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm)
57 | {
58 | SMI_Channel chan;
59 | // setup channel descriptor
60 | chan.port = (char) port;
61 | chan.sender_rank = (char) source;
62 | chan.message_size = (unsigned int) count;
63 | chan.data_type = data_type;
64 | chan.op_type = SMI_RECEIVE;
65 | chan.elements_per_packet = {{ op.data_elements_per_packet() }};
66 | chan.max_tokens = {{ op.buffer_size * op.data_elements_per_packet() }};
67 |
68 | #if defined P2P_RENDEZVOUS
69 | chan.tokens = MIN(chan.max_tokens / ((unsigned int) 8), count); // needed to prevent the compiler to optimize-away channel connections
70 | #else
71 | chan.tokens = count; // in this way, the last rendezvous is done at the end of the message. This is needed to prevent the compiler to cut-away internal FIFO buffer connections
72 | #endif
73 | // The receiver sends tokens to the sender once every chan.max_tokens/8 received data elements
74 | // chan.tokens = chan.max_tokens / ((unsigned int) 8);
75 | SET_HEADER_NUM_ELEMS(chan.net.header, 0); // at the beginning no data
76 | chan.packet_element_id = 0; // data per packet
77 | chan.processed_elements = 0;
78 | chan.sender_rank = chan.sender_rank;
79 | chan.receiver_rank = comm[0];
80 | // comm is not directly used in this first implementation
81 | return chan;
82 | }
83 | {%- endmacro -%}
84 |
--------------------------------------------------------------------------------
/codegen/templates/push.cl:
--------------------------------------------------------------------------------
1 | {% import 'utils.cl' as utils %}
2 |
3 | {%- macro smi_push_impl(program, op) -%}
4 | void {{ utils.impl_name_port_type("SMI_Push_flush", op) }}(SMI_Channel *chan, void* data, int immediate)
5 | {
6 | char* conv = (char*) data;
7 | COPY_DATA_TO_NET_MESSAGE(chan, chan->net, conv);
8 | chan->processed_elements++;
9 | chan->packet_element_id++;
10 |
11 | // send the network packet if it full or we reached the message size
12 | if (chan->packet_element_id == chan->elements_per_packet || immediate || chan->processed_elements == chan->message_size)
13 | {
14 | SET_HEADER_NUM_ELEMS(chan->net.header, chan->packet_element_id);
15 | chan->packet_element_id = 0;
16 | write_channel_intel({{ op.get_channel("cks_data") }}, chan->net);
17 | }
18 | // This fence is not mandatory, the two channel operations can be
19 | // performed independently
20 | // mem_fence(CLK_CHANNEL_MEM_FENCE);
21 | #if defined P2P_RENDEZVOUS
22 | chan->tokens--;
23 | if (chan->tokens == 0)
24 | {
25 | // receives also with tokens=0
26 | // wait until the message arrives
27 | SMI_Network_message mess = read_channel_intel({{ op.get_channel("ckr_control") }});
28 | unsigned int tokens = *(unsigned int *) mess.data;
29 | chan->tokens += tokens; // tokens
30 | }
31 | #endif
32 | }
33 | void {{ utils.impl_name_port_type("SMI_Push", op) }}(SMI_Channel *chan, void* data)
34 | {
35 | {{ utils.impl_name_port_type("SMI_Push_flush", op) }}(chan, data, 0);
36 | }
37 | {%- endmacro %}
38 |
39 | {%- macro smi_push_channel(program, op) -%}{#- Emits SMI_Open_send_channel_<port>_<type>: builds the SMI_Channel descriptor for sending `count` elements to rank `destination` and pre-fills the packet header (destination, port, SMI_SEND). -#}
40 | SMI_Channel {{ utils.impl_name_port_type("SMI_Open_send_channel", op) }}(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm)
41 | {
42 | SMI_Channel chan;
43 | // setup channel descriptor
44 | chan.port = (char) port;
45 | chan.message_size = (unsigned int) count;
46 | chan.data_type = data_type;
47 | chan.op_type = SMI_SEND;
48 | chan.receiver_rank = (char) destination;
49 | // At the beginning, the sender can sends as many data items as the buffer size
50 | // in the receiver allows
51 | chan.elements_per_packet = {{ op.data_elements_per_packet() }};
52 | chan.max_tokens = {{ op.buffer_size * op.data_elements_per_packet() }};
53 |
54 | // setup header for the message
55 | SET_HEADER_DST(chan.net.header, chan.receiver_rank);
56 | SET_HEADER_PORT(chan.net.header, chan.port);
57 | SET_HEADER_OP(chan.net.header, SMI_SEND);
58 | #if defined P2P_RENDEZVOUS
59 | chan.tokens = MIN(chan.max_tokens, count); // needed to prevent the compiler to optimize-away channel connections
60 | #else // eager transmission protocol
61 | chan.tokens = count; // in this way, the last rendezvous is done at the end of the message. This is needed to prevent the compiler to cut-away internal FIFO buffer connections
62 | #endif
63 | chan.receiver_rank = destination;
64 | chan.processed_elements = 0;
65 | chan.packet_element_id = 0;
66 | chan.sender_rank = comm[0];
67 | // chan.comm = comm; // comm is not used in this first implemenation
68 | return chan;
69 | }
70 | {%- endmacro -%}
71 |
--------------------------------------------------------------------------------
/codegen/templates/utils.cl:
--------------------------------------------------------------------------------
1 | {%- macro impl_name_port_type(name, op) -%}{#- Renders "<name>_<op.logical_port>_<op.data_type>", the per-port/per-type symbol name used by the generated functions. -#}{{ name }}_{{ op.logical_port }}_{{ op.data_type }}{%- endmacro -%}
2 |
--------------------------------------------------------------------------------
/codegen/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import shutil
4 | import subprocess
5 | import sys
6 | from typing import Union, List
7 |
8 | import pytest
9 | from networkx import Graph
10 |
11 | from common import RoutingContext
12 | from ops import SmiOperation
13 | from serialization import parse_smi_operation
14 |
15 | sys.path.append(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
16 |
17 | from program import Channel, ProgramMapping, Program
18 | from routing import create_routing_context
19 |
20 | PYTEST_DIR = os.path.dirname(__file__)
21 | WORK_DIR = os.path.join(PYTEST_DIR, "work")
22 | DATA_DIR = os.path.join(PYTEST_DIR, "data")
23 |
24 | ROOT_DIR = os.path.dirname(os.path.dirname(PYTEST_DIR))
25 | REWRITER_BUILD_DIR = os.environ.get("REWRITER_DIR", ROOT_DIR.join("build/rewriter"))
26 |
27 |
28 | def prepare():
29 | """Prepare working directory
30 | If directory exists then it is cleaned;
31 | If it does not exists then it is created.
32 | """
33 | if os.path.isdir(WORK_DIR):
34 | for root, dirs, files in os.walk(WORK_DIR):
35 | for d in dirs:
36 | os.chmod(os.path.join(root, d), 0o700)
37 | for f in files:
38 | os.chmod(os.path.join(root, f), 0o700)
39 | for item in os.listdir(WORK_DIR):
40 | path = os.path.join(WORK_DIR, item)
41 | if os.path.isfile(path):
42 | os.unlink(path)
43 | else:
44 | shutil.rmtree(path)
45 | else:
46 | os.makedirs(WORK_DIR)
47 | os.chdir(WORK_DIR)
48 |
49 |
50 | def get_routing_ctx(program: Program, connections) -> RoutingContext:
51 | fpgas = tuple(fpga for (fpga, _) in connections.keys()) + tuple(fpga for (fpga, _) in connections.values())
52 | fpga_map = {
53 | fpga: program for fpga in fpgas
54 | }
55 | for (k, v) in dict(connections).items():
56 | connections[v] = k
57 |
58 | mapping = ProgramMapping([program], fpga_map)
59 | return create_routing_context(connections, mapping)
60 |
61 |
62 | def get_channel(graph: Graph, key: str, index: int) -> Union[Channel, None]:
63 | for channel in graph.nodes:
64 | if channel.fpga.key() == key and channel.index == index:
65 | return channel
66 | return None
67 |
68 |
69 | def get_data(path: str) -> str:
70 |     return os.path.join(DATA_DIR, path)  # resolve `path` inside the tests' data directory (DATA_DIR)
71 |
72 |
73 | class FileTester:
74 | def check(self, path: str, content: str):
75 | file_path = get_data(path)
76 | with open(file_path) as f:
77 | file_content = f.read()
78 |
79 | ok = False
80 | try:
81 | assert file_content == content
82 | ok = True
83 | finally:
84 | if not ok:
85 | with open("{}.fail".format(os.path.basename(path)), "w") as f:
86 | f.write(content)
87 |
88 |
89 | @pytest.yield_fixture(autouse=True, scope="function")
90 | def file_tester():
91 | prepare()
92 | yield FileTester()
93 |
94 |
95 | class RewriteTester:
96 | def check(self, path: str, operations: List[SmiOperation]):
97 | orig_file = get_data("{}.cl".format(path))
98 | work_file = os.path.join(WORK_DIR, "{}.cl".format(path))
99 | expected_file = get_data("{}-expected.cl".format(path))
100 |
101 | shutil.copyfile(orig_file, work_file)
102 | ok = False
103 |
104 | try:
105 | result = subprocess.run([
106 | os.path.join(REWRITER_BUILD_DIR, "rewriter"),
107 | "-extra-arg=-I{}".format(os.path.join(ROOT_DIR, "include")),
108 | work_file
109 | ], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
110 | stdout = result.stdout.decode()
111 | parsed_ops = [parse_smi_operation(json.loads(line)) for line in stdout.splitlines() if line]
112 | assert parsed_ops == operations
113 |
114 | with open(expected_file) as expected:
115 | with open(work_file) as work:
116 | assert work.read() == expected.read()
117 |
118 | ok = True
119 | finally:
120 | if ok:
121 | shutil.rmtree(WORK_DIR, ignore_errors=True)
122 | else:
123 | print(result.stderr.decode())
124 |
125 |
126 | @pytest.yield_fixture(autouse=True, scope="function")
127 | def rewrite_tester():
128 | prepare()
129 | yield RewriteTester()
130 |
--------------------------------------------------------------------------------
/codegen/tests/data/buffer-size-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include
4 |
5 | void SMI_Push_0_int(SMI_Channel* chan, void* data);
6 | SMI_Channel SMI_Open_send_channel_0_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
7 | __kernel void app_0(const int N, const char dst)
8 | {
9 | SMI_Comm comm;
10 | for (int i = 0; i < N; i++)
11 | {
12 | SMI_Channel chan_send = SMI_Open_send_channel_0_int(1, SMI_INT, dst, 0, comm);
13 | SMI_Push_0_int(&chan_send, &i);
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/codegen/tests/data/buffer-size.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 | for (int i = 0; i < N; i++)
9 | {
10 | SMI_Channel chan_send = SMI_Open_send_channel_ad(1, SMI_INT, dst, 0, comm, 128);
11 | SMI_Push(&chan_send, &i);
12 | }
13 | }
14 |
--------------------------------------------------------------------------------
/codegen/tests/data/complex-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include
4 |
5 | void SMI_Gather_5_int(SMI_GatherChannel* chan, void* send_data, void* rcv_data);
6 | SMI_GatherChannel SMI_Open_gather_channel_5_int(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm);
7 | void SMI_Scatter_4_short(SMI_ScatterChannel* chan, void* data_snd, void* data_rcv);
8 | SMI_ScatterChannel SMI_Open_scatter_channel_4_short(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm);
9 | void SMI_Reduce_3_float(SMI_RChannel* chan, void* data_snd, void* data_rcv);
10 | SMI_RChannel SMI_Open_reduce_channel_3_float(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm);
11 | SMI_BChannel SMI_Open_bcast_channel_2_int(int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm);
12 | void SMI_Pop_1_double(SMI_Channel* chan, void* data);
13 | SMI_Channel SMI_Open_receive_channel_1_double(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm);
14 | void SMI_Push_0_char(SMI_Channel* chan, void* data);
15 | SMI_Channel SMI_Open_send_channel_0_char(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
16 | __kernel void app_0(const int N, const char dst)
17 | {
18 | SMI_Comm comm;
19 |
20 | #pragma unroll
21 | for (int i = 0; i < N; i++)
22 | {
23 | float16 var1;
24 | uint var2;
25 | // push
26 | SMI_Channel chan_send = SMI_Open_send_channel_0_char(1, SMI_CHAR, dst, 0, comm);
27 | SMI_Push_0_char(&chan_send, &var1);
28 |
29 | // pop
30 | SMI_Channel chan_recv = SMI_Open_receive_channel_1_double(1, SMI_DOUBLE, dst, 1, comm);
31 | SMI_Pop_1_double(&chan_recv, &var2);
32 |
33 | // broadcast
34 | SMI_BChannel chan_bcast = SMI_Open_bcast_channel_2_int(1, SMI_INT, 2, 1, comm);
35 | SMI_Bcast(&chan_bcast, &i, &i);
36 |
37 | // reduce
38 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel_3_float(1, SMI_FLOAT, SMI_ADD, 3, 1, comm);
39 | SMI_Reduce_3_float(&chan_reduce, &i, &i);
40 |
41 | // scatter
42 | SMI_ScatterChannel chan_scatter = SMI_Open_scatter_channel_4_short(1, 1, SMI_SHORT, 4, 1, comm);
43 | SMI_Scatter_4_short(&chan_scatter, &i, &i);
44 |
45 | // gather
46 | SMI_GatherChannel chan_gather = SMI_Open_gather_channel_5_int(1, 1, SMI_INT, 5, 1, comm);
47 | SMI_Gather_5_int(&chan_gather, &i, &i);
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/codegen/tests/data/complex.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 |
9 | #pragma unroll
10 | for (int i = 0; i < N; i++)
11 | {
12 | float16 var1;
13 | uint var2;
14 | // push
15 | SMI_Channel chan_send = SMI_Open_send_channel(1, SMI_CHAR, dst, 0, comm);
16 | SMI_Push(&chan_send, &var1);
17 |
18 | // pop
19 | SMI_Channel chan_recv = SMI_Open_receive_channel(1, SMI_DOUBLE, dst, 1, comm);
20 | SMI_Pop(&chan_recv, &var2);
21 |
22 | // broadcast
23 | SMI_BChannel chan_bcast = SMI_Open_bcast_channel(1, SMI_INT, 2, 1, comm);
24 | SMI_Bcast(&chan_bcast, &i, &i);
25 |
26 | // reduce
27 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel(1, SMI_FLOAT, SMI_ADD, 3, 1, comm);
28 | SMI_Reduce(&chan_reduce, &i, &i);
29 |
30 | // scatter
31 | SMI_ScatterChannel chan_scatter = SMI_Open_scatter_channel(1, 1, SMI_SHORT, 4, 1, comm);
32 | SMI_Scatter(&chan_scatter, &i, &i);
33 |
34 | // gather
35 | SMI_GatherChannel chan_gather = SMI_Open_gather_channel(1, 1, SMI_INT, 5, 1, comm);
36 | SMI_Gather(&chan_gather, &i, &i);
37 | }
38 | }
39 |
--------------------------------------------------------------------------------
/codegen/tests/data/constant-variable-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | void SMI_Push_5_int(SMI_Channel* chan, void* data);
6 | SMI_Channel SMI_Open_send_channel_5_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
7 | __kernel void app_0(const int N, const char dst)
8 | {
9 | SMI_Comm comm;
10 | for (int i = 0; i < N; i++)
11 | {
12 | const int port = 5;
13 | SMI_Channel chan_send1 = SMI_Open_send_channel_5_int(1, SMI_INT, dst, port, comm);
14 | SMI_Push_5_int(&chan_send1, &i);
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/codegen/tests/data/constant-variable.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 | for (int i = 0; i < N; i++)
9 | {
10 | const int port = 5;
11 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, port, comm);
12 | SMI_Push(&chan_send1, &i);
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/codegen/tests/data/data-type-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | void SMI_Pop_4_int(SMI_Channel* chan, void* data);
6 | SMI_Channel SMI_Open_receive_channel_4_int(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm);
7 | void SMI_Pop_3_float(SMI_Channel* chan, void* data);
8 | SMI_Channel SMI_Open_receive_channel_3_float(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm);
9 | void SMI_Pop_1_double(SMI_Channel* chan, void* data);
10 | SMI_Channel SMI_Open_receive_channel_1_double(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm);
11 | void SMI_Push_0_char(SMI_Channel* chan, void* data);
12 | SMI_Channel SMI_Open_send_channel_0_char(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
13 | __kernel void app_0(const int N, const char dst)
14 | {
15 | SMI_Comm comm;
16 | for (int i = 0; i < N; i++)
17 | {
18 | SMI_Channel chan_send = SMI_Open_send_channel_0_char(1, SMI_CHAR, dst, 0, comm);
19 | SMI_Push_0_char(&chan_send, &i);
20 | SMI_Channel chan_recv = SMI_Open_receive_channel_1_double(1, SMI_DOUBLE, dst, 1, comm);
21 | SMI_Pop_1_double(&chan_recv, &i);
22 | SMI_Channel chan_recv1 = SMI_Open_receive_channel_3_float(1, SMI_FLOAT, dst, 3, comm);
23 | SMI_Pop_3_float(&chan_recv1, &i);
24 | SMI_Channel chan_recv2 = SMI_Open_receive_channel_4_int(1, SMI_INT, dst, 4, comm);
25 | SMI_Pop_4_int(&chan_recv2, &i);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/codegen/tests/data/data-type.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 | for (int i = 0; i < N; i++)
9 | {
10 | SMI_Channel chan_send = SMI_Open_send_channel(1, SMI_CHAR, dst, 0, comm);
11 | SMI_Push(&chan_send, &i);
12 | SMI_Channel chan_recv = SMI_Open_receive_channel(1, SMI_DOUBLE, dst, 1, comm);
13 | SMI_Pop(&chan_recv, &i);
14 | SMI_Channel chan_recv1 = SMI_Open_receive_channel(1, SMI_FLOAT, dst, 3, comm);
15 | SMI_Pop(&chan_recv1, &i);
16 | SMI_Channel chan_recv2 = SMI_Open_receive_channel(1, SMI_INT, dst, 4, comm);
17 | SMI_Pop(&chan_recv2, &i);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/codegen/tests/data/kernel-attribute-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | void SMI_Push_0_int(SMI_Channel* chan, void* data);
6 | SMI_Channel SMI_Open_send_channel_0_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
7 | __kernel void app_0(const int N, const char dst)
8 | {
9 | SMI_Comm comm;
10 | for (int i = 0; i < N; i++)
11 | {
12 | SMI_Channel chan_send1 = SMI_Open_send_channel_0_int(1, SMI_INT, dst, 0, comm);
13 | SMI_Push_0_int(&chan_send1, &i);
14 | }
15 | }
16 |
17 | void SMI_Push_1_int(SMI_Channel* chan, void* data);
18 | SMI_Channel SMI_Open_send_channel_1_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
19 | kernel void app_1(const int N, const char dst)
20 | {
21 | SMI_Comm comm;
22 | for (int i = 0; i < N; i++)
23 | {
24 | SMI_Channel chan_send1 = SMI_Open_send_channel_1_int(1, SMI_INT, dst, 1, comm);
25 | SMI_Push_1_int(&chan_send1, &i);
26 | }
27 | }
28 |
29 | kernelx void app_2(const int N, const char dst)
30 | {
31 | SMI_Comm comm;
32 | for (int i = 0; i < N; i++)
33 | {
34 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 2, comm);
35 | SMI_Push(&chan_send1, &i);
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/codegen/tests/data/kernel-attribute.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 | for (int i = 0; i < N; i++)
9 | {
10 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 0, comm);
11 | SMI_Push(&chan_send1, &i);
12 | }
13 | }
14 |
15 | kernel void app_1(const int N, const char dst)
16 | {
17 | SMI_Comm comm;
18 | for (int i = 0; i < N; i++)
19 | {
20 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 1, comm);
21 | SMI_Push(&chan_send1, &i);
22 | }
23 | }
24 |
25 | kernelx void app_2(const int N, const char dst)
26 | {
27 | SMI_Comm comm;
28 | for (int i = 0; i < N; i++)
29 | {
30 | SMI_Channel chan_send1 = SMI_Open_send_channel(1, SMI_INT, dst, 2, comm);
31 | SMI_Push(&chan_send1, &i);
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/codegen/tests/data/port-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | void SMI_Pop_3_int(SMI_Channel* chan, void* data);
6 | SMI_Channel SMI_Open_receive_channel_3_int(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm);
7 | void SMI_Push_0_int(SMI_Channel* chan, void* data);
8 | SMI_Channel SMI_Open_send_channel_0_int(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
9 | __kernel void app_0(const int N, const char dst)
10 | {
11 | SMI_Comm comm;
12 | for (int i = 0; i < N; i++)
13 | {
14 | SMI_Channel chan_send = SMI_Open_send_channel_0_int(1, SMI_INT, dst, 0, comm);
15 | SMI_Push_0_int(&chan_send, &i);
16 | SMI_Channel chan_recv = SMI_Open_receive_channel_3_int(1, SMI_INT, dst, 3, comm);
17 | SMI_Pop_3_int(&chan_recv, &i);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/codegen/tests/data/port.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 | for (int i = 0; i < N; i++)
9 | {
10 | SMI_Channel chan_send = SMI_Open_send_channel(1, SMI_INT, dst, 0, comm);
11 | SMI_Push(&chan_send, &i);
12 | SMI_Channel chan_recv = SMI_Open_receive_channel(1, SMI_INT, dst, 3, comm);
13 | SMI_Pop(&chan_recv, &i);
14 | }
15 | }
16 |
--------------------------------------------------------------------------------
/codegen/tests/data/reduce-expected.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | void SMI_Reduce_2_int(SMI_RChannel* chan, void* data_snd, void* data_rcv);
6 | SMI_RChannel SMI_Open_reduce_channel_2_int(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm);
7 | void SMI_Reduce_1_int(SMI_RChannel* chan, void* data_snd, void* data_rcv);
8 | SMI_RChannel SMI_Open_reduce_channel_1_int(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm);
9 | void SMI_Reduce_0_int(SMI_RChannel* chan, void* data_snd, void* data_rcv);
10 | SMI_RChannel SMI_Open_reduce_channel_0_int(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm);
11 | __kernel void app_0(const int N, const char dst)
12 | {
13 | SMI_Comm comm;
14 | for (int i = 0; i < N; i++)
15 | {
16 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel_0_int(1, SMI_INT, SMI_ADD, 0, 1, comm);
17 | SMI_Reduce_0_int(&chan_reduce, &i, &i);
18 |
19 | SMI_RChannel chan_reduce1 = SMI_Open_reduce_channel_1_int(1, SMI_INT, SMI_MIN, 1, 1, comm);
20 | SMI_Reduce_1_int(&chan_reduce1, &i, &i);
21 |
22 | SMI_RChannel chan_reduce2 = SMI_Open_reduce_channel_2_int(1, SMI_INT, SMI_MAX, 2, 1, comm);
23 | SMI_Reduce_2_int(&chan_reduce2, &i, &i);
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/codegen/tests/data/reduce.cl:
--------------------------------------------------------------------------------
1 | #pragma OPENCL EXTENSION cl_intel_channels : enable
2 |
3 | #include <smi.h>
4 |
5 | __kernel void app_0(const int N, const char dst)
6 | {
7 | SMI_Comm comm;
8 | for (int i = 0; i < N; i++)
9 | {
10 | SMI_RChannel chan_reduce = SMI_Open_reduce_channel(1, SMI_INT, SMI_ADD, 0, 1, comm);
11 | SMI_Reduce(&chan_reduce, &i, &i);
12 |
13 | SMI_RChannel chan_reduce1 = SMI_Open_reduce_channel(1, SMI_INT, SMI_MIN, 1, 1, comm);
14 | SMI_Reduce(&chan_reduce1, &i, &i);
15 |
16 | SMI_RChannel chan_reduce2 = SMI_Open_reduce_channel(1, SMI_INT, SMI_MAX, 2, 1, comm);
17 | SMI_Reduce(&chan_reduce2, &i, &i);
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/codegen/tests/test_codegen.py:
--------------------------------------------------------------------------------
1 | from codegen import generate_program_device, generate_program_host
2 | from ops import Push, Pop, Broadcast, Reduce, Scatter, Gather
3 | from program import Program, ProgramMapping
4 | from routing import create_routing_context
5 |
6 |
7 | def test_codegen_device(file_tester):
8 | program = Program([
9 | Push(0, "short", 8),
10 | Pop(0),
11 | Push(1),
12 | Pop(2, "char", 8),
13 | Broadcast(3, "float", 64),
14 | Broadcast(4, "int"),
15 | Push(5, "double", 32),
16 | Reduce(6, "float", 16, "add"),
17 | Scatter(7, "double"),
18 | Gather(8, "char")
19 | ])
20 |
21 | mapping = ProgramMapping([program], {
22 | "n1:f1": program,
23 | "n1:f2": program,
24 | "n2:f1": program,
25 | "n3:f1": program
26 | })
27 |
28 | connections = {
29 | ("n1:f1", 0): ("n1:f2", 0),
30 | ("n1:f2", 1): ("n2:f1", 1),
31 | ("n1:f2", 2): ("n3:f1", 1),
32 | ("n2:f1", 0): ("n1:f1", 1),
33 | }
34 |
35 | ctx = create_routing_context(connections, mapping)
36 |
37 | file_tester.check("smi-device-1.h", generate_program_device(ctx.fpgas[0], ctx.fpgas, ctx.graph, 4))
38 |
39 |
40 | def test_codegen_host(file_tester):
41 | program = Program([
42 | Push(0),
43 | Pop(0),
44 | Push(1),
45 | Pop(2),
46 | Broadcast(3),
47 | Broadcast(4),
48 | Push(5),
49 | Reduce(6, "float", 16, "min")
50 | ])
51 |
52 | file_tester.check("smi-host-1.h", generate_program_host([("program", program)]))
53 |
--------------------------------------------------------------------------------
/codegen/tests/test_parse.py:
--------------------------------------------------------------------------------
1 | from ops import Push, Pop, Broadcast, Reduce
2 | from serialization import parse_program, parse_routing_file
3 |
4 |
5 | def test_parse_program():
6 | program = parse_program("""
7 | {
8 | "consecutive_reads": 16,
9 | "max_ranks": 16,
10 | "p2p_randezvous": false,
11 | "operations": [{
12 | "port": 0,
13 | "type": "push",
14 | "data_type": "int"
15 | }, {
16 | "port": 1,
17 | "type": "push",
18 | "data_type": "char"
19 | }, {
20 | "port": 2,
21 | "type": "pop"
22 | }, {
23 | "port": 3,
24 | "type": "broadcast",
25 | "data_type": "int"
26 | }, {
27 | "port": 4,
28 | "type": "reduce",
29 | "data_type": "float",
30 | "args": {
31 | "op_type": "add"
32 | }
33 | }, {
34 | "port": 5,
35 | "type": "scatter",
36 | "data_type": "short"
37 | }, {
38 | "port": 6,
39 | "type": "gather",
40 | "data_type": "double",
41 | "buffer_size": 32
42 | }]
43 | }
44 | """)
45 | assert program.consecutive_read_limit == 16
46 | assert program.max_ranks == 16
47 | assert len(program.operations) == 7
48 | assert isinstance(program.operations[0], Push)
49 | assert program.operations[0].logical_port == 0
50 | assert isinstance(program.operations[2], Pop)
51 | assert program.operations[2].logical_port == 2
52 |
53 | assert isinstance(program.operations[3], Broadcast)
54 | assert isinstance(program.operations[4], Reduce)
55 | assert program.operations[4].data_type == "float"
56 |
57 | assert program.operations[6].buffer_size == 32
58 |
59 |
60 | def test_parse_connections():
61 | (connections, _) = parse_routing_file("""
62 | {
63 | "fpgas": {},
64 | "connections": {
65 | "fpga-0015:acl0:ch0": "fpga-0016:acl0:ch0",
66 | "fpga-0015:acl0:ch1": "fpga-0015:acl1:ch1",
67 | "fpga-0015:acl0:ch2": "fpga-0016:acl1:ch2",
68 | "fpga-0015:acl1:ch0": "fpga-0016:acl1:ch0",
69 | "fpga-0015:acl1:ch2": "fpga-0016:acl0:ch2",
70 | "fpga-0016:acl0:ch1": "fpga-0016:acl1:ch1"
71 | }
72 | }
73 | """, ignore_programs=True)
74 | assert connections == {('fpga-0015:acl0', 0): ('fpga-0016:acl0', 0),
75 | ('fpga-0015:acl0', 1): ('fpga-0015:acl1', 1),
76 | ('fpga-0015:acl0', 2): ('fpga-0016:acl1', 2),
77 | ('fpga-0015:acl1', 0): ('fpga-0016:acl1', 0),
78 | ('fpga-0015:acl1', 1): ('fpga-0015:acl0', 1),
79 | ('fpga-0015:acl1', 2): ('fpga-0016:acl0', 2),
80 | ('fpga-0016:acl0', 0): ('fpga-0015:acl0', 0),
81 | ('fpga-0016:acl0', 1): ('fpga-0016:acl1', 1),
82 | ('fpga-0016:acl0', 2): ('fpga-0015:acl1', 2),
83 | ('fpga-0016:acl1', 0): ('fpga-0015:acl1', 0),
84 | ('fpga-0016:acl1', 1): ('fpga-0016:acl0', 1),
85 | ('fpga-0016:acl1', 2): ('fpga-0015:acl0', 2)}
86 |
--------------------------------------------------------------------------------
/codegen/tests/test_program.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from ops import Push, Pop, Broadcast, KEY_CKS_DATA, KEY_CKS_CONTROL, KEY_CKR_DATA, KEY_BROADCAST
4 | from program import Program, FailedAllocation
5 |
6 |
7 | def test_allocation_fail():
8 | with pytest.raises(FailedAllocation):
9 | Program([
10 | Push(0),
11 | Broadcast(0)
12 | ])
13 |
14 |
15 | def test_allocation_channel_to_ports():
16 | program = Program([
17 | Push(0),
18 | Pop(0),
19 | Push(1),
20 | Push(2),
21 | Pop(2)
22 | ])
23 |
24 | ops = program.operations
25 | assert program.get_channel_allocations(0) == [
26 | (ops[0], "cks_data"),
27 | (ops[4], "cks_control"),
28 | (ops[1], "ckr_data"),
29 | (ops[3], "ckr_control"),
30 | ]
31 | assert program.get_channel_allocations(1) == [
32 | (ops[2], "cks_data"),
33 | (ops[4], "ckr_data"),
34 | ]
35 | assert program.get_channel_allocations(2) == [
36 | (ops[3], "cks_data"),
37 | (ops[0], "ckr_control"),
38 | ]
39 | assert program.get_channel_allocations(3) == [
40 | (ops[1], "cks_control"),
41 | (ops[2], "ckr_control"),
42 | ]
43 |
44 |
45 | def test_allocation_get_channel():
46 | program = Program([
47 | Push(0),
48 | Pop(0),
49 | Push(1),
50 | Push(2),
51 | Pop(2)
52 | ])
53 |
54 | assert program.get_channel_for_port_key(0, KEY_CKS_DATA) == 0
55 | assert program.get_channel_for_port_key(0, KEY_CKS_CONTROL) == 3
56 | assert program.get_channel_for_port_key(1, KEY_CKR_DATA) is None
57 | assert program.get_channel_for_port_key(2, KEY_CKS_DATA) == 2
58 |
--------------------------------------------------------------------------------
/codegen/tests/test_rewriter.py:
--------------------------------------------------------------------------------
1 | from ops import Push, Pop, Broadcast, Reduce, Scatter, Gather
2 |
3 |
4 | def test_rewriter_port(rewrite_tester):
5 | rewrite_tester.check("port", [
6 | Push(0, "int"),
7 | Pop(3, "int")
8 | ])
9 |
10 |
11 | def test_rewriter_kernel_attribute(rewrite_tester):
12 | rewrite_tester.check("kernel-attribute", [
13 | Push(0, "int"),
14 | Push(1, "int")
15 | ])
16 |
17 |
18 | def test_rewriter_constant_variable(rewrite_tester):
19 | rewrite_tester.check("constant-variable", [
20 | Push(5, "int"),
21 | ])
22 |
23 |
24 | def test_rewriter_data_type(rewrite_tester):
25 | rewrite_tester.check("data-type", [
26 | Push(0, "char"),
27 | Pop(1, "double"),
28 | Pop(3, "float"),
29 | Pop(4, "int")
30 | ])
31 |
32 |
33 | def test_rewriter_buffer_size(rewrite_tester):
34 | rewrite_tester.check("buffer-size", [
35 | Push(0, "int", 128),
36 | ])
37 |
38 |
39 | def test_rewriter_complex(rewrite_tester):
40 | rewrite_tester.check("complex", [
41 | Push(0, "char"),
42 | Pop(1, "double"),
43 | Broadcast(2, "int"),
44 | Reduce(3, "float", op_type="add"),
45 | Scatter(4, "short"),
46 | Gather(5, "int"),
47 | ])
48 |
49 |
50 | def test_rewriter_reduce(rewrite_tester):
51 | rewrite_tester.check("reduce", [
52 | Reduce(0, op_type="add"),
53 | Reduce(1, op_type="min"),
54 | Reduce(2, op_type="max"),
55 | ])
56 |
--------------------------------------------------------------------------------
/codegen/tests/test_routing.py:
--------------------------------------------------------------------------------
1 | import networkx
2 |
3 | from program import ProgramMapping, Program
4 | from routing import load_inter_fpga_connections, create_routing_context
5 |
6 |
7 | def test_load_inter_fpga_connections():
8 | program = Program([])
9 | mapping = ProgramMapping([program], {
10 | "n1:f1": program,
11 | "n1:f2": program,
12 | "n2:f1": program
13 | })
14 |
15 | connections = {
16 | ("n1:f1", 0): ("n1:f2", 0),
17 | ("n1:f2", 1): ("n2:f1", 1),
18 | ("n2:f1", 0): ("n1:f1", 1),
19 | }
20 |
21 | graph = networkx.Graph()
22 | fpgas = load_inter_fpga_connections(graph, connections, mapping)
23 |
24 | assert len(fpgas) == 3
25 | fpgas = sorted(fpgas, key=lambda f: f.key())
26 | assert fpgas[0].program is program
27 |
28 | assert list(graph.edges(fpgas[0].channels[0])) == [(fpgas[0].channels[0], fpgas[1].channels[0])]
29 | assert list(graph.edges(fpgas[0].channels[1])) == [(fpgas[0].channels[1], fpgas[2].channels[0])]
30 | assert list(graph.edges(fpgas[1].channels[1])) == [(fpgas[1].channels[1], fpgas[2].channels[1])]
31 | assert list(graph.edges(fpgas[2].channels[0])) == [(fpgas[2].channels[0], fpgas[0].channels[1])]
32 |
33 |
34 | def test_routing_context():
35 | program = Program([])
36 | mapping = ProgramMapping([program], {
37 | "n1:f1": program,
38 | "n1:f2": program,
39 | "n2:f1": program,
40 | "n3:f1": program
41 | })
42 |
43 | connections = {
44 | ("n1:f1", 0): ("n1:f2", 0),
45 | ("n1:f2", 1): ("n2:f1", 1),
46 | ("n1:f2", 2): ("n3:f1", 1),
47 | ("n2:f1", 0): ("n1:f1", 1),
48 | }
49 |
50 | ctx = create_routing_context(connections, mapping)
51 | fpgas = ctx.fpgas
52 | assert ctx.routes[fpgas[0].channels[0]][fpgas[3].channels[3]] == [
53 | fpgas[0].channels[0],
54 | fpgas[1].channels[0],
55 | fpgas[1].channels[2],
56 | fpgas[3].channels[1],
57 | fpgas[3].channels[3]
58 | ]
59 |
--------------------------------------------------------------------------------
/codegen/tests/test_routing_table.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from conftest import get_routing_ctx, get_channel
3 |
4 | from ops import Push, Pop
5 | from program import FPGA, Program, CHANNELS_PER_FPGA
6 | from routing_table import cks_routing_table, NoRouteFound, ckr_routing_table
7 |
8 |
9 | def test_cks_table():
10 | ctx = get_routing_ctx(Program([
11 | Push(0),
12 | Push(1),
13 | Pop(2)
14 | ]), {
15 | ("N0:F0", 0): ("N0:F1", 0),
16 | ("N1:F0", 0): ("N0:F0", 1)
17 | })
18 |
19 | graph, routes, fpgas = (ctx.graph, ctx.routes, ctx.fpgas)
20 |
21 | a = get_channel(graph, "N0:F0", 0)
22 | assert cks_routing_table(routes, fpgas, a) == [1, 0, 2]
23 |
24 | b = get_channel(graph, "N0:F0", 1)
25 | assert cks_routing_table(routes, fpgas, b) == [1, 2, 0]
26 |
27 | c = get_channel(graph, "N0:F1", 0)
28 | assert cks_routing_table(routes, fpgas, c) == [0, 1, 0]
29 |
30 | d = get_channel(graph, "N1:F0", 0)
31 | assert cks_routing_table(routes, fpgas, d) == [0, 0, 1]
32 |
33 |
34 | def test_ckr_table():
35 | program = Program([
36 | Push(0),
37 | Pop(1),
38 | Push(2),
39 | Pop(3),
40 | Pop(4)
41 | ])
42 | fpga = FPGA("n", "f", program)
43 |
44 | assert ckr_routing_table(fpga.channels[0], CHANNELS_PER_FPGA, program) == [0, 3, 4, 0, 0, 5, 1, 0, 2, 0]
45 | assert ckr_routing_table(fpga.channels[1], CHANNELS_PER_FPGA, program) == [0, 3, 1, 0, 0, 1, 4, 0, 2, 0]
46 | assert ckr_routing_table(fpga.channels[2], CHANNELS_PER_FPGA, program) == [0, 3, 1, 0, 0, 1, 2, 0, 4, 0]
47 | assert ckr_routing_table(fpga.channels[3], CHANNELS_PER_FPGA, program) == [0, 4, 1, 0, 0, 1, 2, 0, 3, 0]
48 |
49 |
50 | def test_ckr_no_route():
51 | ctx = get_routing_ctx(Program([]), {
52 | ("N0:F0", 0): ("N0:F1", 0),
53 | ("N1:F0", 0): ("N1:F2", 1)
54 | })
55 |
56 | graph, routes, fpgas = (ctx.graph, ctx.routes, ctx.fpgas)
57 | ch = get_channel(graph, "N0:F0", 0)
58 | with pytest.raises(NoRouteFound):
59 | cks_routing_table(routes, fpgas, ch)
60 |
--------------------------------------------------------------------------------
/codegen/tests/test_utils.py:
--------------------------------------------------------------------------------
1 | from utils import round_robin
2 |
3 |
4 | def test_round_robin():
5 | assert round_robin([], 0, 1) == []
6 | assert round_robin([1], 0, 1) == [1]
7 | assert round_robin([1], 1, 2) == []
8 | assert round_robin([1, 2], 0, 2) == [1]
9 | assert round_robin([1, 2], 1, 2) == [2]
10 | assert round_robin([1, 2, 3, 4, 5, 6, 7], 1, 4) == [2, 6]
11 |
--------------------------------------------------------------------------------
/codegen/topology_file_generator.py:
--------------------------------------------------------------------------------
1 | '''
2 | This program generates a dummy topology file, which can be used for testing or as a skeleton for real topology files.
3 | It takes as input the number of FPGAs, a list of program names, and the output file path.
4 | Programs are assigned to FPGAs in round-robin order, and FPGAs are connected in a bus (ch0 of each FPGA to ch1 of the next).
5 | '''
6 |
7 | import json
8 | import argparse
9 |
10 |
11 | if __name__ == "__main__":
12 |
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("-n", type=int, help=' Number of FPGAs', required=True)
15 | parser.add_argument('-p', nargs='+', help=' List of programs', required=True)
16 | parser.add_argument("-f", type=str, help=' Output file', required=True)
17 | args = vars(parser.parse_args())
18 | n = args["n"]
19 | programs = args["p"]
20 | if n < len(programs):
21 | print("The number of FPGAs must be greater or equal than the number of programs")
22 | exit(-1)
23 | #FPGAs are numbered from 0 to n-1
24 | programs_to_fpga={}
25 | for i in range(0,n):
26 | fpga_name = "fpga-{}:acl0".format(i)
27 | programs_to_fpga[fpga_name]=programs[i%len(programs)]
28 |
29 | #create a bus topology: port 0 is connected to port 1 of the next FPGA
30 | fpga_topology={}
31 | for i in range(0,n-1):
32 | src_name = "fpga-{}:acl0:ch0".format(i)
33 | dst_name = "fpga-{}:acl0:ch1".format(i+1)
34 | fpga_topology[src_name]=dst_name
35 | data = {"fpgas": programs_to_fpga, "connections": fpga_topology}
36 | with open(args["f"], 'w') as f:
37 | json.dump(data, f, indent=4, separators=(',', ': '))
38 |
--------------------------------------------------------------------------------
/codegen/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List, TypeVar
2 |
3 | T = TypeVar('T')
4 |
5 |
6 | def round_robin(values: List[T], index: int, size: int) -> List[T]:
7 | assert size > 0
8 | assert 0 <= index < size
9 |
10 | return values[index::size]
11 |
--------------------------------------------------------------------------------
/examples/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Configuration
2 | set(SMI_STENCIL_SIZE_X 8192 CACHE STRING "Vertical size of domain.")
3 | set(SMI_STENCIL_SIZE_Y 8192 CACHE STRING "Horizontal size of domain.")
4 | set(SMI_DATATYPE float CACHE STRING "Data type of computations.")
5 | set(SMI_VECTORIZATION_WIDTH 16 CACHE STRING "Width for exploiting vector parallelism.")
6 | set(SMI_STENCIL_NUM_PROCS_X 2 CACHE STRING "Number of processes in X dimension.")
7 | set(SMI_STENCIL_NUM_PROCS_Y 4 CACHE STRING "Number of processes in Y dimension.")
8 | math(EXPR SMI_STENCIL_NUM_PROCS "${SMI_STENCIL_NUM_PROCS_X} * ${SMI_STENCIL_NUM_PROCS_Y}")
9 | set(SMI_KMEANS_RANKS 8 CACHE STRING "Number of ranks for K-means.")
10 | set(SMI_KMEANS_DIMS 64 CACHE STRING "Number of dimensions for K-means.")
11 | set(SMI_KMEANS_CLUSTERS 8 CACHE STRING "Number of clusters to compute.")
12 |
13 | if(SMI_DATATYPE STREQUAL "float")
14 | set(SMI_COMM_DATATYPE "SMI_FLOAT")
15 | else()
16 | message(FATAL_ERROR "Unsupported data type \"${SMI_DATATYPE}\".")
17 | endif()
18 |
19 | configure_file(include/stencil.h.in stencil.h)
20 | configure_file(include/kmeans.h.in kmeans.h)
21 | configure_file(include/fblas.h fblas.h)
22 |
23 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
24 | include_directories(${CMAKE_CURRENT_BINARY_DIR})
25 |
26 | find_package(PythonInterp 3)
27 |
28 | if(PythonInterp_FOUND)
29 | #stencil
30 | smi_target(stencil_smi "${CMAKE_CURRENT_SOURCE_DIR}/kernels/stencil_smi.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/stencil_smi.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/stencil_smi.cl" 8)
31 | #kmeans
32 | smi_target(kmeans_smi "${CMAKE_CURRENT_SOURCE_DIR}/kernels/kmeans_smi.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/kmeans_smi.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/kmeans_smi.cl" 8)
33 |
34 | #gesummv
35 | smi_target(gesummv_smi "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gesummv_smi.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/gesummv_smi.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gesummv_rank0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/gesummv_rank1.cl" 2)
36 | target_link_libraries(gesummv_smi_host openblas)
37 |
38 | #onchip versions
39 | fpga_target(gesummv_onchip "${CMAKE_CURRENT_SOURCE_DIR}/host/gesummv_onchip.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gesummv_onchip.cl" OFF)
40 | fpga_target(stencil_onchip "${CMAKE_CURRENT_SOURCE_DIR}/host/stencil_onchip.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/stencil_onchip.cl" ON)
41 |
42 | endif()
43 |
--------------------------------------------------------------------------------
/examples/include/common.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include <fstream>
4 | #include <sstream>
5 | #include <stdexcept>
6 |
7 | constexpr int kChannelsPerRank = 4;
8 |
9 | // Loads the routing table stored at <routing_directory>/<prefix>-rank<rank>-channel<channel> into `table`.
10 | // Reads num_entries * sizeof(DataSize) raw bytes; throws std::runtime_error if the file cannot be opened.
11 | template <typename DataSize>
12 | void LoadRoutingTable(int rank, int channel, int num_entries,
13 |                       const std::string& routing_directory,
14 |                       const std::string& prefix, DataSize* table) {
15 |   std::stringstream path;
16 |   path << routing_directory << "/" << prefix << "-rank" << rank << "-channel" << channel;
17 |   std::ifstream file(path.str(), std::ios::binary);
18 |   if (!file) {
19 |     throw std::runtime_error("Routing table " + path.str() + " not found.");
20 |   }
21 |   // std::istream::read takes a char*, so view the typed table as raw bytes.
22 |   const auto byte_size = num_entries * sizeof(DataSize);
23 |   file.read(reinterpret_cast<char*>(table), byte_size);
24 | }
25 |
--------------------------------------------------------------------------------
/examples/include/fblas.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef COMMONS_CL_HPP
3 | #define COMMONS_CL_HPP
4 |
5 |
6 | /**
7 | * A set of common definitions that must be included in BLAS
8 | * routine kernels.
9 | *
10 | * Must be included after the definition of the DOUBLE_PRECISION macro (if needed)
11 | */
12 |
13 |
14 | #ifdef DOUBLE_PRECISION
15 | #define TYPE_T double
16 | #else
17 | #define TYPE_T float //type of data: float if DOUBLE_PRECISION is undefined, double otherwise
18 | #endif
19 |
20 |
21 | #ifdef DOUBLE_PRECISION
22 | //enable double precision support
23 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
24 |
25 | #ifdef __STRATIX_10__
26 | #define DOUBLE_ADD_LATENCY 28 //double add latency for Stratix10
27 | #endif
28 |
29 | #ifdef __ARRIA_10__
30 | #define DOUBLE_ADD_LATENCY 12 //double add latency for Arria 10
31 | #endif
32 |
33 | #define SHIFT_REG (DOUBLE_ADD_LATENCY+6) //Shift register dimension for double precision operations (additional elements to avoid Fmax problems); parenthesized so it expands safely inside larger expressions
34 | #endif
35 | #endif
36 |
--------------------------------------------------------------------------------
/examples/include/kmeans.h.in:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #pragma OPENCL EXTENSION cl_intel_channels : enable
4 |
5 | #define W ${SMI_VECTORIZATION_WIDTH}
6 | #define K ${SMI_KMEANS_CLUSTERS}
7 | #define DIMS ${SMI_KMEANS_DIMS}
8 | #define DTYPE ${SMI_DATATYPE}
9 | #define ITYPE unsigned short
10 | #define SMI_TYPE ${SMI_COMM_DATATYPE}
11 | #define W ${SMI_VECTORIZATION_WIDTH}
12 | #if W > 1
13 | #define VTYPE ${SMI_DATATYPE}${SMI_VECTORIZATION_WIDTH}
14 | #define IVTYPE ushort${SMI_VECTORIZATION_WIDTH}
15 | #else
16 | #define VTYPE DTYPE
17 | #define IVTYPE ITYPE
18 | #endif
19 | #define SMI_DEVICES_PER_NODE ${SMI_DEVICES_PER_NODE}
20 |
--------------------------------------------------------------------------------
/examples/include/stencil.h.in:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #pragma OPENCL EXTENSION cl_intel_channels : enable
4 |
5 | // Constants (for now)
6 | #define HALO_X 1
7 | #define HALO_Y 1
8 | #define BOUNDARY_VALUE 1
9 |
10 | // CMake config
11 | #define X ${SMI_STENCIL_SIZE_X}
12 | #define Y ${SMI_STENCIL_SIZE_Y}
13 | #define DTYPE ${SMI_DATATYPE}
14 | #define SMI_TYPE ${SMI_COMM_DATATYPE}
15 | #define W ${SMI_VECTORIZATION_WIDTH}
16 | #define B 4 // Number of memory banks
17 | #if W > 1
18 | #define VTYPE ${SMI_DATATYPE}${SMI_VECTORIZATION_WIDTH}
19 | #else
20 | #define VTYPE DTYPE
21 | #endif
22 | #if HALO_Y > 1
23 | #define HTYPE_PASTE(a, b) a ## b
24 | #define HTYPE_EVAL(a, b) HTYPE_PASTE(a, b)
25 | #define HTYPE HTYPE_EVAL(${SMI_DATATYPE}, HALO_Y)
26 | #else
27 | #define HTYPE ${SMI_DATATYPE}
28 | #endif
29 | #if W < HALO_Y
30 | #error "Vectorization width must be greater than or equal to the horizontal halo size."
31 | #endif
32 | #define PX ${SMI_STENCIL_NUM_PROCS_X}
33 | #define PY ${SMI_STENCIL_NUM_PROCS_Y}
34 | #define SMI_DEVICES_PER_NODE ${SMI_DEVICES_PER_NODE}
35 |
36 | // Derived numbers
37 | #define X_LOCAL (X / PX)
38 | #define Y_LOCAL (Y / PY)
39 |
--------------------------------------------------------------------------------
/examples/kernels/gesummv_smi.json:
--------------------------------------------------------------------------------
1 | {
2 | "fpgas": {
3 | "fpga-0006:acl0": "gesummv_rank0",
4 | "fpga-0006:acl1": "gesummv_rank1"
5 | },
6 | "connections": {
7 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3",
8 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2"
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/examples/kernels/kmeans_smi.json:
--------------------------------------------------------------------------------
1 | {
2 | "fpgas": {
3 | "fpga-0006:acl0": "kmeans_smi",
4 | "fpga-0006:acl1": "kmeans_smi",
5 | "fpga-0007:acl0": "kmeans_smi",
6 | "fpga-0007:acl1": "kmeans_smi",
7 | "fpga-0008:acl0": "kmeans_smi",
8 | "fpga-0008:acl1": "kmeans_smi",
9 | "fpga-0009:acl0": "kmeans_smi",
10 | "fpga-0009:acl1": "kmeans_smi"
11 | },
12 | "connections": {
13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3",
14 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2",
15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3",
16 | "fpga-0007:acl0:ch3": "fpga-0007:acl1:ch2",
17 | "fpga-0006:acl0:ch1": "fpga-0007:acl0:ch0",
18 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0",
19 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0",
20 | "fpga-0007:acl1:ch1": "fpga-0008:acl1:ch0",
21 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3",
22 | "fpga-0008:acl0:ch3": "fpga-0008:acl1:ch2",
23 | "fpga-0009:acl0:ch2": "fpga-0009:acl1:ch3",
24 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2",
25 | "fpga-0008:acl0:ch1": "fpga-0009:acl0:ch0",
26 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0",
27 | "fpga-0006:acl0:ch0": "fpga-0009:acl0:ch1",
28 | "fpga-0006:acl1:ch0": "fpga-0009:acl1:ch1"
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/examples/kernels/stencil_onchip.cl.in:
--------------------------------------------------------------------------------
1 | #include "stencil.h"
2 |
3 | channel VTYPE read_stream[PX*PY] __attribute__((depth((Y/W)/PY)));
4 | channel VTYPE write_stream[PX*PY] __attribute__((depth((Y/W)/PY)));
5 | channel VTYPE vert_up[(PX - 1)*PY] __attribute__((depth((Y/W)/PY)));
6 | channel VTYPE vert_down[(PX - 1)*PY] __attribute__((depth((Y/W)/PY)));
7 | channel HTYPE hori_left[PX*(PY - 1)] __attribute__((depth(X/PX)));
8 | channel HTYPE hori_right[PX*(PY - 1)] __attribute__((depth(X/PX)));
9 |
10 | ${code}
11 |
--------------------------------------------------------------------------------
/examples/kernels/stencil_onchip.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import string
4 |
# Command line: the two grid dimensions (px, py), followed by the directory
# holding the *.cl.in templates and the directory receiving the generated file.
cli = argparse.ArgumentParser()
for grid_dim in ("px", "py"):
    cli.add_argument(grid_dim, type=int)
for directory in ("source_dir", "binary_dir"):
    cli.add_argument(directory, type=str)
args = cli.parse_args()


def _load_template(file_name):
    """Read `file_name` from the source directory as a string.Template."""
    with open(os.path.join(args.source_dir, file_name), "r") as in_file:
        return string.Template(in_file.read())


tmpl_main = _load_template("stencil_onchip.cl.in")
tmpl_pe = _load_template("stencil_onchip_pe.cl.in")

# One instantiation of the per-PE template for every (i_px, i_py) pair,
# with i_px as the outer index and i_py as the inner one.
pe_code = [
    tmpl_pe.substitute(
        i_px=i_px, i_py=i_py, suffix="_{}_{}".format(i_px, i_py))
    for i_px in range(args.px)
    for i_py in range(args.py)
]

# The instantiated PEs replace the ${code} placeholder of the main template.
with open(os.path.join(args.binary_dir, "stencil_onchip.cl"),
          "w") as out_file:
    out_file.write(tmpl_main.substitute(code="\n\n".join(pe_code)))
29 |
--------------------------------------------------------------------------------
/examples/kernels/stencil_smi.json:
--------------------------------------------------------------------------------
1 | {
2 | "fpgas": {
3 | "fpga-0006:acl0": "stencil_smi",
4 | "fpga-0006:acl1": "stencil_smi",
5 | "fpga-0007:acl0": "stencil_smi",
6 | "fpga-0007:acl1": "stencil_smi",
7 | "fpga-0008:acl0": "stencil_smi",
8 | "fpga-0008:acl1": "stencil_smi",
9 | "fpga-0009:acl0": "stencil_smi",
10 | "fpga-0009:acl1": "stencil_smi"
11 | },
12 | "connections": {
13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3",
14 | "fpga-0006:acl0:ch3": "fpga-0006:acl1:ch2",
15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3",
16 | "fpga-0007:acl0:ch3": "fpga-0007:acl1:ch2",
17 | "fpga-0006:acl0:ch1": "fpga-0007:acl0:ch0",
18 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0",
19 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0",
20 | "fpga-0007:acl1:ch1": "fpga-0008:acl1:ch0",
21 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3",
22 | "fpga-0008:acl0:ch3": "fpga-0008:acl1:ch2",
23 | "fpga-0009:acl0:ch2": "fpga-0009:acl1:ch3",
24 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2",
25 | "fpga-0008:acl0:ch1": "fpga-0009:acl0:ch0",
26 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0",
27 | "fpga-0006:acl0:ch0": "fpga-0009:acl0:ch1",
28 | "fpga-0006:acl1:ch0": "fpga-0009:acl1:ch1"
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/include/smi.h:
--------------------------------------------------------------------------------
1 | #ifndef SMI_H
2 | #define SMI_H
3 |
4 | #ifdef SMI_REWRITER
5 | #include "opencl-shim.h"
6 | #endif
7 |
8 | #include "smi/header_message.h"
9 | #include "smi/network_message.h"
10 | #include "smi/data_types.h"
11 | #include "smi/channel_descriptor.h"
12 | #include "smi/operation_type.h"
13 | #include "smi/reduce_operations.h"
14 | #include "smi/communicator.h"
15 | #include "smi/push.h"
16 | #include "smi/pop.h"
17 | #include "smi/bcast.h"
18 | #include "smi/reduce.h"
19 | #include "smi/gather.h"
20 | #include "smi/scatter.h"
21 | #endif // SMI_H
22 |
--------------------------------------------------------------------------------
/include/smi/bcast.h:
--------------------------------------------------------------------------------
1 | #ifndef BCAST_H
2 | #define BCAST_H
3 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
4 |
5 | /**
6 | @file bcast.h
7 | This file contains the definition of channel descriptor,
8 | open channel and communication primitive for Broadcast.
9 | */
10 |
11 | #include "data_types.h"
12 | #include "header_message.h"
13 | #include "operation_type.h"
14 | #include "network_message.h"
15 | #include "communicator.h"
16 |
17 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){
18 | SMI_Network_message net; //buffered network message
19 | char root_rank;
20 | char my_rank; //These two are essentially the Communicator
21 | char num_rank;
22 | char port; //Port number
23 | unsigned int message_size; //given in number of data elements
24 | unsigned int processed_elements; //how many data elements we have sent/received
25 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet)
26 | SMI_Datatype data_type; //type of message
27 | SMI_Network_message net_2; //buffered network message: used for the receiving side
28 | char size_of_type; //size of data type
29 | char elements_per_packet; //number of data elements per packet
30 | char packet_element_id_rcv; //used by the receivers
31 | bool init; //true at the beginning, used by the receivers for synchronization
32 | }SMI_BChannel;
33 |
34 | /**
35 | * @brief SMI_Open_bcast_channel opens a broadcast channel
36 | * @param count number of data elements to broadcast
37 | * @param data_type type of the channel
38 | * @param port port number
39 | * @param root rank of the root
40 | * @param comm communicator
41 | * @return the channel descriptor
42 | */
43 | SMI_BChannel SMI_Open_bcast_channel(int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm);
44 |
45 | /**
46 | * @brief SMI_Open_bcast_channel_ad opens a broadcast channel with a given asynchronicity degree
47 | * @param count number of data elements to broadcast
48 | * @param data_type type of the channel
49 | * @param port port number
50 | * @param root rank of the root
51 | * @param comm communicator
52 | * @param asynch_degree the asynchronicity degree in number of data elements
53 | * @return the channel descriptor
54 | */
55 | SMI_BChannel SMI_Open_bcast_channel_ad(int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm, int asynch_degree);
56 |
57 | /**
58 | * @brief SMI_Bcast
59 | * @param chan pointer to the broadcast channel descriptor
60 | * @param data pointer to the data element: on the root rank is the element that will be transmitted,
61 | on the non-root rank will be the received element
62 | */
63 | void SMI_Bcast(SMI_BChannel *chan, void* data);
64 | #endif // BCAST_H
65 |
--------------------------------------------------------------------------------
/include/smi/channel_descriptor.h:
--------------------------------------------------------------------------------
1 | #ifndef CHANNEL_DESCRIPTOR_H
2 | #define CHANNEL_DESCRIPTOR_H
3 | /**
4 | @file channel_descriptor.h
5 | Point-to-point transient channel descriptor.
6 | It maintains all the information necessary to perform a point-to-point communication (Push/Pop).
7 | */
8 |
9 | #include "network_message.h"
10 | #include "operation_type.h"
11 | #include "data_types.h"
12 | #include "communicator.h"
13 | #define MIN(a,b) (((a)<(b))?(a):(b))
14 | #define MAX(a,b) (((a)>(b))?(a):(b))
15 |
16 |
17 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){
18 | SMI_Network_message net; //buffered network message
19 | char sender_rank; //rank of the sender
20 | char receiver_rank; //rank of the receiver
21 | char port; //channel port
22 | unsigned int message_size; //given in number of data elements
23 | unsigned int processed_elements; //how many data elements we have sent/received so far
24 | unsigned int packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet)
25 | SMI_Datatype data_type; //type of message
26 | char op_type; //type of operation
27 | char size_of_type; //size of data type
28 | char elements_per_packet; //number of data elements per packet
29 | volatile unsigned int tokens; //current number of tokens (one tokens allow the sender to transmit one data element)
30 | unsigned int max_tokens; //max tokens on the sender side
31 | }SMI_Channel;
32 |
33 | #endif
34 |
--------------------------------------------------------------------------------
/include/smi/communicator.h:
--------------------------------------------------------------------------------
1 | #ifndef COMMUNICATOR_H
2 | #define COMMUNICATOR_H
3 | /**
4 | @file communicator.h
5 | Describes a basic communicator.
6 | */
7 |
8 | //Note: Since the Intel compiler fails in compiling the emulation if you pass a user-defined
9 | //data type, we had to define it by resorting to OpenCL data types: the first element
10 | //will be "my_rank" and the second the number of ranks
11 | #if defined __HOST_PROGRAM__
12 | typedef cl_char2 SMI_Comm;
13 | #else
14 | typedef char2 SMI_Comm;
15 | /**
16 | * @brief SMI_Comm_size return the communicator size
17 | * @param comm
18 | * @return the communicator size
19 | */
20 | inline int SMI_Comm_size(SMI_Comm comm){
21 | return comm[1];
22 | }
23 |
24 | /**
25 | * @brief SMI_Comm_rank determines the rank of the caller
26 | * @param comm
27 | * @return rank of the caller
28 | */
29 | inline int SMI_Comm_rank(SMI_Comm comm){
30 | return comm[0];
31 | }
32 | #endif
33 | #endif
34 |
--------------------------------------------------------------------------------
/include/smi/data_types.h:
--------------------------------------------------------------------------------
1 |
2 | #ifndef DATA_TYPES_H
3 | #define DATA_TYPES_H
4 |
5 | /**
6 | @file data_types.h
7 | Supported message data types
8 | */
9 |
10 | typedef enum{
11 | SMI_INT = 1,
12 | SMI_FLOAT = 2,
13 | SMI_DOUBLE = 3,
14 | SMI_CHAR = 4,
15 | SMI_SHORT = 5
16 | }SMI_Datatype;
17 |
18 | #endif
19 |
--------------------------------------------------------------------------------
/include/smi/gather.h:
--------------------------------------------------------------------------------
1 | #ifndef GATHER_H
2 | #define GATHER_H
3 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
4 |
5 | /**
6 | @file gather.h
7 | This file contains the channel descriptor, open channel
8 | and communication primitive for gather
9 | */
10 |
11 |
12 | #include "data_types.h"
13 | #include "header_message.h"
14 | #include "operation_type.h"
15 | #include "network_message.h"
16 | #include "communicator.h"
17 |
18 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){
19 | SMI_Network_message net; //buffered network message
20 | int recv_count; //number of data elements that will be received by the root
21 | char port;
22 | int processed_elements_root; //number of elements processed by the root
23 | char packet_element_id_rcv; //used by the receivers
24 | char next_contrib; //the rank of the next contributor
25 | char my_rank;
26 | char num_rank;
27 | char root_rank;
28 | SMI_Network_message net_2; //buffered network message, used by root rank to send synchronization messages
29 | int send_count; //number of elements sent by each non-root ranks
30 | int processed_elements; //how many data elements we have sent (non-root)
31 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet)
32 | char data_type; //type of message
33 | char size_of_type; //size of data type
34 | char elements_per_packet; //number of data elements per packet
35 | }SMI_GatherChannel;
36 |
37 | /**
38 | * @brief SMI_Open_gather_channel opens a gather channel
39 | * @param send_count number of data elements transmitted by each rank
40 | * @param recv_count number of data elements received by root rank (i.e. num_ranks*send_count)
41 | * @param data_type type of the channel
42 | * @param port port number
43 | * @param root rank of the root
44 | * @param comm communicator
45 | * @return the channel descriptor
46 | */
47 | SMI_GatherChannel SMI_Open_gather_channel(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm);
48 |
49 | /**
50 | * @brief SMI_Open_gather_channel_ad opens a gather channel with a given asynchronicity degree
51 | * @param send_count number of data elements transmitted by each rank
52 | * @param recv_count number of data elements received by root rank (i.e. num_ranks*send_count)
53 | * @param data_type type of the channel
54 | * @param port port number
55 | * @param root rank of the root
56 | * @param comm communicator
57 | * @param asynch_degree the asynchronicity degree expressed in number of data elements
58 | * @return the channel descriptor
59 | */
60 | SMI_GatherChannel SMI_Open_gather_channel_ad(int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm, int asynch_degree);
61 |
62 | /**
63 | * @brief SMI_Gather
64 | * @param chan pointer to the gather channel descriptor
65 | * @param send_data pointer to the data element that must be sent
66 | * @param rcv_data pointer to the receiving data element (significant on the root rank only)
67 | */
68 | void SMI_Gather(SMI_GatherChannel *chan, void* send_data, void* rcv_data);
69 |
70 | #endif // GATHER_H
71 |
--------------------------------------------------------------------------------
/include/smi/header_message.h:
--------------------------------------------------------------------------------
1 | /**
2 | Message header definition with macros for accessing it
3 | (for now they just take the corresponding field)
4 | */
5 | #ifndef HEADER_MESSAGE_H
6 | #define HEADER_MESSAGE_H
7 |
/*
 * Accessors for SMI_Message_header.
 *
 * H must be an lvalue of type SMI_Message_header (any expression is accepted,
 * e.g. `*ptr` or `chan->net.header`, because every macro argument is
 * parenthesized).
 *
 * Layout of `elems_and_op`:
 *   bits 7..3  number of valid data elements in the packet (0..31)
 *   bits 2..0  operation type
 */
#define GET_HEADER_SRC(H) ((H).src)
#define GET_HEADER_DST(H) ((H).dst)
#define GET_HEADER_PORT(H) ((H).port)
#define GET_HEADER_OP(H) ((H).elems_and_op & 7)
/* Returns the number of valid data elements in the packet.
 * The field is reinterpreted as unsigned before shifting: `char` may be
 * signed, and shifting a negative value right would sign-extend and yield a
 * negative count for any N >= 16. */
#define GET_HEADER_NUM_ELEMS(H) (((unsigned char)((H).elems_and_op)) >> 3)
#define SET_HEADER_SRC(H,S) ((H).src=(S))
#define SET_HEADER_DST(H,D) ((H).dst=(D))
#define SET_HEADER_PORT(H,P) ((H).port=(P))
/* Replaces the lower 3 bits, keeps the element count (248 == 0xF8). */
#define SET_HEADER_OP(H,O) ((H).elems_and_op=(char)(((H).elems_and_op & 248) | ((O) & 7)))
/* Replaces the upper 5 bits, keeps the operation type. By assumption N < 32;
 * larger values are reduced modulo 32, as the 8-bit field always did. */
#define SET_HEADER_NUM_ELEMS(H,N) ((H).elems_and_op=(char)(((H).elems_and_op & 7) | (((N) & 31) << 3)))


typedef struct __attribute__((packed)) {
    char src;
    char dst;
    char port;
    char elems_and_op;  //upper 5 bits contain the number of valid data elements in the packet
                        //lower 3 bits contain the type of operation

}SMI_Message_header;
28 |
29 | #endif //ifndef HEADER_MESSAGE_H
30 |
--------------------------------------------------------------------------------
/include/smi/operation_type.h:
--------------------------------------------------------------------------------
1 | /**
2 | Definition of the supported communication operations
3 | */
4 |
5 | #ifndef OPERATION_TYPE_H
6 | #define OPERATION_TYPE_H
7 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
8 | /**
9 | Type of operation performed
10 | */
11 | typedef enum{
12 | SMI_SEND = 0,
13 | SMI_RECEIVE = 1,
14 | SMI_BROADCAST = 2,
15 | SMI_SYNCH=3, //special operation type used for synchronization/rendezvou
16 | SMI_SCATTER=4,
17 | SMI_REDUCE=5,
18 | SMI_GATHER=6
19 | }SMI_Operationtype;
20 |
21 | #endif
22 |
--------------------------------------------------------------------------------
/include/smi/pop.h:
--------------------------------------------------------------------------------
1 | /**
2 | Pop from channel
3 | */
4 |
5 | #ifndef POP_H
6 | #define POP_H
7 | #include "channel_descriptor.h"
8 | #include "communicator.h"
9 |
10 |
11 | /**
12 | * @brief SMI_Open_receive_channel opens a receive transient channel
13 | * @param count number of data elements to receive
14 | * @param data_type data type of the data elements
15 | * @param source rank of the sender
16 | * @param port port number
17 | * @param comm communicator
18 | * @return channel descriptor
19 | */
20 | SMI_Channel SMI_Open_receive_channel(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm);
21 |
22 | /**
23 | * @brief SMI_Open_receive_channel_ad opens a receive transient channel with a given asynchronicity degree
24 | * @param count number of data elements to receive
25 | * @param data_type data type of the data elements
26 | * @param source rank of the sender
27 | * @param port port number
28 | * @param comm communicator
29 | * @param asynch_degree the asynchronicity degree expressed in number of data elements
30 | * @return channel descriptor
31 | */
32 | SMI_Channel SMI_Open_receive_channel_ad(int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm, int asynch_degree);
33 |
34 | /**
35 | * @brief SMI_Pop: receive a data element. Returns only when data arrives
36 | * @param chan pointer to the transient channel descriptor
37 | * @param data pointer to the target variable that, on return, will contain the data element
38 | */
39 | void SMI_Pop(SMI_Channel *chan, void *data);
40 |
41 | #endif //ifndef POP_H
42 |
--------------------------------------------------------------------------------
/include/smi/push.h:
--------------------------------------------------------------------------------
1 | /**
2 | Push to channel
3 | */
4 |
5 | #ifndef PUSH_H
6 | #define PUSH_H
7 | #include "channel_descriptor.h"
8 | #include "communicator.h"
9 |
10 | /**
11 | * @brief SMI_Open_send_channel opens a sending transient channel
12 | * @param count number of data elements to send
13 | * @param data_type type of the data element
14 | * @param destination rank of the destination
15 | * @param port port number
16 | * @param comm communicator
17 | * @return channel descriptor
18 | */
19 | SMI_Channel SMI_Open_send_channel(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm);
20 |
21 |
22 | /**
23 | * @brief SMI_Open_send_channel_ad opens a sending transient channel with a given asynchronicity degree
24 | * @param count number of data elements to send
25 | * @param data_type type of the data element
26 | * @param destination rank of the destination
27 | * @param port port number
28 | * @param comm communicator
29 | * @param asynch_degree the asynchronicity degree expressed in number of data elements
30 | * @return channel descriptor
31 | */
32 | SMI_Channel SMI_Open_send_channel_ad(int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm, int asynch_degree);
33 |
34 | /**
35 | * @brief SMI_Push_flush (private function) pushes a data element into the transient channel. The data transfer can be delayed
36 | * @param chan pointer to the channel descriptor of the transient channel
37 | * @param data pointer to the data element to push
38 | * @param immediate if true, the data is sent immediately, without waiting for the completion of the network packet.
39 | * In general, the user should use SMI_Push instead
40 | */
41 | void SMI_Push_flush(SMI_Channel *chan, void* data, int immediate);
42 |
43 | /**
44 | * @brief SMI_Push pushes a data element into the transient channel. The actual data transfer can be delayed
45 | * @param chan pointer to the channel descriptor of the transient channel
46 | * @param data pointer to the data that can be sent
47 | */
48 | void SMI_Push(SMI_Channel *chan, void* data);
49 |
50 | #endif //ifndef PUSH_H
51 |
--------------------------------------------------------------------------------
/include/smi/reduce.h:
--------------------------------------------------------------------------------
1 | #ifndef REDUCE_H
2 | #define REDUCE_H
3 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
4 |
5 | /**
6 | @file reduce.h
7 | This file contains the channel descriptor, open channel, operation types,
8 | and communication primitive for reduce
9 | */
10 |
11 |
12 | #include "data_types.h"
13 | #include "header_message.h"
14 | #include "network_message.h"
15 | #include "operation_type.h"
16 | #include "communicator.h"
17 |
18 | typedef enum{
19 | SMI_ADD = 0,
20 | SMI_MAX = 1,
21 | SMI_MIN = 2
22 | }SMI_Op;
23 |
24 | /**
25 | Channel descriptor for reduce
26 | */
27 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){
28 | SMI_Network_message net; //buffered network message
29 | char port; //Output channel for the bcast, used by the root
30 | char root_rank;
31 | char my_rank; //communicator infos
32 | char num_rank;
33 | unsigned int message_size; //given in number of data elements
34 | unsigned int processed_elements; //how many data elements we have sent/received
35 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet)
36 | SMI_Datatype data_type; //type of message
37 | char size_of_type; //size of data type
38 | char elements_per_packet; //number of data elements per packet
39 | SMI_Network_message net_2; //buffered network message (we need two of them to remove aliasing)
40 | char packet_element_id_rcv; //used by the receivers
41 | char reduce_op; //applied reduce operation
42 | }SMI_RChannel;
43 |
44 |
45 | /**
46 | * @brief SMI_Open_reduce_channel opens a transient reduce channel
47 | * @param count number of data elements to reduce
48 | * @param data_type type of the channel
49 | * @param op applied reduce operation
50 | * @param port port number
51 | * @param root rank of the root
52 | * @param comm communicator
53 | * @return the channel descriptor
54 | */
55 | SMI_RChannel SMI_Open_reduce_channel(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm);
56 |
57 | /**
58 | * @brief SMI_Open_reduce_channel_ad opens a transient reduce channel with a given asynchronicity degree
59 | * @param count number of data elements to reduce
60 | * @param data_type type of the channel
61 | * @param op applied reduce operation
62 | * @param port port number
63 | * @param root rank of the root
64 | * @param comm communicator
65 | * @param asynch_degree the asynchronicity degree expressed in number of data elements
66 | * @return the channel descriptor
67 | */
68 | SMI_RChannel SMI_Open_reduce_channel_ad(int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm, int asynch_degree);
69 |
70 | /**
71 | * @brief SMI_Reduce
72 | * @param chan pointer to the reduce channel descriptor
73 | * @param data_snd pointer to the data element that must be reduced
74 | * @param data_rcv pointer to the receiving data element (root only)
75 | */
76 | void SMI_Reduce(SMI_RChannel *chan, void* data_snd, void* data_rcv);
77 |
78 | #endif // REDUCE_H
79 |
--------------------------------------------------------------------------------
/include/smi/reduce_operations.h:
--------------------------------------------------------------------------------
1 | #ifndef REDUCE_OPERATIONS_H
2 | #define REDUCE_OPERATIONS_H
3 |
4 | #define SMI_OP_ADD(A,B) ((A)+(B))
5 | #define SMI_OP_MIN(A,B) (((A)<(B))?(A):(B))
6 | #define SMI_OP_MAX(A,B) (((A)>(B))?(A):(B))
7 |
8 | #endif // REDUCE_OPERATIONS_H
9 |
--------------------------------------------------------------------------------
/include/smi/scatter.h:
--------------------------------------------------------------------------------
1 | #ifndef SCATTER_H
2 | #define SCATTER_H
3 |
4 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
5 |
6 | /**
7 | @file scatter.h
8 | This file contains the definition of channel descriptor,
9 | open channel and communication primitive for Scatter.
10 | */
11 |
12 | #include "data_types.h"
13 | #include "header_message.h"
14 | #include "operation_type.h"
15 | #include "network_message.h"
16 | #include "communicator.h"
17 |
18 |
19 |
20 | typedef struct __attribute__((packed)) __attribute__((aligned(64))){
21 | SMI_Network_message net; //buffered network message
22 | char port; //port
23 | char root_rank;
24 | char my_rank; //rank of the caller
25 | char num_ranks; //total number of ranks
26 | unsigned int send_count; //given in number of data elements
27 | unsigned int recv_count; //given in number of data elements
28 | unsigned int processed_elements; //how many data elements we have sent/received
29 | char packet_element_id; //given a packet, the id of the element that we are currently processing (from 0 to the data elements per packet)
30 | char data_type; //type of message
31 | SMI_Network_message net_2; //buffered network message (used by non root ranks)
32 | char size_of_type; //size of data type
33 | char elements_per_packet; //number of data elements per packet
34 | char packet_element_id_rcv; //used by the receivers
35 | char next_rcv; //the rank of the next receiver
36 | bool init; //true when the channel is opened, false when synchronization message has been sent
37 | }SMI_ScatterChannel;
38 |
39 | /**
40 | * @brief SMI_Open_scatter_channel opens a transient scatter channel
41 | * @param send_count number of data elements transmitted by root to each rank
42 | * @param recv_count number of data elements received by each rank
43 | * @param data_type type of the channel
44 | * @param port port number
45 | * @param root rank of the root
46 | * @param comm communicator
47 | * @return the channel descriptor
48 | */
49 | SMI_ScatterChannel SMI_Open_scatter_channel(int send_count, int recv_count,
50 | SMI_Datatype data_type, int port, int root, SMI_Comm comm);
51 |
52 | /**
53 | * @brief SMI_Open_scatter_channel_ad opens a transient scatter channel with a given asynchronicity degree
54 | * @param send_count number of data elements transmitted by root to each rank
55 | * @param recv_count number of data elements received by each rank
56 | * @param data_type type of the channel
57 | * @param port port number
58 | * @param root rank of the root
59 | * @param comm communicator
60 | * @param asynch_degree the asynchronicity degree expressed in number of data elements
61 | * @return the channel descriptor
62 | */
63 | SMI_ScatterChannel SMI_Open_scatter_channel_ad(int send_count, int recv_count,
64 | SMI_Datatype data_type, int port, int root, SMI_Comm comm, int asynch_degree);
65 |
66 | /**
67 | * @brief SMI_Scatter
68 | * @param chan pointer to the scatter channel descriptor
69 | * @param data_snd pointer to the data element that must be sent (root only)
70 | * @param data_rcv pointer to the receiving data element
71 | */
72 | void SMI_Scatter(SMI_ScatterChannel *chan, void* data_snd, void* data_rcv);
73 |
74 | #endif // SCATTER_H
75 |
--------------------------------------------------------------------------------
/include/utils/smi_utils.hpp:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | void checkMpiCall(int code, const char* location, int line)
10 | {
11 | if (code != MPI_SUCCESS)
12 | {
13 | char error[256];
14 | int length;
15 | MPI_Error_string(code, error, &length);
16 | std::cerr << "MPI error at " << location << ":" << line << ": " << error << std::endl;
17 | }
18 | }
19 |
20 | #define CHECK_MPI(err) checkMpiCall((err), __FILE__, __LINE__);
21 |
22 | constexpr int kChannelsPerRank = 4;
23 |
/**
 * Loads a binary routing table from disk into a caller-provided buffer.
 *
 * The file is looked up at
 *   <routing_directory>/<prefix>-rank<rank>-channel<channel>
 * and `num_entries * sizeof(DataSize)` bytes are copied verbatim into `table`.
 *
 * @tparam DataSize         type of a single routing table entry
 * @param rank              rank used to build the file name
 * @param channel           channel used to build the file name
 * @param num_entries       number of entries to read
 * @param routing_directory directory containing the routing tables
 * @param prefix            file name prefix
 * @param table             destination buffer with room for `num_entries` entries
 * @throws std::runtime_error if the file cannot be opened or holds fewer
 *                            bytes than requested
 */
template <typename DataSize>
void LoadRoutingTable(int rank, int channel, int num_entries,
                      const std::string& routing_directory,
                      const std::string& prefix, DataSize* table) {
  std::stringstream path;
  path << routing_directory << "/" << prefix << "-rank" << rank << "-channel"
       << channel;
  const std::string file_name = path.str();

  std::ifstream file(file_name, std::ios::binary);
  if (!file) {
    throw std::runtime_error("Routing table " + file_name + " not found.");
  }

  const std::streamsize byte_size =
      static_cast<std::streamsize>(num_entries) *
      static_cast<std::streamsize>(sizeof(DataSize));
  // istream::read works on char*; the cast makes the template usable for any
  // entry type, not only for DataSize == char.
  file.read(reinterpret_cast<char*>(table), byte_size);
  // A short read would leave part of the table uninitialized: report it
  // instead of silently continuing.
  if (file.gcount() != byte_size) {
    throw std::runtime_error("Routing table " + file_name + " is truncated.");
  }
}
40 |
/**
 * Returns a copy of `source` in which the first occurrence of `pattern` is
 * replaced by `replacement`. Later occurrences are left untouched, and the
 * string is returned unchanged when `pattern` does not occur.
 *
 * Declared `inline` because this header has no include guard and the
 * definition would otherwise be emitted once per including translation unit.
 *
 * @param source      string to search (taken by value, modified and returned)
 * @param pattern     substring to look for
 * @param replacement text substituted for the first match
 * @return the resulting string
 */
inline std::string replace(std::string source, const std::string& pattern, const std::string& replacement)
{
    const std::string::size_type pos = source.find(pattern);
    if (pos != std::string::npos)
    {
        source.replace(pos, pattern.length(), replacement);
    }
    return source;
}
50 |
--------------------------------------------------------------------------------
/include/utils/utils.hpp:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_HPP
2 | #define UTILS_HPP
3 |
4 | #include
5 | /**
6 | Timing functions
7 | */
/** Unsigned integer type used for microsecond timestamps. */
typedef unsigned long timestamp_t;

/**
 * Reads the current time with gettimeofday().
 * @return seconds * 10^6 + microseconds, as reported by gettimeofday()
 */
__attribute__((always_inline)) inline timestamp_t current_time_usecs()
{
    struct timeval now;
    gettimeofday(&now, NULL);
    const long usecs = now.tv_sec * 1000000L + now.tv_usec;
    return usecs;
}

/**
 * Reads the CLOCK_REALTIME clock with clock_gettime().
 * @return seconds * 10^9 + nanoseconds, as reported by clock_gettime()
 */
__attribute__((always_inline)) inline long current_time_nsecs()
{
    struct timespec now;
    clock_gettime(CLOCK_REALTIME, &now);
    const long nsecs = now.tv_sec * 1000000000L + now.tv_nsec;
    return nsecs;
}
24 | #endif // UTILS_HPP
25 |
--------------------------------------------------------------------------------
/microbenchmarks/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Configuration of the SMI microbenchmark targets
2 | # Add the current binary directory to the include path.
3 | include_directories(${CMAKE_CURRENT_BINARY_DIR})
4 | # The targets below are only registered when a Python 3 interpreter is found.
5 | find_package(PythonInterp 3)
6 |
7 | if(PythonInterp_FOUND)
8 | # NOTE(review): smi_target() is defined outside this file; its arguments look like <name> <topology json> <host source> <kernel source list> <numeric option(s)> - confirm against its definition.
9 | # SPMD: every target is given a single kernel source file
10 | smi_target(broadcast "${CMAKE_CURRENT_SOURCE_DIR}/kernels/broadcast.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/broadcast_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/broadcast.cl" 8)
11 | smi_target(reduce "${CMAKE_CURRENT_SOURCE_DIR}/kernels/reduce.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/reduce_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/reduce.cl" 8)
12 | smi_target(scatter "${CMAKE_CURRENT_SOURCE_DIR}/kernels/scatter.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/scatter_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/scatter.cl" 8)
13 | smi_target(gather "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gather.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/gather_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gather.cl" 8)
14 | smi_target(multi_collectives "${CMAKE_CURRENT_SOURCE_DIR}/kernels/multi_collectives.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/multi_collectives_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/multi_collectives.cl" 8)
15 |
16 | # MPMD: every target is given two kernel sources (*_0.cl and *_1.cl) as a ;-separated list
17 | smi_target(bandwidth "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/bandwidth_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_1.cl" 8)
18 | smi_target(injection "${CMAKE_CURRENT_SOURCE_DIR}/kernels/injection_rate.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/injection_rate_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/injection_rate_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/injection_rate_1.cl" 8)
19 | smi_target(latency "${CMAKE_CURRENT_SOURCE_DIR}/kernels/latency.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/latency_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/latency_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/latency_1.cl" 8)
20 |
21 | # Eager evaluation: same sources as the bandwidth target, with extra trailing arguments (8 8 OFF) - NOTE(review): confirm their meaning against smi_target()
22 | smi_target(bandwidth_eager "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth.json" "${CMAKE_CURRENT_SOURCE_DIR}/host/bandwidth_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_0.cl;${CMAKE_CURRENT_SOURCE_DIR}/kernels/bandwidth_1.cl" 8 8 8 OFF)
23 |
24 | endif()
25 |
26 |
--------------------------------------------------------------------------------
/microbenchmarks/kernels/bandwidth.json:
--------------------------------------------------------------------------------
1 | {
2 | "fpgas": {
3 | "fpga-0006:acl0": "bandwidth_0",
4 | "fpga-0006:acl1": "bandwidth_1",
5 | "fpga-0007:acl0": "bandwidth_1",
6 | "fpga-0007:acl1": "bandwidth_1",
7 | "fpga-0008:acl0": "bandwidth_1",
8 | "fpga-0008:acl1": "bandwidth_1",
9 | "fpga-0009:acl0": "bandwidth_1",
10 | "fpga-0009:acl1": "bandwidth_1"
11 | },
12 | "connections": {
13 | "fpga-0006:acl0:ch2": "fpga-0006:acl1:ch3",
14 | "fpga-0006:acl1:ch1": "fpga-0007:acl1:ch0",
15 | "fpga-0007:acl0:ch2": "fpga-0007:acl1:ch3",
16 | "fpga-0007:acl0:ch1": "fpga-0008:acl0:ch0",
17 | "fpga-0008:acl0:ch2": "fpga-0008:acl1:ch3",
18 | "fpga-0008:acl1:ch1": "fpga-0009:acl1:ch0",
19 | "fpga-0009:acl0:ch3": "fpga-0009:acl1:ch2"
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/microbenchmarks/kernels/bandwidth_0.cl:
--------------------------------------------------------------------------------
1 | /**
2 | Scaling benchmark: we want to evaluate the bandwidth
3 | achieved between two ranks. The FPGAs are connected in a chain,
4 | so we can choose the distance between the two ranks.
5 |
6 | RANK 0 is the source of the data
7 | */
8 |
9 | #include
10 |
11 | __kernel void app(const int N, const char dest_rank, const SMI_Comm comm)
12 | {
13 |
14 | SMI_Channel chan=SMI_Open_send_channel_ad(N, SMI_DOUBLE, dest_rank, 0, comm, 2048);
15 | const double start=0.1f;
16 | for(int i=0;i
11 |
12 | __kernel void app(__global char *mem, const int N, SMI_Comm comm)
13 | {
14 | SMI_Channel chan=SMI_Open_receive_channel_ad(N, SMI_DOUBLE, 0, 0, comm, 2048);
15 | const double start=0.1f;
16 | char check=1;
17 | for(int i=0;i
8 |
9 | __kernel void app(__global char* mem, const int N, char root,SMI_Comm comm)
10 | {
11 | char check=1;
12 | SMI_BChannel __attribute__((register)) chan= SMI_Open_bcast_channel(N, SMI_FLOAT,0, root,comm);
13 | //SMI_BChannel chan= SMI_Open_bcast_channel(N, SMI_FLOAT,0, root,comm);
14 | for(int i=0;i
7 |
8 |
9 | __kernel void app(const int N, char root, __global char *mem, SMI_Comm comm)
10 | {
11 | SMI_GatherChannel __attribute__((register)) chan= SMI_Open_gather_channel(N,N, SMI_INT,0, root,comm);
12 | int my_rank=SMI_Comm_rank(comm);
13 | int num_ranks=SMI_Comm_size(comm);
14 | const int loop_bound=(my_rank==root)?N*num_ranks:N;
15 | int to_send=(my_rank==root)?0:my_rank*N; //starting number
16 | char check=1;
17 | for(int i=0;i
9 |
10 |
11 |
12 | __kernel void app(const int N, const char dst, SMI_Comm comm)
13 | {
14 | for(int i=0;i
9 |
10 |
11 | __kernel void app(const int N,SMI_Comm comm)
12 | {
13 | int rcv;
14 | for(int i=0;i
17 |
18 |
19 |
20 | __kernel void app(const int N, char dest_rank,SMI_Comm comm)
21 | {
22 | int to_send;
23 | for(int i=0;i
18 |
19 |
20 | __kernel void app(const int N, SMI_Comm comm)
21 | {
22 | int to_send;
23 | for(int i=0;i
16 |
17 |
18 | __kernel void sequential_collectives(const int N, char root, __global volatile char *mem, SMI_Comm comm)
19 | {
20 | unsigned int my_rank=SMI_Comm_rank(comm);
21 | unsigned int num_ranks=SMI_Comm_size(comm);
22 | float start_float=1.1f;
23 | int start_int=1;
24 | char check=1;
25 | //first execute the broadcast
26 | SMI_BChannel __attribute__((register)) bchan_float= SMI_Open_bcast_channel(N, SMI_FLOAT,0, root,comm);
27 | for(int i=0;i
8 |
9 | __kernel void app(const int N, char root, __global volatile char *mem, SMI_Comm comm)
10 | {
11 | unsigned int my_rank=SMI_Comm_rank(comm);
12 | unsigned int num_ranks=SMI_Comm_size(comm);
13 | float exp=(num_ranks*(num_ranks+1))/2;
14 | char check=1;
15 |
16 | SMI_RChannel __attribute__((register)) rchan_float= SMI_Open_reduce_channel(N, SMI_FLOAT, SMI_ADD, 0,root,comm);
17 | for(int i=0;i
8 |
9 |
10 |
11 | __kernel void app(const int N, char root,__global char* mem, SMI_Comm comm)
12 | {
13 |
14 | SMI_ScatterChannel __attribute__((register)) chan= SMI_Open_scatter_channel(N,N, SMI_INT, 0,root,comm);
15 | char check=1;
16 | int num_ranks=SMI_Comm_size(comm);
17 | int my_rank=SMI_Comm_rank(comm);
18 | const int loop_bound=(my_rank==root)?N*num_ranks:N;
19 | const int to_rcv_start=my_rank*N;
20 | for(int i=0;i
4 | #include
5 |
6 | #include
7 | #include
8 |
9 | using namespace clang;
10 | using namespace llvm;
11 |
12 | bool SpecializeCallsConsumer::HandleTopLevelDecl(DeclGroupRef group)
13 | {
14 |     // Run the rewriting visitor over each top-level declaration of the group, in order.
15 |     for (auto it = group.begin(); it != group.end(); ++it)
16 |         visitor.TraverseDecl(*it);
17 |
18 |     return true; // always report success to the caller
19 | }
20 |
21 | bool SpecializeCallsAction::PrepareToExecuteAction(CompilerInstance& compiler)
22 | {   // Configure the compiler instance before the action runs; the two settings are independent.
23 |     compiler.getDiagnostics().setErrorLimit(9999);               // raise the diagnostic error limit
24 |     compiler.getPreprocessorOpts().addMacroDef("SMI_REWRITER"); // macro defined for the parsed sources
25 |     return true;
26 | }
27 | std::unique_ptr<ASTConsumer> SpecializeCallsAction::CreateASTConsumer(
28 |     CompilerInstance& compiler,
29 |     StringRef file)
30 | {   // Bind the shared rewriter to this compilation's sources, then hand it to a new consumer.
31 |     this->rewriter.setSourceMgr(compiler.getSourceManager(), compiler.getLangOpts());
32 |     return std::make_unique<SpecializeCallsConsumer>(this->rewriter);
33 | }
34 | void SpecializeCallsAction::EndSourceFileAction()
35 | { // Called when processing of a source file ends.
36 | this->rewriter.overwriteChangedFiles(); // persist the edits accumulated in the rewriter
37 | }
38 |
--------------------------------------------------------------------------------
/source-rewriter/src/action.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "rewrite.h"
4 |
5 | #include
6 | #include
7 |
8 | class SpecializeCallsConsumer: public clang::ASTConsumer
9 | { // AST consumer that forwards top-level declarations to a RewriteKernelsVisitor.
10 | public:
11 | explicit SpecializeCallsConsumer(clang::Rewriter& rewriter)
12 | : visitor(rewriter)
13 | {
14 | // Nothing else to do: the visitor is constructed from the rewriter above.
15 | }
16 | // Traverses every declaration in the group with the visitor (defined in action.cpp).
17 | bool HandleTopLevelDecl(clang::DeclGroupRef group) override;
18 |
19 | private:
20 | RewriteKernelsVisitor visitor;
21 | };
22 |
23 | class SpecializeCallsAction: public clang::ASTFrontendAction
24 | {   // Frontend action that owns the rewriter, creates the consumer and saves changed files at the end.
25 | public:
26 |     bool PrepareToExecuteAction(clang::CompilerInstance& compiler) override;
27 |     // Creates the SpecializeCallsConsumer that shares this action's rewriter.
28 |     std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(
29 |         clang::CompilerInstance& compiler,
30 |         llvm::StringRef file) override;
31 |     void EndSourceFileAction() override;
32 |
33 | private:
34 |     clang::Rewriter rewriter;
35 | };
36 |
--------------------------------------------------------------------------------
/source-rewriter/src/main.cpp:
--------------------------------------------------------------------------------
1 | #include "action.h"
2 |
3 | #include
4 | #include
5 |
6 | using namespace clang::tooling;
7 |
8 | static llvm::cl::OptionCategory Opt("SMI kernel rewriter");
9 |
10 | int main(int argc, const char** argv)
11 | {   // Parse the common clang-tooling command line and run the rewriting action on every listed source.
12 |     CommonOptionsParser op(argc, argv, Opt);
13 |     ClangTool Tool(op.getCompilations(), op.getSourcePathList());
14 |     // The status returned by ClangTool::run becomes the process exit code.
15 |     return Tool.run(clang::tooling::newFrontendActionFactory<SpecializeCallsAction>().get());
16 | }
17 |
18 | // TODO: matchers
19 | // TODO: run action manually
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/broadcast.cpp:
--------------------------------------------------------------------------------
1 | #include "broadcast.h"
2 | #include "utils.h"
3 |
4 | using namespace clang;
5 |
6 | static OperationMetadata extractBroadcast(CallExpr* channelDecl)
7 | {
8 |     // Argument indices read from the open call: data type = 1, port = 2, optional buffer size = 5.
9 |     const auto port = extractIntArg(channelDecl, 2);
10 |     const auto dataType = extractDataType(channelDecl, 1);
11 |     const auto bufferSize = extractBufferSize(channelDecl, 5);
12 |     return OperationMetadata("broadcast", port, dataType, bufferSize);
13 | }
14 |
15 | OperationMetadata BroadcastExtractor::GetOperationMetadata(CallExpr* callExpr)
16 | {   // The call receives the channel variable; the metadata is read from the call that initialized it.
17 |     return extractBroadcast(extractChannelDecl(callExpr));
18 | }
19 | std::string BroadcastExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
20 | {   // Forward declaration of the renamed broadcast call.
21 |     return "void " + this->RenameCall(callName, metadata) + "(SMI_BChannel* chan, void* data);";
22 | }
23 | std::vector<std::string> BroadcastExtractor::GetFunctionNames()
24 | {   // Function names that invoke this extractor.
25 |     return {"SMI_Bcast"};
26 | }
27 |
28 | OperationMetadata BroadcastChannelExtractor::GetOperationMetadata(CallExpr* callExpr)
29 | { // The visited call is passed straight to extractBroadcast (no channel-variable lookup).
30 | return extractBroadcast(callExpr);
31 | }
32 | std::string BroadcastChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
33 | { // Forward declaration returning SMI_BChannel; the parameter list contains no buffer-size parameter.
34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_BChannel", "int count, SMI_Datatype data_type, int port, int root, SMI_Comm comm");
35 | }
36 | std::string BroadcastChannelExtractor::GetChannelFunctionName()
37 | { // Base name of the channel-open function handled by this extractor.
38 | return "SMI_Open_bcast_channel";
39 | }
40 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/broadcast.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "ops.h"
4 |
5 | class BroadcastExtractor: public OperationExtractor
6 | {   // Extractor for broadcast data calls (see GetFunctionNames in broadcast.cpp).
7 | public:
8 |     OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override;
9 |     std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override;
10 |     std::vector<std::string> GetFunctionNames() override;
11 | };
12 |
13 | class BroadcastChannelExtractor: public ChannelExtractor
14 | { // Extractor for the call that opens a broadcast channel.
15 | public:
16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; // metadata read from the open call itself
17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; // forward declaration of the renamed open call
18 | std::string GetChannelFunctionName() override; // base name of the open function
19 | };
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/gather.cpp:
--------------------------------------------------------------------------------
1 | #include "gather.h"
2 | #include "utils.h"
3 |
4 | using namespace clang;
5 |
6 | static OperationMetadata extractGather(CallExpr* channelDecl)
7 | {
8 |     // Argument indices read from the open call: data type = 2, port = 3, optional buffer size = 6.
9 |     const auto port = extractIntArg(channelDecl, 3);
10 |     const auto dataType = extractDataType(channelDecl, 2);
11 |     const auto bufferSize = extractBufferSize(channelDecl, 6);
12 |     return OperationMetadata("gather", port, dataType, bufferSize);
13 | }
14 |
15 | OperationMetadata GatherExtractor::GetOperationMetadata(CallExpr* callExpr)
16 | {   // The call receives the channel variable; the metadata is read from the call that initialized it.
17 |     return extractGather(extractChannelDecl(callExpr));
18 | }
19 | std::string GatherExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
20 | {   // Forward declaration of the renamed gather call.
21 |     return "void " + this->RenameCall(callName, metadata) + "(SMI_GatherChannel* chan, void* send_data, void* rcv_data);";
22 | }
23 | std::vector<std::string> GatherExtractor::GetFunctionNames()
24 | {   // Function names that invoke this extractor.
25 |     return {"SMI_Gather"};
26 | }
27 |
28 | OperationMetadata GatherChannelExtractor::GetOperationMetadata(CallExpr* callExpr)
29 | { // The visited call is passed straight to extractGather (no channel-variable lookup).
30 | return extractGather(callExpr);
31 | }
32 | std::string GatherChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
33 | { // Forward declaration returning SMI_GatherChannel; the parameter list contains no buffer-size parameter.
34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_GatherChannel", "int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm");
35 | }
36 | std::string GatherChannelExtractor::GetChannelFunctionName()
37 | { // Base name of the channel-open function handled by this extractor.
38 | return "SMI_Open_gather_channel";
39 | }
40 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/gather.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "ops.h"
4 |
5 | class GatherExtractor: public OperationExtractor
6 | {   // Extractor for gather data calls (see GetFunctionNames in gather.cpp).
7 | public:
8 |     OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override;
9 |     std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override;
10 |     std::vector<std::string> GetFunctionNames() override;
11 | };
12 |
13 | class GatherChannelExtractor: public ChannelExtractor
14 | { // Extractor for the call that opens a gather channel.
15 | public:
16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; // metadata read from the open call itself
17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; // forward declaration of the renamed open call
18 | std::string GetChannelFunctionName() override; // base name of the open function
19 | };
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/ops.cpp:
--------------------------------------------------------------------------------
1 | #include "ops.h"
2 | #include "utils.h"
3 |
4 | #include "../third-party/json.hpp"
5 |
6 | #include
7 |
8 | using json = nlohmann::json;
9 | using namespace clang;
10 |
11 | std::string OperationExtractor::RenameCall(const std::string& callName, const OperationMetadata& metadata)
12 | { // Default renaming: delegate to renamePortDataType with the call name and its metadata.
13 | return renamePortDataType(callName, metadata);
14 | }
15 |
16 | OperationMetadata OperationExtractor::ModifyCall(clang::Rewriter& rewriter, clang::CallExpr& callExpr, const std::string& callName)
17 | {   // Replace the text at the call's begin location with the renamed call and return the metadata used.
18 |     OperationMetadata metadata = GetOperationMetadata(&callExpr);
19 |     const std::string specializedName = RenameCall(callName, metadata);
20 |     rewriter.ReplaceText(callExpr.getBeginLoc(), specializedName);
21 |     return metadata;
22 | }
23 |
24 | void ChannelExtractor::OutputMetadata(const OperationMetadata& metadata, std::ostream& os)
25 | {
26 |     // Serialize the metadata as one JSON object followed by a newline.
27 |     json record;
28 |     record["type"] = metadata.operation;
29 |     record["port"] = metadata.port;
30 |     record["data_type"] = formatDataType(metadata.dataType);
31 |     record["args"] = metadata.args;
32 |
33 |     // buffer_size stays null unless the metadata carries an explicit buffer size.
34 |     record["buffer_size"] = nullptr;
35 |     if (metadata.isBufferSizeSet())
36 |     {
37 |         record["buffer_size"] = metadata.bufferSize;
38 |     }
39 |     os << record.dump() << std::endl;
40 | }
41 |
42 | // https://stackoverflow.com/a/874160/1107768
43 | static bool ends(const std::string& str, const std::string& end)
44 | {
45 |     // A suffix longer than the string itself can never match.
46 |     if (str.length() < end.length()) return false;
47 |
48 |     const auto suffixStart = str.length() - end.length();
49 |     return str.compare(suffixStart, end.length(), end) == 0;
50 | }
51 | static bool isExtendedChannelOpen(const std::string& callName)
52 | { // A call name denotes the extended open variant when it ends with "_ad".
53 | return ends(callName, "_ad");
54 | }
55 |
56 | std::string ChannelExtractor::CreateChannelDeclaration(const std::string& callName, const OperationMetadata& metadata,
57 |     const std::string& returnType, const std::string& parameters)
58 | {
59 |     // Produces "<returnType> <renamed call>(<parameters>);".
60 |     const std::string renamed = this->RenameCall(callName, metadata);
61 |
62 |     return returnType + " " + renamed + "(" + parameters + ");";
63 | }
64 |
65 | std::string ChannelExtractor::RenameCall(const std::string& callName, const OperationMetadata& metadata)
66 | {
67 |     // The "_ad" suffix (3 characters) is dropped before the generic port/data-type renaming.
68 |     if (!isExtendedChannelOpen(callName))
69 |     {
70 |         return OperationExtractor::RenameCall(callName, metadata);
71 |     }
72 |     return OperationExtractor::RenameCall(callName.substr(0, callName.size() - 3), metadata);
73 | }
74 |
75 | std::vector<std::string> ChannelExtractor::GetFunctionNames()
76 | {   // Both the plain open function and its "_ad" variant invoke this extractor.
77 |     auto name = this->GetChannelFunctionName();
78 |     return {name, name + "_ad"};
79 | }
80 |
81 | OperationMetadata ChannelExtractor::ModifyCall(Rewriter& rewriter, CallExpr& callExpr, const std::string& callName)
82 | { // Renames the call like the base class and, for "_ad" calls, also removes the last argument from the source text.
83 | auto metadata = OperationExtractor::ModifyCall(rewriter, callExpr, callName);
84 | if (isExtendedChannelOpen(callName) && callExpr.getNumArgs() >= 2) // need a previous argument to anchor the removal
85 | {
86 | auto lastArg = callExpr.getArgs()[callExpr.getNumArgs() - 1];
87 | auto previousArg = callExpr.getArgs()[callExpr.getNumArgs() - 2];
88 | auto end0 = Lexer::getLocForEndOfToken(previousArg->getEndLoc(), 0, rewriter.getSourceMgr(), rewriter.getLangOpts()); // just past the second-to-last argument
89 | auto end1 = Lexer::getLocForEndOfToken(lastArg->getEndLoc(), 0, rewriter.getSourceMgr(), rewriter.getLangOpts()); // just past the last argument
90 | rewriter.RemoveText(CharSourceRange::getCharRange(end0, end1)); // removes the text between the two positions, i.e. the separator and the last argument
91 | }
92 | return metadata;
93 | }
94 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/ops.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 | #include
6 | #include
7 |
8 | #include
9 | #include
10 |
11 | enum class DataType { // data type recorded in OperationMetadata
12 | Char,
13 | Short,
14 | Int, // default data type of the OperationMetadata constructor
15 | Float,
16 | Double
17 | };
18 |
19 | class OperationMetadata
20 | {   // Value type describing one SMI operation: its name, port, data type, buffer size and extra arguments.
21 | public:
22 |     OperationMetadata(std::string operation,
23 |             size_t port,
24 |             DataType dataType = DataType::Int,
25 |             int bufferSize = -1,
26 |             std::unordered_map<std::string, std::string> args = {})
27 |             : operation(std::move(operation)), port(port), dataType(dataType), bufferSize(bufferSize), args(std::move(args))
28 |     {
29 |
30 |     }
31 |     // A buffer size of -1 means "not set".
32 |     bool isBufferSizeSet() const
33 |     {
34 |         return this->bufferSize != -1;
35 |     }
36 |
37 |     std::string operation;
38 |     size_t port;
39 |     DataType dataType;
40 |     int bufferSize;
41 |     std::unordered_map<std::string, std::string> args; // additional named arguments, e.g. "op_type"
42 | };
43 |
44 | class OperationExtractor
45 | {   // Interface for recognizing one kind of SMI call, extracting its metadata and renaming it.
46 | public:
47 |     virtual ~OperationExtractor() = default;
48 |
49 |     /**
50 |      * Extract metadata from a call expression.
51 |      * The metadata should contain the name of the operation, the used logical port and potentially other arguments
52 |      * like data type or reduce operation.
53 |      */
54 |     virtual OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) = 0;
55 |
56 |     /**
57 |      * Rename a function call given the extracted metadata.
58 |      * For example for callName="SMI_Push" and Metadata with port 0 and data type int, it should return "SMI_Push_0_int".
59 |      */
60 |     virtual std::string RenameCall(const std::string& callName, const OperationMetadata& metadata);
61 |
62 |     /**
63 |      * Forward declare a renamed function call.
64 |      * For example if RenameCall returned "SMI_Push_0_int" for a given metadata, this function should return
65 |      * "void SMI_Push_0_int(SMI_Channel* chan, void* data);" for the same metadata.
66 |      */
67 |     virtual std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) = 0;
68 |
69 |     /**
70 |      * Outputs the serialized metadata to the given stream. The default implementation writes nothing.
71 |      */
72 |     virtual void OutputMetadata(const OperationMetadata& metadata, std::ostream& os)
73 |     {
74 |
75 |     }
76 |
77 |     /**
78 |      * Returns the function names that should invoke this extractor.
79 |      */
80 |     virtual std::vector<std::string> GetFunctionNames() = 0;
81 |
82 |     /**
83 |      * Rewrite the function call if necessary. Returns the metadata extracted from the call.
84 |      */
85 |     virtual OperationMetadata ModifyCall(clang::Rewriter& rewriter, clang::CallExpr& callExpr, const std::string& callName);
86 | };
87 |
88 | class ChannelExtractor: public OperationExtractor
89 | {   // Base class for extractors of channel-open calls, including their "_ad" variants.
90 | public:
91 |     std::string RenameCall(const std::string& callName, const OperationMetadata& metadata) override;
92 |
93 |     /**
94 |      * Outputs the serialized metadata to the given stream.
95 |      */
96 |     void OutputMetadata(const OperationMetadata& metadata, std::ostream& os) override;
97 |     std::vector<std::string> GetFunctionNames() final;
98 |
99 |     OperationMetadata ModifyCall(clang::Rewriter& rewriter, clang::CallExpr& callExpr, const std::string& callName) override;
100 |     // Base name of the open function; GetFunctionNames() also registers it with an "_ad" suffix.
101 |     virtual std::string GetChannelFunctionName() = 0;
102 |
103 | protected:
104 |     std::string CreateChannelDeclaration(
105 |             const std::string& callName,
106 |             const OperationMetadata& metadata,
107 |             const std::string& returnType,
108 |             const std::string& parameters);
109 | };
110 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/pop.cpp:
--------------------------------------------------------------------------------
1 | #include "pop.h"
2 | #include "utils.h"
3 |
4 | using namespace clang;
5 |
6 | static OperationMetadata extractPop(CallExpr* channelDecl)
7 | {
8 |     // Argument indices read from the open call: data type = 1, port = 3, optional buffer size = 5.
9 |     const auto port = extractIntArg(channelDecl, 3);
10 |     const auto dataType = extractDataType(channelDecl, 1);
11 |     const auto bufferSize = extractBufferSize(channelDecl, 5);
12 |     return OperationMetadata("pop", port, dataType, bufferSize);
13 | }
14 |
15 | OperationMetadata PopExtractor::GetOperationMetadata(CallExpr* callExpr)
16 | {   // The call receives the channel variable; the metadata is read from the call that initialized it.
17 |     return extractPop(extractChannelDecl(callExpr));
18 | }
19 | std::string PopExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
20 | {   // Forward declaration of the renamed pop call.
21 |     return "void " + this->RenameCall(callName, metadata) + "(SMI_Channel* chan, void* data);";
22 | }
23 | std::vector<std::string> PopExtractor::GetFunctionNames()
24 | {   // Function names that invoke this extractor.
25 |     return {"SMI_Pop"};
26 | }
27 |
28 | OperationMetadata PopChannelExtractor::GetOperationMetadata(CallExpr* callExpr)
29 | { // The visited call is passed straight to extractPop (no channel-variable lookup).
30 | return extractPop(callExpr);
31 | }
32 | std::string PopChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
33 | { // Forward declaration returning SMI_Channel; the parameter list contains no buffer-size parameter.
34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_Channel", "int count, SMI_Datatype data_type, int source, int port, SMI_Comm comm");
35 | }
36 | std::string PopChannelExtractor::GetChannelFunctionName()
37 | { // Base name of the channel-open function handled by this extractor.
38 | return "SMI_Open_receive_channel";
39 | }
40 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/pop.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "ops.h"
4 |
5 | class PopExtractor: public OperationExtractor
6 | {   // Extractor for pop data calls (see GetFunctionNames in pop.cpp).
7 | public:
8 |     OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override;
9 |     std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override;
10 |     std::vector<std::string> GetFunctionNames() override;
11 | };
12 |
13 | class PopChannelExtractor: public ChannelExtractor
14 | { // Extractor for the call that opens a receive channel.
15 | public:
16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; // metadata read from the open call itself
17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; // forward declaration of the renamed open call
18 | std::string GetChannelFunctionName() override; // base name of the open function
19 | };
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/push.cpp:
--------------------------------------------------------------------------------
1 | #include "push.h"
2 | #include "utils.h"
3 |
4 | using namespace clang;
5 |
6 | static OperationMetadata extractPush(CallExpr* channelDecl)
7 | {
8 |     // Argument indices read from the open call: data type = 1, port = 3, optional buffer size = 5.
9 |     const auto port = extractIntArg(channelDecl, 3);
10 |     const auto dataType = extractDataType(channelDecl, 1);
11 |     const auto bufferSize = extractBufferSize(channelDecl, 5);
12 |     return OperationMetadata("push", port, dataType, bufferSize);
13 | }
14 |
15 | OperationMetadata PushExtractor::GetOperationMetadata(CallExpr* callExpr)
16 | {   // The call receives the channel variable; the metadata is read from the call that initialized it.
17 |     return extractPush(extractChannelDecl(callExpr));
18 | }
19 | std::string PushExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
20 | {   // Forward declaration of the renamed push call; the flush variant takes an extra "immediate" flag.
21 |     std::string args = "(SMI_Channel* chan, void* data";
22 |     if (callName == "SMI_Push_flush")
23 |     {
24 |         args += ", int immediate";
25 |     }
26 |     // NOTE(review): GetFunctionNames() below does not list "SMI_Push_flush", so the branch above looks unreachable - confirm intent.
27 |     return "void " + this->RenameCall(callName, metadata) + args + ");";
28 | }
29 | std::vector<std::string> PushExtractor::GetFunctionNames()
30 | {   // Function names that invoke this extractor.
31 |     return {"SMI_Push"};
32 | }
33 |
34 | OperationMetadata PushChannelExtractor::GetOperationMetadata(CallExpr* callExpr)
35 | { // The visited call is passed straight to extractPush (no channel-variable lookup).
36 | return extractPush(callExpr);
37 | }
38 | std::string PushChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
39 | { // Forward declaration returning SMI_Channel; the parameter list contains no buffer-size parameter.
40 | return this->CreateChannelDeclaration(callName, metadata, "SMI_Channel", "int count, SMI_Datatype data_type, int destination, int port, SMI_Comm comm");
41 | }
42 | std::string PushChannelExtractor::GetChannelFunctionName()
43 | { // Base name of the channel-open function handled by this extractor.
44 | return "SMI_Open_send_channel";
45 | }
46 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/push.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "ops.h"
4 |
5 | class PushExtractor: public OperationExtractor
6 | {   // Extractor for push data calls (see GetFunctionNames in push.cpp).
7 | public:
8 |     OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override;
9 |     std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override;
10 |     std::vector<std::string> GetFunctionNames() override;
11 | };
12 |
13 | class PushChannelExtractor: public ChannelExtractor
14 | { // Extractor for the call that opens a send channel.
15 | public:
16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; // metadata read from the open call itself
17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; // forward declaration of the renamed open call
18 | std::string GetChannelFunctionName() override; // base name of the open function
19 | };
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/reduce.cpp:
--------------------------------------------------------------------------------
1 | #include "reduce.h"
2 | #include "utils.h"
3 |
4 | using namespace clang;
5 |
6 | static std::string formatReduceOp(int op)
7 | {
8 |     // Names indexed by the numeric value of the reduce operation (0, 1, 2).
9 |     static const char* const names[] = {"add", "max", "min"};
10 |     if (op >= 0 && op <= 2)
11 |     {
12 |         return names[op];
13 |     }
14 |
15 |     assert(false); // any other value is not a known reduce operation
16 |     return "";
17 | }
18 |
19 | static OperationMetadata extractReduce(CallExpr* channelDecl)
20 | {
21 |     // Argument indices read from the open call: data type = 1, op = 2, port = 3, optional buffer size = 6.
22 |     const auto port = extractIntArg(channelDecl, 3);
23 |     const auto dataType = extractDataType(channelDecl, 1);
24 |     const auto bufferSize = extractBufferSize(channelDecl, 6);
25 |     const auto opName = formatReduceOp(extractIntArg(channelDecl, 2));
26 |     return OperationMetadata("reduce", port, dataType, bufferSize, { {"op_type", opName} });
27 | }
28 |
29 | OperationMetadata ReduceExtractor::GetOperationMetadata(CallExpr* callExpr)
30 | {   // The call receives the channel variable; the metadata is read from the call that initialized it.
31 |     return extractReduce(extractChannelDecl(callExpr));
32 | }
33 | std::string ReduceExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
34 | {   // Forward declaration of the renamed reduce call.
35 |     return "void " + this->RenameCall(callName, metadata) + "(SMI_RChannel* chan, void* data_snd, void* data_rcv);";
36 | }
37 | std::vector<std::string> ReduceExtractor::GetFunctionNames()
38 | {   // Function names that invoke this extractor.
39 |     return {"SMI_Reduce"};
40 | }
41 |
42 | OperationMetadata ReduceChannelExtractor::GetOperationMetadata(CallExpr* callExpr)
43 | { // The visited call is passed straight to extractReduce (no channel-variable lookup).
44 | return extractReduce(callExpr);
45 | }
46 | std::string ReduceChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
47 | { // Forward declaration returning SMI_RChannel; the parameter list contains no buffer-size parameter.
48 | return this->CreateChannelDeclaration(callName, metadata, "SMI_RChannel", "int count, SMI_Datatype data_type, SMI_Op op, int port, int root, SMI_Comm comm");
49 | }
50 | std::string ReduceChannelExtractor::GetChannelFunctionName()
51 | { // Base name of the channel-open function handled by this extractor.
52 | return "SMI_Open_reduce_channel";
53 | }
54 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/reduce.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "ops.h"
4 |
5 | class ReduceExtractor: public OperationExtractor
6 | {   // Extractor for reduce data calls (see GetFunctionNames in reduce.cpp).
7 | public:
8 |     OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override;
9 |     std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override;
10 |     std::vector<std::string> GetFunctionNames() override;
11 | };
12 |
13 | class ReduceChannelExtractor: public ChannelExtractor
14 | { // Extractor for the call that opens a reduce channel.
15 | public:
16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; // metadata read from the open call itself
17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; // forward declaration of the renamed open call
18 | std::string GetChannelFunctionName() override; // base name of the open function
19 | };
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/scatter.cpp:
--------------------------------------------------------------------------------
1 | #include "scatter.h"
2 | #include "utils.h"
3 |
4 | using namespace clang;
5 |
6 | static OperationMetadata extractScatter(CallExpr* channelDecl)
7 | {
8 |     // Argument indices read from the open call: data type = 2, port = 3, optional buffer size = 6.
9 |     const auto port = extractIntArg(channelDecl, 3);
10 |     const auto dataType = extractDataType(channelDecl, 2);
11 |     const auto bufferSize = extractBufferSize(channelDecl, 6);
12 |     return OperationMetadata("scatter", port, dataType, bufferSize);
13 | }
14 |
15 | OperationMetadata ScatterExtractor::GetOperationMetadata(CallExpr* callExpr)
16 | {   // The call receives the channel variable; the metadata is read from the call that initialized it.
17 |     return extractScatter(extractChannelDecl(callExpr));
18 | }
19 | std::string ScatterExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
20 | {   // Forward declaration of the renamed scatter call.
21 |     return "void " + this->RenameCall(callName, metadata) + "(SMI_ScatterChannel* chan, void* data_snd, void* data_rcv);";
22 | }
23 | std::vector<std::string> ScatterExtractor::GetFunctionNames()
24 | {   // Function names that invoke this extractor.
25 |     return {"SMI_Scatter"};
26 | }
27 |
28 | OperationMetadata ScatterChannelExtractor::GetOperationMetadata(CallExpr* callExpr)
29 | { // The visited call is passed straight to extractScatter (no channel-variable lookup).
30 | return extractScatter(callExpr);
31 | }
32 | std::string ScatterChannelExtractor::CreateDeclaration(const std::string& callName, const OperationMetadata& metadata)
33 | { // Forward declaration returning SMI_ScatterChannel; the parameter list contains no buffer-size parameter.
34 | return this->CreateChannelDeclaration(callName, metadata, "SMI_ScatterChannel", "int send_count, int recv_count, SMI_Datatype data_type, int port, int root, SMI_Comm comm");
35 | }
36 | std::string ScatterChannelExtractor::GetChannelFunctionName()
37 | { // Base name of the channel-open function handled by this extractor.
38 | return "SMI_Open_scatter_channel";
39 | }
40 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/scatter.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "ops.h"
4 |
5 | class ScatterExtractor: public OperationExtractor
6 | {   // Extractor for scatter data calls (see GetFunctionNames in scatter.cpp).
7 | public:
8 |     OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override;
9 |     std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override;
10 |     std::vector<std::string> GetFunctionNames() override;
11 | };
12 |
13 | class ScatterChannelExtractor: public ChannelExtractor
14 | { // Extractor for the call that opens a scatter channel.
15 | public:
16 | OperationMetadata GetOperationMetadata(clang::CallExpr* callExpr) override; // metadata read from the open call itself
17 | std::string CreateDeclaration(const std::string& callName, const OperationMetadata& metadata) override; // forward declaration of the renamed open call
18 | std::string GetChannelFunctionName() override; // base name of the open function
19 | };
20 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/utils.cpp:
--------------------------------------------------------------------------------
1 | #include "utils.h"
2 |
3 | using namespace clang;
4 |
5 | bool FindIntegerLiteral::VisitIntegerLiteral(IntegerLiteral* literal)
6 | { // Records the literal's value (zero-extended) as the result of the search.
7 | this->setValue(literal->getValue().getZExtValue());
8 | return false; // returning false stops the traversal after the first literal
9 | }
10 | bool FindIntegerLiteral::VisitDeclRefExpr(DeclRefExpr* expr)
11 | {   // Resolves a referenced name to an integer: enum constants directly, variables via their declaration.
12 |     auto decl = expr->getDecl();
13 |     if (auto enumeration = dyn_cast<EnumConstantDecl>(decl))
14 |     {
15 |         this->setValue(enumeration->getInitVal().getZExtValue());
16 |     }
17 |     else if (auto varDecl = dyn_cast<VarDecl>(decl))
18 |     {
19 |         FindIntegerLiteral visitor; // separate visitor, so a failed lookup leaves this one untouched
20 |         visitor.TraverseDecl(varDecl);
21 |         if (visitor.valueFound)
22 |         {
23 |             this->setValue(visitor.value); // same checked path as the other branches
24 |         }
25 |         // otherwise valueFound stays false and getValue() will assert
26 |     }
27 |     return false; // stop traversing whether or not a value was found
28 | }
29 | size_t FindIntegerLiteral::getValue() const
30 | { // Returns the recorded value.
31 | assert(this->valueFound); // calling this before a value was recorded is a programming error
32 | return this->value;
33 | }
34 | void FindIntegerLiteral::setValue(size_t value)
35 | { // Records the value and marks it as found.
36 | assert(!this->valueFound); // a value may only be recorded once per visitor
37 | this->valueFound = true;
38 | this->value = value;
39 | }
40 |
41 | size_t extractIntArg(CallExpr* expr, int argumentIndex)
42 | {
43 |     FindIntegerLiteral finder;
44 |
45 |     // Search the argument's expression tree; getValue() asserts that a value was found.
46 |     finder.TraverseStmt(expr->getArgs()[argumentIndex]);
47 |     return finder.getValue();
48 | }
49 |
50 | std::string formatDataType(DataType dataType)
51 | {
52 |     // Map each enumerator to its lower-case type name;
53 |     // exactly one comparison can match, so the order is irrelevant.
54 |     if (dataType == DataType::Char) return "char";
55 |     if (dataType == DataType::Short) return "short";
56 |     if (dataType == DataType::Int) return "int";
57 |     if (dataType == DataType::Float) return "float";
58 |     if (dataType == DataType::Double) return "double";
59 |
60 |     // Not one of the known enumerators.
61 |     assert(false);
62 |     return "";
63 | }
64 |
65 | std::string renamePortDataType(const std::string& callName, const OperationMetadata& metadata)
66 | {   // Result has the form <callName>_<port>_<data type name>.
67 |     const std::string portSuffix = "_" + std::to_string(metadata.port);
68 |     return callName + portSuffix + "_" + formatDataType(metadata.dataType);
69 | }
70 |
71 | DataType extractDataType(CallExpr* expr, int argumentIndex)
72 | {
73 |     // The argument evaluates to a numeric data-type code; valid codes are 1..5.
74 |     const size_t code = extractIntArg(expr, argumentIndex);
75 |     assert(code >= 1 && code <= 5);
76 |
77 |     if (code == 1) return DataType::Int;
78 |     if (code == 2) return DataType::Float;
79 |     if (code == 3) return DataType::Double;
80 |     if (code == 4) return DataType::Char;
81 |     if (code == 5) return DataType::Short;
82 |
83 |     // Only reachable with an invalid code when asserts are compiled out;
84 |     // Int is then returned as the fallback.
85 |     assert(false);
86 |     return DataType::Int;
87 | }
88 |
89 | class FindVarDecl: public RecursiveASTVisitor<FindVarDecl>
90 | {   // Finds the variable declaration that an expression refers to.
91 | public:
92 |     bool VisitVarDecl(VarDecl* decl)
93 |     {
94 |         assert(!this->decl); // only a single declaration is expected
95 |         this->decl = decl;
96 |
97 |         return false; // abort the traversal of this declaration
98 |     }
99 |     bool VisitDeclRefExpr(DeclRefExpr* expr)
100 |     {
101 |         this->TraverseDecl(expr->getDecl()); // follow the reference to its declaration
102 |         return true;
103 |     }
104 |
105 |     VarDecl* decl = nullptr;
106 | };
107 |
108 | CallExpr* extractChannelDecl(CallExpr* expr)
109 | {   // Returns the call that initialises the variable referenced by argument 0 of `expr`.
110 |     assert(expr->getNumArgs() > 0);
111 |     FindVarDecl visitor;
112 |     visitor.TraverseStmt(expr->getArgs()[0]);
113 |     assert(visitor.decl);
114 |     // getInit() is null for a declaration without initializer, and dyn_cast rejects null.
115 |     auto init = visitor.decl->getInit();
116 |     auto callExpr = init ? dyn_cast<CallExpr>(init) : nullptr;
117 |     assert(callExpr);   // the initializer must be a call expression
118 |     return callExpr;
119 | }
120 |
121 | size_t extractBufferSize(CallExpr* callExpr, int argumentIndex)
122 | {   // Optional argument: yields -1 converted to size_t when the call has no such argument.
123 |     const bool present = argumentIndex < callExpr->getNumArgs();
124 |     return present ? extractIntArg(callExpr, argumentIndex) : -1;
125 | }
126 |
--------------------------------------------------------------------------------
/source-rewriter/src/ops/utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | #include "ops.h"
7 |
8 | class FindIntegerLiteral: public clang::RecursiveASTVisitor<FindIntegerLiteral>
9 | {   // AST visitor that records a single integer value found during traversal.
10 | public:
11 |     bool VisitIntegerLiteral(clang::IntegerLiteral* literal);
12 |     bool VisitDeclRefExpr(clang::DeclRefExpr* expr);
13 |
14 |     size_t getValue() const;   // asserts that a value has been found
15 |
16 | private:
17 |     void setValue(size_t value);   // asserts that no value was stored before
18 |
19 |     bool valueFound = false;
20 |     size_t value = 0;   // initialised so getValue() never reads an indeterminate value when asserts are off
21 | };
22 |
23 | std::string formatDataType(DataType dataType);
24 |
25 | std::string renamePortDataType(const std::string& callName, const OperationMetadata& metadata);
26 |
27 | size_t extractIntArg(clang::CallExpr* expr, int argumentIndex);
28 | size_t extractBufferSize(clang::CallExpr* expr, int argumentIndex);
29 | DataType extractDataType(clang::CallExpr* expr, int argumentIndex);
30 | clang::CallExpr* extractChannelDecl(clang::CallExpr* expr);
31 |
--------------------------------------------------------------------------------
/source-rewriter/src/rewrite.cpp:
--------------------------------------------------------------------------------
1 | #include "rewrite.h"
2 | #include "utils.h"
3 |
4 | #include "ops/ops.h"
5 | #include "ops/push.h"
6 | #include "ops/pop.h"
7 | #include "ops/broadcast.h"
8 | #include "ops/scatter.h"
9 | #include "ops/gather.h"
10 | #include "ops/reduce.h"
11 |
12 | #include
13 |
14 | using namespace clang;
15 |
16 | struct Rewrite
17 | {   // One rewritten call: its metadata, the callee name and the extractor that handled it.
18 | public:
19 |     Rewrite(OperationMetadata metadata, std::string callName, OperationExtractor* extractor)
20 |         : metadata(std::move(metadata)),
21 |           callName(std::move(callName)),
22 |           extractor(extractor)
23 |     {}
24 |
25 |     OperationMetadata metadata;
26 |     std::string callName;
27 |     OperationExtractor* extractor;   // raw pointer; this struct never deletes it
28 | };
29 |
30 | class RewriteOpsVisitor: public RecursiveASTVisitor // NOTE(review): the CRTP template argument looks lost in this listing - confirm
31 | { // Visits call expressions, lets the matching extractor modify them and records each rewrite.
32 | public:
33 | explicit RewriteOpsVisitor(Rewriter& rewriter) : rewriter(rewriter)
34 | { // Registers the extractors, then indexes them by the function names they report.
35 | this->extractors.push_back(std::make_unique()); // NOTE(review): the extractor type arguments are missing in this listing
36 | this->extractors.push_back(std::make_unique());
37 | this->extractors.push_back(std::make_unique());
38 | this->extractors.push_back(std::make_unique());
39 | this->extractors.push_back(std::make_unique());
40 | this->extractors.push_back(std::make_unique());
41 | this->extractors.push_back(std::make_unique());
42 | this->extractors.push_back(std::make_unique());
43 | this->extractors.push_back(std::make_unique());
44 | this->extractors.push_back(std::make_unique());
45 | this->extractors.push_back(std::make_unique());
46 | this->extractors.push_back(std::make_unique());
47 |
48 | for (auto& extractor: this->extractors)
49 | {
50 | for (auto& fn: extractor->GetFunctionNames())
51 | {
52 | this->callMap[fn] = extractor.get(); // raw pointer into `extractors`, which keeps ownership
53 | }
54 | }
55 | }
56 |
57 | bool VisitCallExpr(CallExpr* expr)
58 | { // Handles calls whose direct callee name is registered in callMap; other calls are ignored.
59 | auto callee = expr->getDirectCallee();
60 | if (callee)
61 | {
62 | auto name = callee->getName().str();
63 | auto it = this->callMap.find(name);
64 | if (it != this->callMap.end())
65 | {
66 | auto& extractor = it->second;
67 |
68 | auto metadata = extractor->ModifyCall(this->rewriter, *expr, name); // the extractor edits the call through the rewriter
69 | this->rewrites.emplace_back(metadata, name, extractor); // remembered so the caller can emit declarations later
70 | }
71 | }
72 |
73 | return true; // keep traversing so every call in the function is seen
74 | }
75 |
76 | const std::vector& getRewrites() const // rewrites recorded so far, in visiting order
77 | {
78 | return this->rewrites;
79 | }
80 |
81 | private:
82 | Rewriter& rewriter; // borrowed from the caller
83 |
84 | std::vector> extractors; // owning storage of the registered extractors
85 | std::unordered_map callMap; // function name -> extractor handling it
86 | std::vector rewrites; // one entry per call handled by VisitCallExpr
87 | };
88 |
89 | /**
90 |  * Rewrites the SMI calls of user device kernels; any other function is left untouched.
91 |  */
92 | bool RewriteKernelsVisitor::VisitFunctionDecl(FunctionDecl* f)
93 | {
94 |     if (!isKernelFunction(f))
95 |     {
96 |         return false;
97 |     }
98 |     std::cerr << "SMI: rewriting function " << f->getName().str() << std::endl;
99 |
100 |     RewriteOpsVisitor opsVisitor(this->rewriter);
101 |     opsVisitor.TraverseFunctionDecl(f);
102 |
103 |     for (const auto& entry: opsVisitor.getRewrites())
104 |     {
105 |         OperationExtractor* extractor = entry.extractor;
106 |         extractor->OutputMetadata(entry.metadata, std::cout);   // metadata goes to stdout...
107 |         std::cerr << "SMI: rewrote ";
108 |         extractor->OutputMetadata(entry.metadata, std::cerr);   // ...and is echoed to the log
109 |         const auto declaration = extractor->CreateDeclaration(entry.callName, entry.metadata);
110 |         this->rewriter.InsertTextBefore(f->getBeginLoc(), declaration + "\n");
111 |     }
112 |     return false;
113 | }
114 |
--------------------------------------------------------------------------------
/source-rewriter/src/rewrite.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 | #include
5 |
6 | class RewriteKernelsVisitor: public clang::RecursiveASTVisitor<RewriteKernelsVisitor>
7 | {   // AST visitor whose VisitFunctionDecl hook applies the rewriter to function declarations.
8 | public:
9 |     explicit RewriteKernelsVisitor(clang::Rewriter& rewriter) : rewriter(rewriter) { }
10 |
11 |     bool VisitFunctionDecl(clang::FunctionDecl *f);
12 |
13 | private:
14 |     clang::Rewriter& rewriter;   // borrowed; must outlive this visitor
15 | };
16 |
--------------------------------------------------------------------------------
/source-rewriter/src/utils.cpp:
--------------------------------------------------------------------------------
1 | #include "utils.h"
2 |
3 | #include
4 |
5 | using namespace clang;
6 |
7 | bool isKernelFunction(FunctionDecl* decl)
8 | {   // True when `decl` has a body and carries an attribute spelled `kernel` or `__kernel`.
9 |     if (!decl || !decl->hasBody()) return false;
10 |
11 |     // Built once: the accepted attribute spellings never change between calls.
12 |     static const std::unordered_set<std::string> validAttrs = {
13 |         "kernel",
14 |         "__kernel"
15 |     };
16 |
17 |     for (auto& attr: decl->attrs())
18 |     {
19 |         auto spelling = std::string(attr->getSpelling());
20 |         if (validAttrs.find(spelling) != validAttrs.end())
21 |         {
22 |             return true;
23 |         }
24 |     }
25 |
26 |     return false;
27 | }
28 |
--------------------------------------------------------------------------------
/source-rewriter/src/utils.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include
4 |
5 | bool isKernelFunction(clang::FunctionDecl* decl);
6 |
7 | template // NOTE(review): the parameter list looks lost in this listing; presumably a single type parameter T - confirm
8 | inline std::vector gatherStatements(T* decl) // NOTE(review): the vector element type is also missing here - confirm
9 | { // Copies everything yielded by decl->children() into a vector, in iteration order.
10 | std::vector children;
11 | for (auto& child: decl->children())
12 | {
13 | children.push_back(child);
14 | }
15 | return children;
16 | }
17 |
--------------------------------------------------------------------------------
/test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # Download and unpack googletest at configure time
2 | # CMakeGTEST.txt.in becomes the CMakeLists.txt of a helper project in googletest-download/
3 | configure_file(${PROJECT_SOURCE_DIR}/CMakeGTEST.txt.in googletest-download/CMakeLists.txt)
4 |
5 | # Configure the helper project with the same generator as this build, then build it
6 | execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
7 | RESULT_VARIABLE result
8 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
9 | if(result)
10 | message(FATAL_ERROR "CMake step for googletest failed: ${result}")
11 | endif()
12 | execute_process(COMMAND ${CMAKE_COMMAND} --build .
13 | RESULT_VARIABLE result
14 | WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
15 | if(result)
16 | message(FATAL_ERROR "Build step for googletest failed: ${result}")
17 | endif()
18 |
19 | # Prevent overriding the parent project's compiler/linker
20 | # settings on Windows
21 | set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
22 |
23 | # Add googletest directly to our build. This defines
24 | # the gtest and gtest_main targets.
25 | add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
26 | ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
27 | EXCLUDE_FROM_ALL)
28 |
29 | # The gtest/gtest_main targets carry header search path
30 | # dependencies automatically when using CMake 2.8.11 or
31 | # later. Otherwise we have to add them here ourselves.
32 | if (CMAKE_VERSION VERSION_LESS 2.8.11)
33 | include_directories("${gtest_SOURCE_DIR}/include")
34 | endif()
35 |
36 | # Header search paths: the current binary dir plus the googletest headers.
37 | include_directories(${CMAKE_CURRENT_BINARY_DIR})
38 | include_directories("${gtest_SOURCE_DIR}/include")
39 | # No "-lgtest" in CMAKE_CXX_FLAGS: it is a linker input, and each test host links gtest via target_link_libraries.
40 |
41 |
42 | #p2p (the only test built from two kernel sources)
43 | smi_target(test_p2p "${CMAKE_CURRENT_SOURCE_DIR}/p2p/p2p.json" "${CMAKE_CURRENT_SOURCE_DIR}/p2p/test_p2p.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/p2p/p2p_rank0.cl;${CMAKE_CURRENT_SOURCE_DIR}/p2p/p2p_rank1.cl" 8)
44 | target_link_libraries(test_p2p_host gtest)
45 |
46 | add_test(
47 | NAME p2p
48 | COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_p2p_host
49 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_p2p/"
50 | )
51 |
52 |
53 | #broadcast
54 | smi_target(test_broadcast "${CMAKE_CURRENT_SOURCE_DIR}/broadcast/broadcast.json" "${CMAKE_CURRENT_SOURCE_DIR}/broadcast/test_broadcast.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/broadcast/broadcast.cl" 8)
55 | target_link_libraries(test_broadcast_host gtest)
56 |
57 | add_test(
58 | NAME broadcast
59 | COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_broadcast_host
60 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_broadcast/"
61 | )
62 |
63 | #reduce
64 | smi_target(test_reduce "${CMAKE_CURRENT_SOURCE_DIR}/reduce/reduce.json" "${CMAKE_CURRENT_SOURCE_DIR}/reduce/test_reduce.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/reduce/reduce.cl" 8)
65 | target_link_libraries(test_reduce_host gtest)
66 |
67 | add_test(
68 | NAME reduce
69 | COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_reduce_host
70 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_reduce/"
71 | )
72 |
73 | #scatter
74 | smi_target(test_scatter "${CMAKE_CURRENT_SOURCE_DIR}/scatter/scatter.json" "${CMAKE_CURRENT_SOURCE_DIR}/scatter/test_scatter.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/scatter/scatter.cl" 8)
75 | target_link_libraries(test_scatter_host gtest)
76 |
77 | add_test(
78 | NAME scatter
79 | COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_scatter_host
80 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_scatter/"
81 | )
82 | #gather
83 | smi_target(test_gather "${CMAKE_CURRENT_SOURCE_DIR}/gather/gather.json" "${CMAKE_CURRENT_SOURCE_DIR}/gather/test_gather.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/gather/gather.cl" 8)
84 | target_link_libraries(test_gather_host gtest)
85 |
86 | add_test(
87 | NAME gather
88 | COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_gather_host
89 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_gather/"
90 | )
91 |
92 | #mixed
93 | smi_target(test_mixed "${CMAKE_CURRENT_SOURCE_DIR}/mixed/mixed.json" "${CMAKE_CURRENT_SOURCE_DIR}/mixed/test_mixed.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/mixed/mixed.cl" 8)
94 | target_link_libraries(test_mixed_host gtest)
95 |
96 | add_test(
97 | NAME mixed
98 | COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_mixed_host
99 | WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_mixed/"
100 | )
101 |
--------------------------------------------------------------------------------
/test/README.md:
--------------------------------------------------------------------------------
1 | This directory contains a set of unit tests for the different
2 | communication primitives. Tests are executed in an emulation environment,
3 | therefore passing a test does not ensure that the primitive will work
4 | in hardware (but in any case it is a necessary condition).
5 |
6 | The user can compile and run all of them with
7 | ```
8 | ```
9 |
10 |
11 | Tested primitives:
12 | - p2p: point to point communications
13 | - broadcast
14 | - scatter
15 | - gather
16 | - reduce
17 | - mixed: p2p and collective communications in the same bitstream
18 |
19 | Each primitive is tested against different message lengths, data types and (in the case of collectives)
20 | different roots.
21 |
22 |
23 | All the tests have a timeout to ensure that a deadlock does not stall the testing procedure.
24 | However, note that emulation can be slow, so if a test times out, try to re-execute it
25 | or increase the timeout.
26 | For each test, the timeout value is defined as a macro at the beginning of the respective .cpp file.
27 |
28 |
29 | To test a primitive, in the `test` folder of the CMake build folder:
30 |
31 | 1. compile the emulated bitstream
32 |
33 | `make test_<primitive>_emulator`
34 |
35 | 2. compile the test program
36 |
37 | `make test_<primitive>_host`
38 |
39 | 3. execute the test program from the respective working directory `test_<primitive>/`
40 |
41 | `env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test_<primitive>_host`
42 |
43 | or simply use the integration with `ctest`
44 |
45 |
46 |
47 |
--------------------------------------------------------------------------------
/test/broadcast/broadcast.cl:
--------------------------------------------------------------------------------
1 | /**
2 | Broadcast test. A sequence of numbers is broadcast.
3 | Non-root ranks check whether the received number is correct
4 | */
5 |
6 | #include
7 |
8 |
9 | __kernel void test_int(__global char* mem, const int N, char root,SMI_Comm comm)
10 | {
11 | char check=1;
12 | SMI_BChannel __attribute__((register)) chan= SMI_Open_bcast_channel(N, SMI_INT,0, root,comm);
13 | for(int i=0;i
7 |
8 | __kernel void test_char(const int N, char root, __global char *mem, SMI_Comm comm)
9 | {
10 | SMI_GatherChannel __attribute__((register)) chan= SMI_Open_gather_channel(N,N, SMI_CHAR,0, root,comm);
11 | int my_rank=SMI_Comm_rank(comm);
12 | int num_ranks=SMI_Comm_size(comm);
13 | const int loop_bound=(my_rank==root)?N*num_ranks:N;
14 | char to_send=my_rank; //starting point
15 | char exp=0;
16 | char check=1;
17 | int rcv=0;
18 | for(int i=0;i
11 | __kernel void test_int(int start, const SMI_Comm comm,__global int *mem)
12 | {
13 | unsigned int my_rank=SMI_Comm_rank(comm);
14 | unsigned int num_ranks=SMI_Comm_size(comm);
15 | SMI_Channel chans=SMI_Open_send_channel(1,SMI_INT,my_rank+1,0,comm);
16 | SMI_Channel chanr=SMI_Open_receive_channel(1,SMI_INT,my_rank-1,0,comm);
17 | int data;
18 | if(my_rank>0)
19 | {
20 | SMI_Pop(&chanr, &data);
21 | data++;
22 | }
23 | else
24 | data=start;
25 |
26 | if(my_rank.aocx"
7 | */
8 |
9 |
10 | #include
11 | #include
12 | #include
13 | #include
14 | #include
15 | #include
16 | #include
17 | #include
18 | #include
19 | #include
20 | #include
21 | #include
22 | #include
23 | #include "smi_generated_host.c"
24 | #define ROUTING_DIR "smi-routes/"
25 | using namespace std;
26 | std::string program_path;
27 | int rank_count, my_rank;
28 | hlslib::ocl::Context *context;
29 |
30 | SMI_Comm comm;
31 | //https://github.com/google/googletest/issues/348#issuecomment-492785854
32 | #define ASSERT_DURATION_LE(secs, stmt) { /* runs stmt on a detached thread and fails the test if it has not completed within secs seconds */ \
33 | std::promise completed; /* NOTE(review): the promise value type looks lost in this listing - confirm */ \
34 | auto stmt_future = completed.get_future(); \
35 | std::thread([&](std::promise& completed) { \
36 | stmt; \
37 | completed.set_value(true); /* signals completion to the waiting thread */ \
38 | }, std::ref(completed)).detach(); \
39 | if(stmt_future.wait_for(std::chrono::seconds(secs)) == std::future_status::timeout){ \
40 | GTEST_FATAL_FAILURE_(" timed out (> " #secs \
41 | " seconds). Check code for infinite loops"); \
42 | MPI_Finalize(); /* NOTE(review): presumably unreachable if GTEST_FATAL_FAILURE_ returns from the test body - confirm */\
43 | } \
44 | }
45 |
46 | template // NOTE(review): the parameter list looks lost in this listing; presumably a single type parameter T - confirm
47 | bool runAndReturn(hlslib::ocl::Kernel &kernel, hlslib::ocl::Buffer &check, T exp)
48 | { // Runs the kernel between two MPI barriers and compares the value read back from `check` with `exp`.
49 | //every calling rank waits here before starting its kernel
50 | MPI_Barrier(MPI_COMM_WORLD);
51 |
52 | kernel.ExecuteTask();
53 |
54 | MPI_Barrier(MPI_COMM_WORLD);
55 | //every rank compares its own result with the same expected value
56 |
57 | T res;
58 | check.CopyToHost(&res);
59 | return res==exp;
60 |
61 | }
62 |
63 | TEST(Gather, MPIinit)
64 | { //the suite expects to be launched with exactly 8 MPI ranks
65 | ASSERT_EQ(rank_count,8);
66 | }
67 |
68 |
69 |
70 | TEST(Gather, IntegerMessages)
71 | {
72 | //with this test we evaluate the correctness of integer message transmission
73 |
74 | hlslib::ocl::Buffer check = context->MakeBuffer(1);
75 | hlslib::ocl::Kernel kernel = context->CurrentlyLoadedProgram().MakeKernel("test_int");
76 |
77 |
78 | std::vector starts={1,100,300};
79 | int runs=2;
80 | for(int start:starts) //consider different start values
81 | {
82 | cl::Kernel cl_kernel = kernel.kernel();
83 | cl_kernel.setArg(0,sizeof(int),&start);
84 | cl_kernel.setArg(1,sizeof(SMI_Comm),&comm);
85 | cl_kernel.setArg(2,sizeof(cl_mem),&check.devicePtr());
86 |
87 | for(int i=0;i /dev/null;");
91 |
92 | //source https://github.com/google/googletest/issues/348#issuecomment-492785854
93 | ASSERT_DURATION_LE(10, {
94 | ASSERT_TRUE(runAndReturn(kernel,check,start+7));
95 | });
96 |
97 | }
98 |
99 | }
100 | }
101 |
102 |
103 |
104 | int main(int argc, char *argv[])
105 | { // Sets up googletest, MPI and the SMI communicator, runs all tests and returns their result.
106 | // std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " " << std::endl;
107 |
108 | int result = 0;
109 |
110 | ::testing::InitGoogleTest(&argc, argv);
111 | //an optional single argument overrides the default bitstream path
112 | if(argc==2)
113 | program_path =argv[1];
114 | else
115 | program_path = "emulator_/mixed.aocx";
116 | ::testing::TestEventListeners& listeners =
117 | ::testing::UnitTest::GetInstance()->listeners();
118 | CHECK_MPI(MPI_Init(&argc, &argv));
119 |
120 | CHECK_MPI(MPI_Comm_size(MPI_COMM_WORLD, &rank_count));
121 | CHECK_MPI(MPI_Comm_rank(MPI_COMM_WORLD, &my_rank));
122 | if (my_rank!= 0) { //delete the default result printer on every rank except 0
123 | delete listeners.Release(listeners.default_result_printer());
124 | }
125 |
126 | //create environment
127 | int fpga=my_rank%2; //NOTE(review): fpga is not used in the rest of main
128 | program_path = replace(program_path, "", std::to_string(my_rank)); //NOTE(review): the pattern literal is empty in this listing; presumably a placeholder was lost - confirm
129 | context = new hlslib::ocl::Context();
130 | auto program = context->MakeProgram(program_path);
131 | std::vector> buffers;
132 | comm=SmiInit_mixed(my_rank, rank_count, ROUTING_DIR, *context, program, buffers);
133 |
134 |
135 | result = RUN_ALL_TESTS();
136 | MPI_Finalize();
137 |
138 | return result;
139 |
140 | }
141 |
--------------------------------------------------------------------------------
/test/p2p/p2p.json:
--------------------------------------------------------------------------------
1 | {
2 | "fpgas": {
3 | "fpga-0001:acl0": "p2p_rank0",
4 | "fpga-0001:acl1": "p2p_rank1",
5 | "fpga-0002:acl0": "p2p_rank1",
6 | "fpga-0002:acl1": "p2p_rank1",
7 | "fpga-0003:acl0": "p2p_rank1",
8 | "fpga-0003:acl1": "p2p_rank1",
9 | "fpga-0004:acl0": "p2p_rank1",
10 | "fpga-0004:acl1": "p2p_rank1"
11 | },
12 | "connections": {
13 | "fpga-0001:acl0:ch2": "fpga-0001:acl1:ch3",
14 | "fpga-0001:acl0:ch3": "fpga-0001:acl1:ch2",
15 | "fpga-0002:acl0:ch2": "fpga-0002:acl1:ch3",
16 | "fpga-0002:acl0:ch3": "fpga-0002:acl1:ch2",
17 | "fpga-0001:acl0:ch1": "fpga-0002:acl0:ch0",
18 | "fpga-0001:acl1:ch1": "fpga-0002:acl1:ch0",
19 | "fpga-0002:acl0:ch1": "fpga-0003:acl0:ch0",
20 | "fpga-0002:acl1:ch1": "fpga-0003:acl1:ch0",
21 | "fpga-0003:acl0:ch2": "fpga-0003:acl1:ch3",
22 | "fpga-0003:acl0:ch3": "fpga-0003:acl1:ch2",
23 | "fpga-0004:acl0:ch2": "fpga-0004:acl1:ch3",
24 | "fpga-0004:acl0:ch3": "fpga-0004:acl1:ch2",
25 | "fpga-0003:acl0:ch1": "fpga-0004:acl0:ch0",
26 | "fpga-0003:acl1:ch1": "fpga-0004:acl1:ch0",
27 | "fpga-0001:acl0:ch0": "fpga-0004:acl0:ch1",
28 | "fpga-0001:acl1:ch0": "fpga-0004:acl1:ch1"
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/test/p2p/p2p_rank0.cl:
--------------------------------------------------------------------------------
1 | /**
2 | P2P test. Rank 0 sends a stream of data
3 | */
4 | #pragma OPENCL EXTENSION cl_khr_fp64 : enable
5 |
6 | #include
7 |
8 | __kernel void test_char(const int N, const char dest_rank, const SMI_Comm comm)
9 | {
10 | SMI_Channel chan=SMI_Open_send_channel(N,SMI_CHAR,dest_rank,1,comm);
11 | for(int i=0;i
8 |
9 | __kernel void test_short(__global char *mem, const int N, SMI_Comm comm)
10 | {
11 | SMI_Channel chan=SMI_Open_receive_channel(N,SMI_SHORT,0,0,comm);
12 | char check=1;
13 | const short expected=1001;
14 | for(int i=0;i
7 |
8 |
9 |
10 | __kernel void test_int(const int N, char root,__global char* mem, SMI_Comm comm)
11 | {
12 |
13 | SMI_ScatterChannel __attribute__((register)) chan= SMI_Open_scatter_channel(N,N, SMI_INT, 0,root,comm);
14 | char check=1;
15 | int num_ranks=SMI_Comm_size(comm);
16 | int my_rank=SMI_Comm_rank(comm);
17 | const int loop_bound=(my_rank==root)?N*num_ranks:N;
18 | const int to_rcv_start=my_rank*N;
19 | for(int i=0;i