├── .gitignore ├── GPU ├── final_network_cublasLt_1_node_no_FIFO_scatter │ ├── test.cpp │ ├── a.out │ ├── cuda_server │ ├── network_client_sende │ ├── single_connection_network_client_sender │ ├── multiple_connections_network_client_sender │ ├── single_connection_network_server_receiver │ ├── multiple_connections_network_server_receiver │ ├── run_client_sender.sh │ ├── run_cuda_server.sh │ ├── pthread_test.c │ ├── constant.h │ ├── README.md │ ├── out │ └── single_connection_network_client_sender.c ├── measure_network_cuda_cp_latency_3_nodes │ ├── a.out │ ├── cuda_server │ ├── timeline.prof │ ├── network_client_sende │ ├── single_connection_network_client_sender │ ├── single_connection_network_server_receiver │ ├── multiple_connections_network_client_sender │ ├── multiple_connections_network_server_receiver │ ├── run_cuda_server.sh │ ├── CPU0_multiple_connections_network_client_sender │ ├── FPGA0_multiple_connections_network_client_sender │ ├── FPGA1_multiple_connections_network_client_sender │ ├── run_client_sender.sh │ ├── pthread_test.c │ ├── README.md │ ├── out │ ├── constant.h │ └── single_connection_network_client_sender.c ├── measure_network_cuda_cp_latency_single_node │ ├── a.out │ ├── cuda_server │ ├── timeline.prof │ ├── network_client_sende │ ├── simple_2_thread_latency │ ├── simple_2_thread_latency.sh │ ├── single_connection_network_client_sender │ ├── multiple_connections_network_client_sender │ ├── single_connection_network_server_receiver │ ├── multiple_connections_network_server_receiver │ ├── run_cuda_server.sh │ ├── run_client_sender.sh │ ├── pthread_test.c │ ├── constant.h │ ├── README.md │ └── single_connection_network_client_sender.c └── final_network_cublasLt_3_nodes_no_FIFO_scatter │ ├── a.out │ ├── cuda_server │ ├── network_client_sende │ ├── single_connection_network_client_sender │ ├── single_connection_network_server_receiver │ ├── multiple_connections_network_client_sender │ ├── multiple_connections_network_server_receiver │ ├── 
CPU0_multiple_connections_network_client_sender │ ├── FPGA0_multiple_connections_network_client_sender │ ├── FPGA1_multiple_connections_network_client_sender │ ├── run_single_sender_CPU0.sh │ ├── run_single_sender_FPGA0.sh │ ├── run_single_sender_FPGA1.sh │ ├── run_cuda_server.sh │ ├── run_client_sender.sh │ ├── out │ ├── pthread_test.c │ ├── constant.h │ ├── README.md │ └── single_connection_network_client_sender.c ├── FPGA ├── kernel │ ├── user_krnl │ │ ├── scatter_krnl │ │ │ ├── src │ │ │ │ └── hls │ │ │ │ │ ├── scatter_config.hpp.in │ │ │ │ │ ├── make.tcl.in │ │ │ │ │ ├── mem_utils.hpp │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── scatter.hpp │ │ │ │ │ └── packet.hpp │ │ │ └── config_sp_scatter_krnl.txt │ │ ├── iperf_krnl │ │ │ ├── src │ │ │ │ └── hls │ │ │ │ │ ├── iperf_client_config.hpp.in │ │ │ │ │ ├── make.tcl.in │ │ │ │ │ ├── mem_utils.hpp │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ └── packet.hpp │ │ │ └── config_sp_iperf_krnl.txt │ │ ├── hls_test_krnl │ │ │ ├── config_sp_hls_test_krnl.txt │ │ │ └── src │ │ │ │ └── hls │ │ │ │ ├── mem_utils.hpp │ │ │ │ └── in_casting_bench.hpp │ │ ├── embedding_krnl │ │ │ ├── src │ │ │ │ └── hls │ │ │ │ │ └── mem_utils.hpp │ │ │ └── config_sp_embedding_krnl.txt │ │ ├── embedding_377_krnl │ │ │ ├── src │ │ │ │ └── hls │ │ │ │ │ └── mem_utils.hpp │ │ │ └── config_sp_embedding_377_krnl.txt │ │ ├── embedding_47_krnl │ │ │ ├── src │ │ │ │ └── hls │ │ │ │ │ └── mem_utils.hpp │ │ │ └── config_sp_embedding_47_krnl.txt │ │ └── embedding_98_krnl │ │ │ ├── src │ │ │ └── hls │ │ │ │ └── mem_utils.hpp │ │ │ └── config_sp_embedding_98_krnl.txt │ ├── common │ │ └── types │ │ │ └── network_types.svh.in │ ├── network_krnl │ │ └── src │ │ │ └── hdl │ │ │ ├── axis_meta_reg.sv │ │ │ ├── axis_data_reg_array.sv │ │ │ ├── axis_udp_meta_reg.sv │ │ │ └── axis_data_reg.sv │ └── cmac_krnl │ │ ├── cmac_krnl.xml │ │ └── src │ │ └── hdl │ │ ├── axis_data_reg_array.sv │ │ ├── axis_data_reg.sv │ │ └── network_clk_cross.sv ├── common │ ├── includes │ │ ├── xcl2 
│ │ │ ├── xcl2.mk │ │ │ └── xcl2.cpp │ │ ├── bitmap │ │ │ ├── bitmap.mk │ │ │ ├── bitmap.h │ │ │ └── bitmap.cpp │ │ ├── logger │ │ │ ├── logger.mk │ │ │ └── logger.h │ │ ├── lodepng │ │ │ └── lodepng.mk │ │ ├── simplebmp │ │ │ ├── simplebmp.mk │ │ │ └── simplebmp.h │ │ ├── cmdparser │ │ │ └── cmdparser.mk │ │ ├── oclHelper │ │ │ ├── oclHelper.mk │ │ │ └── oclHelper.h │ │ └── opencl │ │ │ └── opencl.mk │ └── utility │ │ ├── readme_gen │ │ ├── gs_summary.py │ │ ├── gs_summary_subdir.py │ │ ├── update_all_readme.sh │ │ └── gs_summary_util.py │ │ ├── parse_platform_list.py │ │ ├── check_target_device.py │ │ ├── makefile_gen │ │ ├── update_makegen_all.sh │ │ ├── update_descgen_all.sh │ │ └── descgen.py │ │ ├── device_list.py │ │ ├── build_what.sh │ │ ├── md2rst │ │ └── update_md2rst_all.sh │ │ ├── check_license.sh │ │ ├── Consolidation.py │ │ ├── check_readme.sh │ │ ├── check_makefile.sh │ │ └── check_descr.py ├── README.md ├── cmake │ ├── FindVivado.cmake │ └── FindVivadoHLS.cmake ├── config_rtl.mk ├── config_hls.mk ├── scripts │ ├── network_ultrascale.tcl │ └── gen_xo.tcl ├── CMakeLists.txt ├── utils.mk └── host │ ├── embedding_377_krnl │ └── host.hpp │ ├── embedding_47_krnl │ └── host.hpp │ ├── embedding_98_krnl │ └── host.hpp │ └── embedding_krnl │ └── host.hpp └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/test.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int main(){} 4 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/src/hls/scatter_config.hpp.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | const unsigned DATA_WIDTH = ${DATA_WIDTH} * 8; 
-------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/iperf_krnl/src/hls/iperf_client_config.hpp.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | const unsigned DATA_WIDTH = ${DATA_WIDTH} * 8; 4 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/a.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/a.out -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/a.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/a.out -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/cuda_server: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/cuda_server -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/a.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/a.out -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/a.out: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/a.out -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/timeline.prof: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/timeline.prof -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/cuda_server: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/cuda_server -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/cuda_server: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/cuda_server -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/cuda_server: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/cuda_server -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/timeline.prof: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/timeline.prof 
-------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/network_client_sende: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/network_client_sende -------------------------------------------------------------------------------- /FPGA/common/includes/xcl2/xcl2.mk: -------------------------------------------------------------------------------- 1 | xcl2_SRCS:=${COMMON_REPO}/common/includes/xcl2/xcl2.cpp 2 | xcl2_HDRS:=${COMMON_REPO}/common/includes/xcl2/xcl2.hpp 3 | 4 | xcl2_CXXFLAGS:=-I${COMMON_REPO}/common/includes/xcl2 5 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/network_client_sende: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/network_client_sende -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/network_client_sende: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/network_client_sende -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/network_client_sende: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/network_client_sende 
-------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/simple_2_thread_latency: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/simple_2_thread_latency -------------------------------------------------------------------------------- /FPGA/common/includes/bitmap/bitmap.mk: -------------------------------------------------------------------------------- 1 | bitmap_SRCS:=${COMMON_REPO}/common/includes/bitmap/bitmap.cpp 2 | bitmap_HDRS:=${COMMON_REPO}/common/includes/bitmap/bitmap.h 3 | bitmap_CXXFLAGS:=-I${COMMON_REPO}/common/includes/bitmap 4 | -------------------------------------------------------------------------------- /FPGA/common/includes/logger/logger.mk: -------------------------------------------------------------------------------- 1 | logger_SRCS:=${COMMON_REPO}/common/includes/logger/logger.cpp 2 | logger_HDRS:=${COMMON_REPO}/common/includes/logger/logger.h 3 | logger_CXXFLAGS:=-I${COMMON_REPO}/common/includes/logger 4 | -------------------------------------------------------------------------------- /FPGA/common/includes/lodepng/lodepng.mk: -------------------------------------------------------------------------------- 1 | lodepng_SRCS:=${COMMON_REPO}/common/includes/lodepng/lodepng.cpp 2 | lodepng_HDRS:=${COMMON_REPO}/common/includes/lodepng/lodepng.h 3 | lodepng_CXXFLAGS:=-I${COMMON_REPO}/common/includes/lodepng 4 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/simple_2_thread_latency.sh: -------------------------------------------------------------------------------- 1 | # Run this script before client 2 | rm simple_2_thread_latency 3 | gcc -lpthread simple_2_thread_latency.c -o simple_2_thread_latency 4 | ./simple_2_thread_latency 5 
| -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/single_connection_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/single_connection_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/single_connection_network_server_receiver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/single_connection_network_server_receiver -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/single_connection_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/single_connection_network_client_sender -------------------------------------------------------------------------------- /FPGA/common/includes/simplebmp/simplebmp.mk: -------------------------------------------------------------------------------- 1 | simplebmp_SRCS:=${COMMON_REPO}/common/includes/simplebmp/simplebmp.cpp 2 | 
simplebmp_HDRS:=${COMMON_REPO}/common/includes/simplebmp/simplebmp.h 3 | simplebmp_CXXFLAGS:=-I${COMMON_REPO}/common/includes/simplebmp 4 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/single_connection_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/single_connection_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/single_connection_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/single_connection_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/multiple_connections_network_server_receiver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/multiple_connections_network_server_receiver -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/run_cuda_server.sh: -------------------------------------------------------------------------------- 1 | # Run this script before client 2 | rm cuda_server 3 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 4 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server -------------------------------------------------------------------------------- 
/GPU/measure_network_cuda_cp_latency_single_node/multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/single_connection_network_server_receiver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/single_connection_network_server_receiver -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/single_connection_network_server_receiver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/single_connection_network_server_receiver -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/single_connection_network_server_receiver: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/single_connection_network_server_receiver -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/CPU0_multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/CPU0_multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/FPGA0_multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/FPGA0_multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/FPGA1_multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_3_nodes/FPGA1_multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/multiple_connections_network_server_receiver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/measure_network_cuda_cp_latency_single_node/multiple_connections_network_server_receiver -------------------------------------------------------------------------------- 
/GPU/measure_network_cuda_cp_latency_single_node/run_cuda_server.sh: -------------------------------------------------------------------------------- 1 | # Run this script before client 2 | rm cuda_server 3 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 4 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server -------------------------------------------------------------------------------- /FPGA/common/includes/cmdparser/cmdparser.mk: -------------------------------------------------------------------------------- 1 | cmdparser_SRCS:=${COMMON_REPO}/common/includes/cmdparser/cmdlineparser.cpp 2 | cmdparser_HDRS:=${COMMON_REPO}/common/includes/cmdparser/cmdlineparser.h 3 | cmdparser_CXXFLAGS:=-I${COMMON_REPO}/common/includes/cmdparser 4 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/multiple_connections_network_server_receiver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_1_node_no_FIFO_scatter/multiple_connections_network_server_receiver -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/multiple_connections_network_server_receiver: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/multiple_connections_network_server_receiver -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/CPU0_multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/CPU0_multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/FPGA0_multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/FPGA0_multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/FPGA1_multiple_connections_network_client_sender: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fpgasystems/GPU-FPGA-Recommendation-System/HEAD/GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/FPGA1_multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/run_client_sender.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm multiple_connections_network_client_sender 3 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 4 | ./multiple_connections_network_client_sender 
-------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/run_client_sender.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm multiple_connections_network_client_sender 3 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 4 | ./multiple_connections_network_client_sender -------------------------------------------------------------------------------- /FPGA/common/includes/oclHelper/oclHelper.mk: -------------------------------------------------------------------------------- 1 | oclHelper_SRCS:=${COMMON_REPO}/common/includes/oclHelper/oclHelper.cpp ${COMMON_REPO}/common/includes/oclHelper/oclErrorCodes.cpp 2 | oclHelper_HDRS:=${COMMON_REPO}/common/includes/oclHelper/oclHelper.h 3 | oclHelper_CXXFLAGS:=-I${COMMON_REPO}/common/includes/oclHelper 4 | -------------------------------------------------------------------------------- /FPGA/common/utility/readme_gen/gs_summary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, re 4 | import fnmatch 5 | import json 6 | import sys 7 | 8 | # To avoid .pyc files 9 | sys.dont_write_bytecode = True 10 | 11 | sys.path.append(".") 12 | import gs_summary_util 13 | 14 | gs_summary_util.genReadMe(".") 15 | -------------------------------------------------------------------------------- /FPGA/common/utility/readme_gen/gs_summary_subdir.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, re 4 | import fnmatch 5 | import json 6 | import sys 7 | 8 | # To avoid .pyc files 9 | sys.dont_write_bytecode = True 10 | 11 | sys.path.append(".") 12 | import gs_summary_util 13 | 14 | gs_summary_util.genReadMe2(".") 15 | 
-------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/run_single_sender_CPU0.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm CPU0_multiple_connections_network_client_sender 3 | gcc CPU0_multiple_connections_network_client_sender.c -lpthread -o CPU0_multiple_connections_network_client_sender 4 | ./CPU0_multiple_connections_network_client_sender & -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/run_single_sender_FPGA0.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm FPGA0_multiple_connections_network_client_sender 3 | gcc FPGA0_multiple_connections_network_client_sender.c -lpthread -o FPGA0_multiple_connections_network_client_sender 4 | ./FPGA0_multiple_connections_network_client_sender & -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/run_single_sender_FPGA1.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm FPGA1_multiple_connections_network_client_sender 3 | gcc FPGA1_multiple_connections_network_client_sender.c -lpthread -o FPGA1_multiple_connections_network_client_sender 4 | ./FPGA1_multiple_connections_network_client_sender & 5 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/run_cuda_server.sh: -------------------------------------------------------------------------------- 1 | # Run this script before client 2 | rm cuda_server 3 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 4 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server 5 | #nvprof -f --export-profile 
timeline.prof --concurrent-kernels off ./cuda_server 6 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/run_cuda_server.sh: -------------------------------------------------------------------------------- 1 | # Run this script before client 2 | rm cuda_server 3 | nvcc -std=c++11 -l cublasLt -lpthread cuda_server.c -o cuda_server 4 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server 5 | #nvprof -f --export-profile timeline.prof --concurrent-kernels off ./cuda_server 6 | -------------------------------------------------------------------------------- /FPGA/common/utility/parse_platform_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import os 3 | import sys 4 | 5 | def main (): 6 | dev = sys.argv[1] 7 | if "PLATFORM_REPO_PATHS" in os.environ: 8 | plist = os.environ['PLATFORM_REPO_PATHS'].split(":") 9 | for shell in plist: 10 | if os.path.isdir(shell + "/" + dev): 11 | return shell 12 | 13 | print (main()) 14 | -------------------------------------------------------------------------------- /FPGA/common/includes/opencl/opencl.mk: -------------------------------------------------------------------------------- 1 | # Definition of include file locations 2 | xrt_path = $(XILINX_XRT) 3 | ifneq ($(HOST_ARCH), x86) 4 | xrt_path = $(SYSROOT)/usr/ 5 | endif 6 | 7 | OPENCL_INCLUDE:= $(xrt_path)/include 8 | ifneq ($(HOST_ARCH), x86) 9 | OPENCL_INCLUDE:= $(xrt_path)/include/xrt 10 | endif 11 | 12 | VIVADO_INCLUDE:= $(XILINX_VIVADO)/include 13 | opencl_CXXFLAGS=-I$(OPENCL_INCLUDE) -I$(VIVADO_INCLUDE) 14 | OPENCL_LIB:= $(xrt_path)/lib 15 | opencl_LDFLAGS=-L$(OPENCL_LIB) -lOpenCL -lpthread 16 | -------------------------------------------------------------------------------- /FPGA/README.md: -------------------------------------------------------------------------------- 1 | ### Vitis with Network Stack 
2 | 3 | Adding the network stack to the Vitis shell. 4 | 5 | ## Setup 6 | Git Clone 7 | 8 | git clone 9 | git submodule update --init --recursive 10 | 11 | Setup the HLS IPs: 12 | 13 | mkdir build 14 | cd build 15 | cmake .. -DFDEV_NAME=u280 -DTCP_STACK_EN=1 -DTCP_STACK_RX_DDR_BYPASS_EN=1 16 | make installip 17 | 18 | Create the Vitis kernel: 19 | 20 | make all DEVICE=/opt/xilinx/platforms/xilinx_u280_xdma_201920_3/xilinx_u280_xdma_201920_3.xpfm USER_KRNL=scatter_krnl NETH=4 21 | -------------------------------------------------------------------------------- /FPGA/common/utility/check_target_device.py: -------------------------------------------------------------------------------- 1 | import json, sys 2 | 3 | descriptionfile = sys.argv[1] 4 | target = sys.argv[2] 5 | device = sys.argv[3] 6 | 7 | with open(descriptionfile) as json_file: 8 | data = json.load(json_file) 9 | 10 | targetNotSupported = 'targets' in data and target not in data['targets'] 11 | if targetNotSupported: 12 | print("%s target not supported for this example" % target) 13 | 14 | deviceNotSupported = 'nboard' in data and any(nboard in device for nboard in data['nboard']) 15 | if deviceNotSupported: 16 | print("%s device not supported for this example" % device) 17 | 18 | sys.exit(not(targetNotSupported or deviceNotSupported)) 19 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/run_client_sender.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm FPGA0_multiple_connections_network_client_sender FPGA1_multiple_connections_network_client_sender CPU0_multiple_connections_network_client_sender 3 | gcc FPGA0_multiple_connections_network_client_sender.c -lpthread -o FPGA0_multiple_connections_network_client_sender 4 | gcc FPGA1_multiple_connections_network_client_sender.c -lpthread -o FPGA1_multiple_connections_network_client_sender 5 | gcc 
CPU0_multiple_connections_network_client_sender.c -lpthread -o CPU0_multiple_connections_network_client_sender 6 | ./FPGA0_multiple_connections_network_client_sender & 7 | ./FPGA1_multiple_connections_network_client_sender & 8 | ./CPU0_multiple_connections_network_client_sender & -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/run_client_sender.sh: -------------------------------------------------------------------------------- 1 | # run this script after server 2 | rm FPGA0_multiple_connections_network_client_sender FPGA1_multiple_connections_network_client_sender CPU0_multiple_connections_network_client_sender 3 | gcc FPGA0_multiple_connections_network_client_sender.c -lpthread -o FPGA0_multiple_connections_network_client_sender 4 | gcc FPGA1_multiple_connections_network_client_sender.c -lpthread -o FPGA1_multiple_connections_network_client_sender 5 | gcc CPU0_multiple_connections_network_client_sender.c -lpthread -o CPU0_multiple_connections_network_client_sender 6 | ./CPU0_multiple_connections_network_client_sender & 7 | sleep 10 8 | ./FPGA0_multiple_connections_network_client_sender & 9 | sleep 10 10 | ./FPGA1_multiple_connections_network_client_sender & 11 | -------------------------------------------------------------------------------- /FPGA/kernel/common/types/network_types.svh.in: -------------------------------------------------------------------------------- 1 | `ifndef NETWORK_TYPES_SVH 2 | `define NETWORK_TYPES_SVH 3 | 4 | `define DRAM_EN${DRAM_EN}${TCP_STACK_EN} 5 | 6 | `ifdef DRAM_EN1 7 | `define USE_DDR 8 | `endif 9 | 10 | `ifdef DRAM_EN10 11 | `define USE_DDR 12 | `endif 13 | 14 | `ifdef DRAM_EN01 15 | `define USE_DDR 16 | `endif 17 | 18 | `define USE_${NETWORK_INTERFACE}G 19 | 20 | parameter NETWORK_STACK_WIDTH = 512; 21 | parameter UDP_META_WIDTH = 176; 22 | 23 | // TCP/IP 24 | parameter TCP_STACK_EN = ${TCP_STACK_EN}; 25 | parameter TCP_RX_BYPASS_EN = 
${TCP_STACK_RX_DDR_BYPASS_EN}; 26 | 27 | //UDP/IP 28 | parameter UDP_STACK_EN = ${UDP_STACK_EN}; 29 | 30 | //RoCEv2 31 | parameter ROCE_STACK_EN = 0; 32 | 33 | //DRAM 34 | parameter NUM_DDR_CHANNELS = 2; 35 | parameter NUM_TCP_CHANNELS = 2; 36 | parameter NUM_NET_PORTS = 2; 37 | 38 | `endif -------------------------------------------------------------------------------- /FPGA/cmake/FindVivado.cmake: -------------------------------------------------------------------------------- 1 | # Author: Johannes de Fine Licht (johannes.definelicht@inf.ethz.ch) 2 | # Created: October 2016 3 | # 4 | # To specify the path to the Vivado installation, provide: 5 | # -DVIVADO_ROOT_DIR= 6 | # If successful, this script defines: 7 | # VIVADO_FOUND 8 | # VIVADO_BINARY 9 | 10 | cmake_minimum_required(VERSION 3.0) 11 | 12 | find_path(VIVADO_PATH 13 | NAMES vivado 14 | PATHS ${VIVADO_ROOT_DIR} ENV XILINX_VIVADO 15 | PATH_SUFFIXES bin 16 | ) 17 | 18 | if(NOT EXISTS ${VIVADO_PATH}) 19 | 20 | message(WARNING "Vivado not found.") 21 | 22 | else() 23 | 24 | get_filename_component(VIVADO_ROOT_DIR ${VIVADO_PATH} DIRECTORY) 25 | 26 | set(VIVADO_FOUND TRUE) 27 | set(VIVADO_BINARY ${VIVADO_ROOT_DIR}/bin/vivado) 28 | 29 | message(STATUS "Found Vivado at ${VIVADO_ROOT_DIR}.") 30 | 31 | endif() 32 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/out: -------------------------------------------------------------------------------- 1 | concurrentKernels: 1 2 | = 1: Concurrent Kernel Execution 3 | asyncEngineCount: 3 4 | > 0: Overlap of Data Transfer and Kernel Execution 5 | = 2: Concurrent Data Transfers 6 | Device 0 has compute capability 7.5. 7 | Before Thread 8 | Successfully built connection with CPU0. 9 | Successfully built connection with CPU0. 10 | Successfully built connection with CPU0. 11 | Successfully built connection with CPU0. 12 | Successfully built connection with FPGA0. 
13 | Successfully built connection with FPGA0. 14 | Successfully built connection with FPGA0. 15 | Successfully built connection with FPGA0. 16 | Successfully built connection with FPGA1. 17 | batch_count: 0 18 | Successfully built connection with FPGA1. 19 | batch_count: 1 20 | Successfully built connection with FPGA1. 21 | batch_count: 2 22 | Successfully built connection with FPGA1. 23 | batch_count: 3 24 | -------------------------------------------------------------------------------- /FPGA/cmake/FindVivadoHLS.cmake: -------------------------------------------------------------------------------- 1 | # Author: Johannes de Fine Licht (johannes.definelicht@inf.ethz.ch) 2 | # Created: October 2016 3 | # 4 | # To specify the path to the Vivado HLS installation, provide: 5 | # -DVIVADO_HLS_ROOT_DIR= 6 | # If successful, this script defines: 7 | # VIVADO_HLS_FOUND 8 | # VIVADO_HLS_BINARY 9 | # VIVADO_HLS_INCLUDE_DIRS 10 | 11 | cmake_minimum_required(VERSION 3.0) 12 | 13 | find_path(VIVADO_HLS_PATH 14 | NAMES vivado_hls 15 | PATHS ${VIVADO_HLS_ROOT_DIR} ENV XILINX_VIVADO_HLS ENV XILINX_HLS 16 | PATH_SUFFIXES bin 17 | ) 18 | 19 | if(NOT EXISTS ${VIVADO_HLS_PATH}) 20 | 21 | message(WARNING "Vivado HLS not found.") 22 | 23 | else() 24 | 25 | get_filename_component(VIVADO_HLS_ROOT_DIR ${VIVADO_HLS_PATH} DIRECTORY) 26 | 27 | set(VIVADO_HLS_FOUND TRUE) 28 | set(VIVADO_HLS_INCLUDE_DIRS ${VIVADO_HLS_ROOT_DIR}/include/) 29 | set(VIVADO_HLS_BINARY ${VIVADO_HLS_ROOT_DIR}/bin/vivado_hls) 30 | 31 | message(STATUS "Found Vivado HLS at ${VIVADO_HLS_ROOT_DIR}.") 32 | 33 | endif() 34 | -------------------------------------------------------------------------------- /FPGA/kernel/network_krnl/src/hdl/axis_meta_reg.sv: -------------------------------------------------------------------------------- 1 | `include "network_intf.svh" 2 | `include "network_types.svh" 3 | 4 | module axis_meta_reg #( 5 | parameter WIDTH = 32 6 | ) ( 7 | input wire aclk, 8 | input wire aresetn, 9 | 10 | 
axis_meta.slave meta_in, 11 | axis_meta.master meta_out 12 | ); 13 | 14 | if(WIDTH == 56) begin 15 | axis_register_slice_meta_56_0 inst_reg_slice ( 16 | .aclk(aclk), 17 | .aresetn(aresetn), 18 | .s_axis_tvalid(meta_in.valid), 19 | .s_axis_tready(meta_in.ready), 20 | .s_axis_tdata(meta_in.data), 21 | .m_axis_tvalid(meta_out.valid), 22 | .m_axis_tready(meta_out.ready), 23 | .m_axis_tdata(meta_out.data) 24 | ); 25 | end 26 | else if(WIDTH == 32) begin 27 | axis_register_slice_meta_32_0 inst_reg_slice ( 28 | .aclk(aclk), 29 | .aresetn(aresetn), 30 | .s_axis_tvalid(meta_in.valid), 31 | .s_axis_tready(meta_in.ready), 32 | .s_axis_tdata(meta_in.data), 33 | .m_axis_tvalid(meta_out.valid), 34 | .m_axis_tready(meta_out.ready), 35 | .m_axis_tdata(meta_out.data) 36 | ); 37 | end 38 | 39 | endmodule -------------------------------------------------------------------------------- /FPGA/kernel/cmac_krnl/cmac_krnl.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /FPGA/common/utility/makefile_gen/update_makegen_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | appDir=$(dirname $(dirname $(dirname $(readlink -f $0)))) 3 | 4 | echo "-----------------------" 5 | echo "-- UPDATING MAKEFILES --" 6 | echo "-----------------------" 7 | 8 | update_file() { 9 | ignore=0 10 | 11 | for i in $IGNORE; do 12 | if [[ $1 =~ ^description.json ]]; then 13 | ignore=1 14 | fi 15 | done 16 | 17 | if [[ $VERBOSE == "true" ]]; then 18 | echo -n "Checking $1 ... " 19 | fi 20 | if [[ $ignore == 1 ]]; then 21 | if [[ $VERBOSE == "true" ]]; then 22 | echo "SKIP" 23 | fi 24 | else 25 | pushd . 
> /dev/null 26 | cd $(dirname $1) 27 | $appDir/utility/makefile_gen/makegen.py description.json #> /dev/null 2>&1 28 | popd >/dev/null 29 | fi 30 | } 31 | 32 | 33 | VCS_FILES=$(git ls-files) 34 | 35 | for f in $VCS_FILES; do 36 | if [[ ($f == */description.json) ]]; then 37 | if grep -q '"match_makefile": "false"' $f; then 38 | echo $f 39 | echo "Makefile Manually Edited:: AutoMakefile Generator Failed" 40 | else 41 | echo $f 42 | update_file $(readlink -f $f) 43 | fi 44 | fi 45 | done 46 | -------------------------------------------------------------------------------- /FPGA/common/utility/makefile_gen/update_descgen_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | appDir=$(dirname $(dirname $(dirname $(readlink -f $0)))) 3 | 4 | echo "-----------------------" 5 | echo "-- UPDATING MAKEFILES --" 6 | echo "-----------------------" 7 | 8 | update_file() { 9 | ignore=0 10 | 11 | for i in $IGNORE; do 12 | if [[ $1 =~ ^description.json ]]; then 13 | ignore=1 14 | fi 15 | done 16 | 17 | if [[ $VERBOSE == "true" ]]; then 18 | echo -n "Checking $1 ... " 19 | fi 20 | if [[ $ignore == 1 ]]; then 21 | if [[ $VERBOSE == "true" ]]; then 22 | echo "SKIP" 23 | fi 24 | else 25 | pushd . 
> /dev/null 26 | cd $(dirname $1) 27 | $appDir/utility/makefile_gen/descgen.py description.json #> /dev/null 2>&1 28 | popd >/dev/null 29 | fi 30 | } 31 | 32 | 33 | VCS_FILES=$(git ls-files) 34 | 35 | for f in $VCS_FILES; do 36 | if [[ ($f == */description.json) ]]; then 37 | if grep -q '"match_makefile": "false"' $f; then 38 | echo $f 39 | echo "Makefile Manually Edited:: AutoMakefile Generator Failed" 40 | else 41 | echo $f 42 | update_file $(readlink -f $f) 43 | fi 44 | fi 45 | done 46 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/src/hls/make.tcl.in: -------------------------------------------------------------------------------- 1 | 2 | open_project ${PROJECT_NAME}_prj 3 | 4 | open_solution "solution1" 5 | set_part ${FPGA_PART} 6 | create_clock -period ${CLOCK_PERIOD} -name default 7 | 8 | set_top ${PROJECT_NAME} 9 | 10 | add_files ${CMAKE_CURRENT_SOURCE_DIR}/scatter.cpp -cflags "-I${CMAKE_CURRENT_BINARY_DIR}" 11 | 12 | 13 | add_files -tb ${CMAKE_CURRENT_SOURCE_DIR}/test_scatter.cpp 14 | 15 | 16 | #Check which command 17 | set command [lindex $argv 2] 18 | 19 | if {$command == "synthesis"} { 20 | csynth_design 21 | } elseif {$command == "csim"} { 22 | csim_design 23 | } elseif {$command == "ip"} { 24 | export_design -format ip_catalog -ipname "scatter" -display_name "scatter" -vendor "ethz.systems.fpga" -version "1.0" 25 | } elseif {$command == "installip"} { 26 | file mkdir ${IPREPO_DIR} 27 | file delete -force ${IPREPO_DIR}/${PROJECT_NAME} 28 | file copy -force ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}_prj/solution1/impl/ip ${IPREPO_DIR}/${PROJECT_NAME}/ 29 | } else { 30 | puts "No valid command specified. Use vivado_hls -f make.tcl ." 
31 | } 32 | 33 | 34 | exit 35 | -------------------------------------------------------------------------------- /FPGA/common/utility/readme_gen/update_all_readme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script regenerates all of the README files in the Vitis example repository 3 | # An example with an auto-generated README file requires a description.json file 4 | # Only examples with a valid description.json file are updated by this script 5 | 6 | BASEDIR=$(pwd) 7 | 8 | dir_list=( $(git ls-files | grep 'description.json' | sed -r 's|/[^/]+$||' | sort | uniq )) 9 | 10 | echo ${dir_list[@]} 11 | echo $BASEDIR 12 | 13 | for i in "${dir_list[@]}" 14 | do 15 | cd $i 16 | if grep -qr '"match_readme": "false"' .; then 17 | echo "Ignoring README.md ::" $i 18 | else 19 | echo "Updating README for = $i" 20 | rm README.md 21 | fi 22 | make docs 23 | git add README.md 24 | cd $BASEDIR 25 | done 26 | 27 | summary_list=( $(git ls-files | grep 'summary.json' | sed -r 's|/[^/]+$||' | sort | uniq )) 28 | echo ${summary_list[@]} 29 | echo $BASEDIR 30 | for i in "${summary_list[@]}" 31 | do 32 | cd $i 33 | echo "Updating README for = $i" 34 | rm README.md 35 | make docs 36 | git add README.md 37 | cd $BASEDIR 38 | done 39 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/iperf_krnl/src/hls/make.tcl.in: -------------------------------------------------------------------------------- 1 | 2 | open_project ${PROJECT_NAME}_prj 3 | 4 | open_solution "solution1" 5 | set_part ${FPGA_PART} 6 | create_clock -period ${CLOCK_PERIOD} -name default 7 | 8 | set_top ${PROJECT_NAME} 9 | 10 | add_files ${CMAKE_CURRENT_SOURCE_DIR}/iperf_client.cpp -cflags "-I${CMAKE_CURRENT_BINARY_DIR}" 11 | 12 | 13 | add_files -tb ${CMAKE_CURRENT_SOURCE_DIR}/test_iperf_client.cpp 14 | 15 | 16 | #Check which command 17 | set command [lindex $argv 2] 18 | 19 | if {$command == "synthesis"} { 20 
| csynth_design 21 | } elseif {$command == "csim"} { 22 | csim_design 23 | } elseif {$command == "ip"} { 24 | export_design -format ip_catalog -ipname "iperf_client" -display_name "iperf client" -vendor "ethz.systems.fpga" -version "1.0" 25 | } elseif {$command == "installip"} { 26 | file mkdir ${IPREPO_DIR} 27 | file delete -force ${IPREPO_DIR}/${PROJECT_NAME} 28 | file copy -force ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}_prj/solution1/impl/ip ${IPREPO_DIR}/${PROJECT_NAME}/ 29 | } else { 30 | puts "No valid command specified. Use vivado_hls -f make.tcl ." 31 | } 32 | 33 | 34 | exit 35 | -------------------------------------------------------------------------------- /FPGA/config_rtl.mk: -------------------------------------------------------------------------------- 1 | VIVADO := $(XILINX_VIVADO)/bin/vivado 2 | $(TEMP_DIR)/${KRNL_1}.xo: kernel/network_krnl/network_krnl.xml scripts/package_network_krnl.tcl scripts/gen_xo.tcl kernel/network_krnl/src/hdl/*.sv 3 | mkdir -p $(TEMP_DIR) 4 | $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_1}.xo ${KRNL_1} $(TARGET) $(DEVICE) $(XSA) kernel/network_krnl/network_krnl.xml ./scripts/package_network_krnl.tcl 5 | 6 | $(TEMP_DIR)/${KRNL_2}.xo: kernel/user_krnl/${KRNL_2}/${KRNL_2}.xml scripts/package_${KRNL_2}.tcl scripts/gen_xo.tcl kernel/user_krnl/${KRNL_2}/src/hdl/*.sv 7 | mkdir -p $(TEMP_DIR) 8 | $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_2}.xo ${KRNL_2} $(TARGET) $(DEVICE) $(XSA) kernel/user_krnl/${KRNL_2}/${KRNL_2}.xml ./scripts/package_${KRNL_2}.tcl 9 | 10 | $(TEMP_DIR)/${KRNL_3}.xo: kernel/cmac_krnl/cmac_krnl.xml scripts/package_cmac_krnl.tcl scripts/gen_xo.tcl kernel/cmac_krnl/src/hdl/*.sv 11 | mkdir -p $(TEMP_DIR) 12 | $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_3}.xo ${KRNL_3} $(TARGET) $(DEVICE) $(XSA) kernel/cmac_krnl/cmac_krnl.xml ./scripts/package_cmac_krnl.tcl 13 | 
-------------------------------------------------------------------------------- /FPGA/common/utility/device_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # utility that creates a file that lists down the supported and unsupported devices 5 | # for each example 6 | # 7 | 8 | import glob 9 | import json 10 | import re 11 | import sys 12 | import os 13 | 14 | import os.path 15 | 16 | string = "" 17 | for dirpath, dirnames, filenames in os.walk("../../."): 18 | for filename in [f for f in filenames if (f.endswith("description.json") and f not in "../../common/.")]: 19 | 20 | f = open(os.path.join(dirpath, filename), "r+") 21 | listing = [] 22 | flag = 0 23 | name_flag = 0 24 | 25 | for txt in f: 26 | 27 | x = re.search(".*device\".*", txt) 28 | 29 | if (x): 30 | if(name_flag is 0): 31 | name_flag = 1 32 | string = string + "\n" + dirpath + "\n" 33 | 34 | if(',' not in txt): 35 | flag = 1 36 | 37 | string = string + txt 38 | continue 39 | 40 | if (flag): 41 | string = string + txt 42 | 43 | if(']' in txt): 44 | flag = 0 45 | f.close() 46 | 47 | g = open ("Data.txt", "w") 48 | g.write(string) 49 | g.close() 50 | 51 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/pthread_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include //Header file for sleep(). man 3 sleep for details. 
4 | #include 5 | 6 | struct Thread_info { 7 | int port; 8 | float* buffer; 9 | }; 10 | 11 | // A normal C function that is executed as a thread 12 | void *myThreadFun(void* vargp) 13 | { 14 | sleep(1); 15 | struct Thread_info* t_info = (struct Thread_info*) vargp; 16 | printf("Printing GeeksQuiz from Thread %d\n", t_info -> port); 17 | for (int i = 0; i < 32; i++) { 18 | printf("%f\t", t_info -> buffer[i]); 19 | } 20 | return NULL; 21 | } 22 | 23 | int main() 24 | { 25 | pthread_t thread_id; 26 | printf("Before Thread\n"); 27 | 28 | int port = 8080; 29 | float* buffer = malloc(128); 30 | for (int i = 0; i < 32; i++) { 31 | buffer[i] = i; 32 | } 33 | 34 | struct Thread_info t_info_0; 35 | t_info_0.port = port; 36 | t_info_0.buffer = buffer; 37 | 38 | pthread_create(&thread_id, NULL, myThreadFun, (void*) &t_info_0); 39 | // pthread_create(&thread_id, NULL, myThreadFun, NULL); 40 | pthread_join(thread_id, NULL); 41 | printf("After Thread\n"); 42 | exit(0); 43 | } -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/pthread_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include //Header file for sleep(). man 3 sleep for details. 
4 | #include 5 | 6 | struct Thread_info { 7 | int port; 8 | float* buffer; 9 | }; 10 | 11 | // A normal C function that is executed as a thread 12 | void *myThreadFun(void* vargp) 13 | { 14 | sleep(1); 15 | struct Thread_info* t_info = (struct Thread_info*) vargp; 16 | printf("Printing GeeksQuiz from Thread %d\n", t_info -> port); 17 | for (int i = 0; i < 32; i++) { 18 | printf("%f\t", t_info -> buffer[i]); 19 | } 20 | return NULL; 21 | } 22 | 23 | int main() 24 | { 25 | pthread_t thread_id; 26 | printf("Before Thread\n"); 27 | 28 | int port = 8080; 29 | float* buffer = malloc(128); 30 | for (int i = 0; i < 32; i++) { 31 | buffer[i] = i; 32 | } 33 | 34 | struct Thread_info t_info_0; 35 | t_info_0.port = port; 36 | t_info_0.buffer = buffer; 37 | 38 | pthread_create(&thread_id, NULL, myThreadFun, (void*) &t_info_0); 39 | // pthread_create(&thread_id, NULL, myThreadFun, NULL); 40 | pthread_join(thread_id, NULL); 41 | printf("After Thread\n"); 42 | exit(0); 43 | } -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/pthread_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include //Header file for sleep(). man 3 sleep for details. 
4 | #include 5 | 6 | struct Thread_info { 7 | int port; 8 | float* buffer; 9 | }; 10 | 11 | // A normal C function that is executed as a thread 12 | void *myThreadFun(void* vargp) 13 | { 14 | sleep(1); 15 | struct Thread_info* t_info = (struct Thread_info*) vargp; 16 | printf("Printing GeeksQuiz from Thread %d\n", t_info -> port); 17 | for (int i = 0; i < 32; i++) { 18 | printf("%f\t", t_info -> buffer[i]); 19 | } 20 | return NULL; 21 | } 22 | 23 | int main() 24 | { 25 | pthread_t thread_id; 26 | printf("Before Thread\n"); 27 | 28 | int port = 8080; 29 | float* buffer = malloc(128); 30 | for (int i = 0; i < 32; i++) { 31 | buffer[i] = i; 32 | } 33 | 34 | struct Thread_info t_info_0; 35 | t_info_0.port = port; 36 | t_info_0.buffer = buffer; 37 | 38 | pthread_create(&thread_id, NULL, myThreadFun, (void*) &t_info_0); 39 | // pthread_create(&thread_id, NULL, myThreadFun, NULL); 40 | pthread_join(thread_id, NULL); 41 | printf("After Thread\n"); 42 | exit(0); 43 | } -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/pthread_test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include //Header file for sleep(). man 3 sleep for details. 
4 | #include 5 | 6 | struct Thread_info { 7 | int port; 8 | float* buffer; 9 | }; 10 | 11 | // A normal C function that is executed as a thread 12 | void *myThreadFun(void* vargp) 13 | { 14 | sleep(1); 15 | struct Thread_info* t_info = (struct Thread_info*) vargp; 16 | printf("Printing GeeksQuiz from Thread %d\n", t_info -> port); 17 | for (int i = 0; i < 32; i++) { 18 | printf("%f\t", t_info -> buffer[i]); 19 | } 20 | return NULL; 21 | } 22 | 23 | int main() 24 | { 25 | pthread_t thread_id; 26 | printf("Before Thread\n"); 27 | 28 | int port = 8080; 29 | float* buffer = malloc(128); 30 | for (int i = 0; i < 32; i++) { 31 | buffer[i] = i; 32 | } 33 | 34 | struct Thread_info t_info_0; 35 | t_info_0.port = port; 36 | t_info_0.buffer = buffer; 37 | 38 | pthread_create(&thread_id, NULL, myThreadFun, (void*) &t_info_0); 39 | // pthread_create(&thread_id, NULL, myThreadFun, NULL); 40 | pthread_join(thread_id, NULL); 41 | printf("After Thread\n"); 42 | exit(0); 43 | } -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/iperf_krnl/config_sp_iperf_krnl.txt: -------------------------------------------------------------------------------- 1 | [connectivity] 2 | sp=network_krnl_1.m00_axi:HBM[30] 3 | sp=network_krnl_1.m01_axi:HBM[31] 4 | sc=network_krnl_1.m_axis_udp_rx:iperf_krnl_1.s_axis_udp_rx 5 | sc=network_krnl_1.m_axis_udp_rx_meta:iperf_krnl_1.s_axis_udp_rx_meta 6 | sc=network_krnl_1.m_axis_tcp_port_status:iperf_krnl_1.s_axis_tcp_port_status 7 | sc=network_krnl_1.m_axis_tcp_open_status:iperf_krnl_1.s_axis_tcp_open_status 8 | sc=network_krnl_1.m_axis_tcp_notification:iperf_krnl_1.s_axis_tcp_notification 9 | sc=network_krnl_1.m_axis_tcp_rx_meta:iperf_krnl_1.s_axis_tcp_rx_meta 10 | sc=network_krnl_1.m_axis_tcp_rx_data:iperf_krnl_1.s_axis_tcp_rx_data 11 | sc=network_krnl_1.m_axis_tcp_tx_status:iperf_krnl_1.s_axis_tcp_tx_status 12 | 13 | sc=iperf_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 14 | 
sc=iperf_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 15 | sc=iperf_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 16 | sc=iperf_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 17 | sc=iperf_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 18 | sc=iperf_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 19 | sc=iperf_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 20 | sc=iperf_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 21 | 22 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 23 | sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx 24 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/config_sp_scatter_krnl.txt: -------------------------------------------------------------------------------- 1 | [connectivity] 2 | sp=network_krnl_1.m00_axi:HBM[30] 3 | sp=network_krnl_1.m01_axi:HBM[31] 4 | sc=network_krnl_1.m_axis_udp_rx:scatter_krnl_1.s_axis_udp_rx 5 | sc=network_krnl_1.m_axis_udp_rx_meta:scatter_krnl_1.s_axis_udp_rx_meta 6 | sc=network_krnl_1.m_axis_tcp_port_status:scatter_krnl_1.s_axis_tcp_port_status 7 | sc=network_krnl_1.m_axis_tcp_open_status:scatter_krnl_1.s_axis_tcp_open_status 8 | sc=network_krnl_1.m_axis_tcp_notification:scatter_krnl_1.s_axis_tcp_notification 9 | sc=network_krnl_1.m_axis_tcp_rx_meta:scatter_krnl_1.s_axis_tcp_rx_meta 10 | sc=network_krnl_1.m_axis_tcp_rx_data:scatter_krnl_1.s_axis_tcp_rx_data 11 | sc=network_krnl_1.m_axis_tcp_tx_status:scatter_krnl_1.s_axis_tcp_tx_status 12 | 13 | sc=scatter_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 14 | sc=scatter_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 15 | sc=scatter_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 16 | sc=scatter_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 17 | 
sc=scatter_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 18 | sc=scatter_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 19 | sc=scatter_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 20 | sc=scatter_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 21 | 22 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 23 | sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx 24 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/hls_test_krnl/config_sp_hls_test_krnl.txt: -------------------------------------------------------------------------------- 1 | [connectivity] 2 | sp=network_krnl_1.m00_axi:HBM[30] 3 | sp=network_krnl_1.m01_axi:HBM[31] 4 | sc=network_krnl_1.m_axis_udp_rx:hls_test_krnl_1.s_axis_udp_rx 5 | sc=network_krnl_1.m_axis_udp_rx_meta:hls_test_krnl_1.s_axis_udp_rx_meta 6 | sc=network_krnl_1.m_axis_tcp_port_status:hls_test_krnl_1.s_axis_tcp_port_status 7 | sc=network_krnl_1.m_axis_tcp_open_status:hls_test_krnl_1.s_axis_tcp_open_status 8 | sc=network_krnl_1.m_axis_tcp_notification:hls_test_krnl_1.s_axis_tcp_notification 9 | sc=network_krnl_1.m_axis_tcp_rx_meta:hls_test_krnl_1.s_axis_tcp_rx_meta 10 | sc=network_krnl_1.m_axis_tcp_rx_data:hls_test_krnl_1.s_axis_tcp_rx_data 11 | sc=network_krnl_1.m_axis_tcp_tx_status:hls_test_krnl_1.s_axis_tcp_tx_status 12 | 13 | sc=hls_test_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 14 | sc=hls_test_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 15 | sc=hls_test_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 16 | sc=hls_test_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 17 | sc=hls_test_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 18 | sc=hls_test_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 19 | sc=hls_test_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 20 | 
sc=hls_test_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 21 | 22 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 23 | sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx -------------------------------------------------------------------------------- /FPGA/common/utility/build_what.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | HEAD= 4 | 5 | if [[ "$BRANCH_NAME" == "" ]]; then 6 | HEAD=remotes/origin/master 7 | else 8 | HEAD=remotes/origin/${BRANCH_NAME} 9 | fi 10 | 11 | PROJS=$(git ls-files | grep description.json | sed -e 's/\.\///' -e 's/\/description.json//') 12 | CHANGES=$(git diff --name-only $HEAD) 13 | 14 | howmany() { echo $#; } 15 | NUM_CHANGES=$(howmany $CHANGES) 16 | 17 | echo NUM_CHANGES=$NUM_CHANGES 18 | 19 | REBUILDS= 20 | for change in $CHANGES; do 21 | IN_PROJS= 22 | for proj in $PROJS; do 23 | if [[ "$change" == ${proj}* ]]; then 24 | IN_PROJS="$proj $IN_PROJS" 25 | fi 26 | done 27 | 28 | if [[ "$change" == */README.md 29 | || "$change" == "utility/build_what.sh" 30 | || "$change" == "Jenkinsfile" ]]; then 31 | echo "SKIPPING $change" 32 | NUM_CHANGES=$((NUM_CHANGES-1)) 33 | elif [[ "$IN_PROJS" != "" ]]; then 34 | echo "REBUILD $change" 35 | NUM_CHANGES=$((NUM_CHANGES-1)) 36 | REBUILDS="$IN_PROJS $REBUILDS" 37 | else 38 | echo "UNKNOWN $change" 39 | fi 40 | done 41 | 42 | UNIQ_REBUILDS=$(echo $REBUILDS | xargs -n 1 | sort -u | xargs) 43 | 44 | echo UNIQ_REBUILDS = $UNIQ_REBUILDS 45 | echo NUM_CHANGES = $NUM_CHANGES 46 | 47 | # if we know that we only changed something inside a single example then do a rebuild 48 | # of that example only else rebuild all examples. 
49 | cat /dev/null > examples.dat 50 | if [[ "$NUM_CHANGES" == "0" && "$UNIQ_REBUILDS" != "" ]]; then 51 | for rebuild in $UNIQ_REBUILDS; do 52 | echo $rebuild >> examples.dat 53 | done 54 | else 55 | for proj in $PROJS; do 56 | echo $proj >> examples.dat 57 | done 58 | fi 59 | -------------------------------------------------------------------------------- /FPGA/kernel/cmac_krnl/src/hdl/axis_data_reg_array.sv: -------------------------------------------------------------------------------- 1 | `include "network_types.svh" 2 | `include "network_intf.svh" 3 | 4 | module axis_data_reg_array #( 5 | parameter integer N_STAGES = 2 6 | ) ( 7 | input wire aclk, 8 | input wire aresetn, 9 | axi_stream.slave s_axis, 10 | axi_stream.master m_axis 11 | ); 12 | 13 | // ----------------------------------------------------------------------------------------------------------------------- 14 | // -- Register slices ---------------------------------------------------------------------------------------------------- 15 | // ----------------------------------------------------------------------------------------------------------------------- 16 | axi_stream axis_int [N_STAGES+1] (); 17 | 18 | always_comb begin 19 | axis_int[0].valid = s_axis.valid; 20 | axis_int[0].data = s_axis.data; 21 | axis_int[0].keep = s_axis.keep; 22 | axis_int[0].last = s_axis.last; 23 | s_axis.ready = axis_int[0].ready; 24 | 25 | m_axis.valid = axis_int[N_STAGES].valid; 26 | m_axis.data = axis_int[N_STAGES].data; 27 | m_axis.keep = axis_int[N_STAGES].keep; 28 | m_axis.last = axis_int[N_STAGES].last; 29 | axis_int[N_STAGES].ready = m_axis.ready; 30 | end 31 | 32 | for(genvar i = 0; i < N_STAGES; i++) begin 33 | axis_data_reg inst_reg (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_int[i]), .m_axis(axis_int[i+1])); 34 | end 35 | 36 | endmodule -------------------------------------------------------------------------------- /FPGA/kernel/network_krnl/src/hdl/axis_data_reg_array.sv: 
-------------------------------------------------------------------------------- 1 | `include "network_types.svh" 2 | `include "network_intf.svh" 3 | 4 | module axis_data_reg_array #( 5 | parameter integer N_STAGES = 2 6 | ) ( 7 | input wire aclk, 8 | input wire aresetn, 9 | axi_stream.slave s_axis, 10 | axi_stream.master m_axis 11 | ); 12 | 13 | // ----------------------------------------------------------------------------------------------------------------------- 14 | // -- Register slices ---------------------------------------------------------------------------------------------------- 15 | // ----------------------------------------------------------------------------------------------------------------------- 16 | axi_stream axis_int [N_STAGES+1] (); 17 | 18 | always_comb begin 19 | axis_int[0].valid = s_axis.valid; 20 | axis_int[0].data = s_axis.data; 21 | axis_int[0].keep = s_axis.keep; 22 | axis_int[0].last = s_axis.last; 23 | s_axis.ready = axis_int[0].ready; 24 | 25 | m_axis.valid = axis_int[N_STAGES].valid; 26 | m_axis.data = axis_int[N_STAGES].data; 27 | m_axis.keep = axis_int[N_STAGES].keep; 28 | m_axis.last = axis_int[N_STAGES].last; 29 | axis_int[N_STAGES].ready = m_axis.ready; 30 | end 31 | 32 | for(genvar i = 0; i < N_STAGES; i++) begin 33 | axis_data_reg inst_reg (.aclk(aclk), .aresetn(aresetn), .s_axis(axis_int[i]), .m_axis(axis_int[i+1])); 34 | end 35 | 36 | endmodule -------------------------------------------------------------------------------- /FPGA/common/utility/md2rst/update_md2rst_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | appDir=$(dirname $(dirname $(dirname $(readlink -f $0)))) 3 | 4 | echo "-----------------------" 5 | echo "-- UPDATING .md to .rst --" 6 | echo "-----------------------" 7 | 8 | update_file() { 9 | ignore=0 10 | 11 | for i in $IGNORE; do 12 | if [[ $1 =~ ^description.json ]]; then 13 | ignore=1 14 | fi 15 | done 16 | 17 | if [[ $VERBOSE == "true" ]]; 
then 18 | echo -n "Checking $1 ... " 19 | fi 20 | if [[ $ignore == 1 ]]; then 21 | if [[ $VERBOSE == "true" ]]; then 22 | echo "SKIP" 23 | fi 24 | else 25 | pushd . > /dev/null 26 | # Migrate to the example directory 27 | cd $(dirname $1) 28 | # Take out the name of the example 29 | b_name=$(basename $(dirname $1)) 30 | # Run the detailed .md generator 31 | $appDir/utility/md2rst/md2rst.py description.json #> /dev/null 2>&1 32 | # Run the .md to .rst file generator using correct path to pandoc 33 | $appDir/pandoc-2.7.3/bin/pandoc -f markdown D_README.md -t rst -o $b_name.rst 34 | # move the generated .rst to desired folder 35 | mv $b_name.rst $appDir/../../test/ 36 | # delete the detailed readme 37 | rm D_README.md 38 | # Locate the desired folder and run sphinx to generate html files from 39 | # rst files. Go live... 40 | popd >/dev/null 41 | fi 42 | } 43 | 44 | VCS_FILES=$(git ls-files) 45 | 46 | for f in $VCS_FILES; do 47 | if [[ ($f == */description.json) ]]; then 48 | if grep -q '"match_readme": "false"' $f; then 49 | echo $f 50 | echo "Readme Manually Edited:: Autofile Generator Failed" 51 | else 52 | echo $f 53 | update_file $(readlink -f $f) 54 | fi 55 | fi 56 | done 57 | -------------------------------------------------------------------------------- /FPGA/common/utility/check_license.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if LICENSE.txt is provided as an argument 4 | 5 | if [ $# -eq 0 ]; then 6 | echo "ERROR: LICENSE.txt not found" 7 | exit 1 8 | fi 9 | 10 | # Check if all source files have the correct license 11 | 12 | LICENSE=$1 13 | TYPES="c cpp h cl" 14 | IGNORE=$(cat .LICENSE_IGNORE.txt) 15 | 16 | LICENSE_LEN=$(cat $LICENSE | wc -l) 17 | 18 | echo "-------------------------------------" 19 | echo "-- CHECKING LICENSE of all $TYPES --" 20 | echo "-------------------------------------" 21 | echo "-- IGNORING --" 22 | echo "$IGNORE" 23 | echo "--------------" 24 | 25 | FAIL=0 
26 | 27 | check_file() { 28 | ignore=0 29 | 30 | for i in $IGNORE; do 31 | if [[ $1 =~ $i ]]; then 32 | ignore=1 33 | fi 34 | done 35 | 36 | if [[ $VERBOSE == "true" ]]; then 37 | echo -n "Checking $1 ... " 38 | fi 39 | if [[ $ignore == 1 ]]; then 40 | if [[ $VERBOSE == "true" ]]; then 41 | echo "SKIP" 42 | fi 43 | else 44 | diff $LICENSE <(head -n$LICENSE_LEN $1) 2>/dev/null 1>&2 45 | if [[ $? == 0 ]]; then 46 | if [[ $VERBOSE == "true" ]]; then 47 | echo "PASS" 48 | fi 49 | else 50 | if [[ $VERBOSE == "true" ]]; then 51 | echo "FAIL" 52 | diff $LICENSE <(head -n$LICENSE_LEN $1) 53 | else 54 | echo "$1" 55 | fi 56 | (( FAIL += 1 )) 57 | fi 58 | fi 59 | } 60 | 61 | 62 | VCS_FILES=$(git ls-files) 63 | 64 | for f in $VCS_FILES; do 65 | for t in $TYPES; do 66 | if [[ $f == *.$t ]]; then 67 | check_file $f 68 | fi 69 | done 70 | done 71 | 72 | if [[ $FAIL != 0 ]]; then 73 | echo "ERROR: License check failed" 74 | echo "ERROR: please fix the license in these files (or add to ignored if external)" 75 | fi 76 | 77 | exit $FAIL 78 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/constant.h: -------------------------------------------------------------------------------- 1 | // Input: (INPUT_FEATURE_LEN, BATCH_SIZE) 2 | // Layer1: W1 * INPUT + B1 3 | // -> W1 (HIDDEN_SIZE1, INPUT_FEATURE_LEN) 4 | // -> B1 (HIDDEN_SIZE1) 5 | // -> Result1 (HIDDEN_SIZE1, BATCH_SIZE) 6 | // Layer2: W2 * Result1 + B2 7 | // -> W2 (HIDDEN_SIZE2, HIDDEN_SIZE1) 8 | // -> B2 (HIDDEN_SIZE2) 9 | // -> Result2 (HIDDEN_SIZE2, BATCH_SIZE) 10 | // Layer3: W3 * Result2 + B3 11 | // -> W3 (HIDDEN_SIZE3, HIDDEN_SIZE2) 12 | // -> B3 (HIDDEN_SIZE3) 13 | // -> Result3 (HIDDEN_SIZE3, BATCH_SIZE) 14 | // Output Layer: W_OUT * Result3 + B_OUT 15 | // -> W_OUT (OUTPUT_FEATURE_LEN, HIDDEN_SIZE3) 16 | // -> B_OUT (OUTPUT_FEATURE_LEN) 17 | // -> Result_OUT (OUTPUT_FEATURE_LEN, BATCH_SIZE) 18 | 19 | ///////////// OPTION: small model 352 -> 512 
///////////// 20 | ///////////// OPTION: large model 880 -> 1024 ///////////// 21 | #define INPUT_FEATURE_LEN 880 22 | 23 | // BATCH_SIZE GIVEN IN constant.h, need to revisit the constant definition later 24 | #define HIDDEN_SIZE1 1024 25 | #define HIDDEN_SIZE2 512 26 | #define HIDDEN_SIZE3 256 27 | #define OUTPUT_FEATURE_LEN 1 28 | 29 | /* constraint: SHM_DATA_SIZE === 1 GB */ 30 | /* FLOAT_SIZE * BATCH_SIZE * INPUT_DIM * BATCH_NUM_PER_LOOP = 1024 **3 */ 31 | #define FLOAT_SIZE 4 32 | #define BATCH_SIZE 128 // 1024 33 | #define TOTAL_BATCH_NUM (2 * 1024 * 1024 / BATCH_SIZE) 34 | 35 | /* matrix (batch * input_dim): 1024 * 1024, float: 4 byte, 1024 batches in queue */ 36 | /* 4 GB in total, (1024 * 1024 * 4 * 1024) */ 37 | #define BLOCK_ENTRY_NUM (BATCH_SIZE * INPUT_FEATURE_LEN) 38 | #define BLOCK_SIZE (BLOCK_ENTRY_NUM * FLOAT_SIZE) 39 | 40 | #define PORT 8080 // 8080 41 | 42 | #define THREAD_NUM 4 // the number of sender / receiver threads 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GPU-FPGA-Recommendation-System 2 | The source code of our paper published at KDD 2021: [FleetRec: Large-Scale Recommendation Inference on Hybrid GPU-FPGA Clusters](https://www.research-collection.ethz.ch/bitstream/handle/20.500.11850/485153/1/FleetRec_camera_ready.pdf). 3 | 4 | 5 | 6 | There are two folders for the FPGA and GPU implementations. To build the FPGA design, refer to the README.md in the FPGA folder. The supported device is the Alveo U280, and we used an [open-source TCP/IP stack for Vitis](https://github.com/fpgasystems/Vitis_with_100Gbps_TCP-IP). The GPU implementation requires CUDA 11.0 or later and should support a wide range of GPU models. 7 | 8 | 9 | 10 | There are three experiments on different recommendation models. The FPGA kernels can be found [here](./FPGA/kernel/user_krnl), and the GPU kernels can be found [here](./GPU).
Each of the FPGA and GPU folders has its own README. 11 | 12 | 13 | 14 | ## Reference 15 | 16 | The paper corresponding to this repository: 17 | 18 | Jiang, W., He, Z., Zhang, S., Zeng, K., Feng, L., Zhang, J., ... & Alonso, G. (2021, August). FleetRec: Large-Scale Recommendation Inference on Hybrid GPU-FPGA Clusters. In *27th SIGKDD Conference on Knowledge Discovery and Data Mining (KDD 2021)*. 19 | 20 | The FPGA implementation is based on a previous paper: 21 | 22 | Jiang, W., He, Z., Zhang, S., Preußer, T. B., Zeng, K., Feng, L., ... & Alonso, G. (2021). MicroRec: efficient recommendation inference by hardware and data structure solutions. *Proceedings of Machine Learning and Systems*, *3*. 23 | 24 | The FPGA network stack we used: 25 | 26 | Zhenhao He, Dario Korolija, and Gustavo Alonso. 2021. EasyNet: 100 Gbps Network for HLS. In 2021 31st International Conference on Field Programmable Logic and Applications (FPL) -------------------------------------------------------------------------------- /FPGA/common/utility/Consolidation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # utility that creates a folder that contains the common folder and 5 | # all the files of the example 6 | # 7 | 8 | from sys import argv 9 | import re 10 | import sys 11 | import os 12 | import os.path 13 | 14 | path = os.getcwd() 15 | 16 | actual_folder = path 17 | folder_created = path + '_backup' 18 | route = argv[0].split('common') 19 | 20 | if(not os.path.isdir(folder_created)): 21 | cmd = 'cp -rf ' + actual_folder + ' ' + folder_created 22 | os.system(cmd) 23 | os.chmod(folder_created, 0o777) 24 | 25 | f = open(folder_created + '/Makefile', "r+") 26 | 27 | string = "" 28 | listing = ['opencl'] 29 | 30 | for txt in f: 31 | 32 | x = re.search("^COMMON_REPO =.*", txt) 33 | 34 | if (x): 35 | txt = "COMMON_REPO = ./\n" 36 | 37 | string = string + txt 38 | 39 | f.close() 40 | 41 | f = open(folder_created +
'/description.json', "r+") 42 | 43 | flag = 0 44 | for txt in f: 45 | if "\"includepaths\"" in txt: 46 | flag = 1 47 | continue 48 | 49 | if (flag == 1): 50 | if (']' in txt or '}' in txt): 51 | break 52 | else: 53 | listing.append(txt[txt.find("includes/") + 9 : txt.rfind('\"')]) 54 | 55 | f.close() 56 | 57 | commonfolders = route[0] + "common/includes/" 58 | 59 | for foldername in os.listdir(commonfolders): 60 | if foldername in listing: 61 | cmd1 = 'mkdir -p ' + folder_created + '/common/includes/' + foldername 62 | cmd2 = 'cp -rf ' + commonfolders + '/' + foldername + '/* ' + folder_created + '/common/includes/' + foldername 63 | os.system(cmd1) 64 | os.system(cmd2) 65 | 66 | g = open(folder_created + '/Makefile', "w") 67 | g.write(string) 68 | g.close() 69 | 70 | print ("The new folder's location is %s" % folder_created) 71 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/iperf_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. 
Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/hls_test_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. 
Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_377_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_47_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_98_krnl/src/hls/mem_utils.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. 
Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef MEM_UTILS_HPP 28 | #define MEM_UTILS_HPP 29 | 30 | 31 | struct memCmd 32 | { 33 | ap_uint<64> addr; 34 | ap_uint<32> len; 35 | memCmd() {} 36 | memCmd(ap_uint<64> addr, ap_uint<32> len) 37 | :addr(addr), len(len) {} 38 | }; 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /FPGA/common/utility/check_readme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if all examples have correct Readme 4 | 5 | echo "-----------------------" 6 | echo "-- CHECKING READMEs --" 7 | echo "-----------------------" 8 | 9 | FAIL=0 10 | 11 | check_file() { 12 | ignore=0 13 | 14 | for i in $IGNORE; do 15 | if [[ $1 =~ ^description.json ]]; then 16 | ignore=1 17 | fi 18 | done 19 | 20 | if [[ $VERBOSE == "true" ]]; then 21 | echo -n "Checking $1 ... 
" 22 | fi 23 | if [[ $ignore == 1 ]]; then 24 | if [[ $VERBOSE == "true" ]]; then 25 | echo "SKIP" 26 | fi 27 | else 28 | pushd . > /dev/null 29 | cd $(dirname $1) 30 | mv README.md README.md.check > /dev/null 2>&1 31 | make README.md 2>/dev/null 1>&2 32 | rc=$? 33 | if [[ $2 != "false" ]]; then 34 | diff README.md README.md.check 2>/dev/null 1>&2 35 | if [[ $rc == 0 && $? == 0 ]]; then 36 | if [[ $VERBOSE == "true" ]]; then 37 | echo "PASS" 38 | fi 39 | else 40 | if [[ $VERBOSE == "true" ]]; then 41 | echo "FAIL" 42 | diff README.md README.md.check 43 | else 44 | echo "$1" 45 | fi 46 | (( FAIL += 1 )) 47 | fi 48 | fi 49 | 50 | mv README.md.check README.md > /dev/null 2>&1 51 | popd >/dev/null 52 | fi 53 | } 54 | 55 | 56 | VCS_FILES=$(git ls-files) 57 | 58 | for f in $VCS_FILES; do 59 | CHECK_MATCH=true 60 | if [[ ($f == */description.json) || ($f == */summary.json) ]]; then 61 | if grep -q '"match_readme": "false"' $f; then 62 | CHECK_MATCH=false 63 | echo "Ignoring README.md ::" $f 64 | fi 65 | check_file $f $CHECK_MATCH 66 | fi 67 | done 68 | 69 | if [[ $FAIL != 0 ]]; then 70 | echo "ERROR: Readme check failed" 71 | echo "ERROR: please fix the README.md in these files" 72 | fi 73 | 74 | exit $FAIL 75 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/constant.h: -------------------------------------------------------------------------------- 1 | // Input: (INPUT_FEATURE_LEN, BATCH_SIZE) 2 | // Layer1: W1 * INPUT + B1 3 | // -> W1 (HIDDEN_SIZE1, INPUT_FEATURE_LEN) 4 | // -> B1 (HIDDEN_SIZE1) 5 | // -> Result1 (HIDDEN_SIZE1, BATCH_SIZE) 6 | // Layer2: W2 * Result1 + B2 7 | // -> W2 (HIDDEN_SIZE2, HIDDEN_SIZE1) 8 | // -> B2 (HIDDEN_SIZE2) 9 | // -> Result2 (HIDDEN_SIZE2, BATCH_SIZE) 10 | // Layer3: W3 * Result2 + B3 11 | // -> W3 (HIDDEN_SIZE3, HIDDEN_SIZE2) 12 | // -> B3 (HIDDEN_SIZE3) 13 | // -> Result3 (HIDDEN_SIZE3, BATCH_SIZE) 14 | // Output Layer: W_OUT * Result3 + B_OUT 15 | // 
-> W_OUT (OUTPUT_FEATURE_LEN, HIDDEN_SIZE3) 16 | // -> B_OUT (OUTPUT_FEATURE_LEN) 17 | // -> Result_OUT (OUTPUT_FEATURE_LEN, BATCH_SIZE) 18 | 19 | ///////////// OPTION: small model 384 -> 512 ///////////// 20 | ///////////// OPTION: large model 876 -> 1024 ///////////// 21 | #define INPUT_FEATURE_LEN 880 22 | 23 | // BATCH_SIZE GIVEN IN constant.h, need to revisit the constant definition later 24 | #define HIDDEN_SIZE1 1024 25 | #define HIDDEN_SIZE2 512 26 | #define HIDDEN_SIZE3 256 27 | #define OUTPUT_FEATURE_LEN 1 28 | 29 | /* constraint: SHM_DATA_SIZE === 1 GB */ 30 | /* FLOAT_SIZE * BATCH_SIZE * INPUT_DIM * BATCH_NUM_PER_LOOP = 1024 **3 */ 31 | #define FLOAT_SIZE 4 32 | #define BATCH_SIZE 256 // 1024 33 | #define BATCH_NUM_PER_LOOP (1024 * 256 / BATCH_SIZE) // -> should be renamed as FIFO_BATCH_NUM 34 | 35 | // LOOP = number of GBs to perform 36 | #define LOOP_NUM 1 37 | 38 | #define TOTAL_BATCH_NUM (BATCH_NUM_PER_LOOP * LOOP_NUM) 39 | 40 | /* matrix (batch * input_dim): 1024 * 1024, float: 4 byte, 1024 batches in queue */ 41 | /* 4 GB in total, (1024 * 1024 * 4 * 1024) */ 42 | #define BLOCK_ENTRY_NUM (BATCH_SIZE * INPUT_FEATURE_LEN) 43 | #define BLOCK_SIZE (BLOCK_ENTRY_NUM * FLOAT_SIZE) 44 | // maximum shared memory size: 1 GB 45 | #define SHM_DATA_SIZE (BLOCK_SIZE * BATCH_NUM_PER_LOOP) 46 | 47 | #define SHM_CONTROL_SIZE 1024 48 | 49 | #define STREAM_NUM 4 // Stream 0 port: PORT, Stream 1 port: PORT + 1, ...
50 | 51 | #define PORT 8080 -------------------------------------------------------------------------------- /FPGA/config_hls.mk: -------------------------------------------------------------------------------- 1 | # $(TEMP_DIR)/${KRNL_NAME_2}.xo: kernel/user_krnl/src/C/*.cpp 2 | # mkdir -p $(TEMP_DIR) 3 | # $(VPP) $(CLFLAGS) -c -k ${KRNL_NAME_2} -o $(TEMP_DIR)/${KRNL_NAME_2}.xo --input_files kernel/user_krnl/src/C/*.cpp 4 | 5 | # VIVADO := $(XILINX_VIVADO)/bin/vivado 6 | # $(TEMP_DIR)/${KRNL_NAME_1}.xo: kernel/network_krnl.xml scripts/package_network_krnl.tcl scripts/gen_xo.tcl kernel/network_krnl/src/hdl/*.sv 7 | # mkdir -p $(TEMP_DIR) 8 | # $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_NAME_1}.xo ${KRNL_NAME_1} $(TARGET) $(DEVICE) $(XSA) 9 | 10 | # # $(TEMP_DIR)/${KRNL_NAME_2}.xo: kernel/user_krnl.xml scripts/package_user_krnl.tcl scripts/gen_xo.tcl kernel/user_krnl/src/hdl/*.sv 11 | # # mkdir -p $(TEMP_DIR) 12 | # # $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_NAME_2}.xo ${KRNL_NAME_2} $(TARGET) $(DEVICE) $(XSA) 13 | 14 | 15 | # $(TEMP_DIR)/${KRNL_NAME_3}.xo: kernel/cmac_krnl.xml scripts/package_cmac_krnl.tcl scripts/gen_xo.tcl kernel/cmac_krnl/src/hdl/*.sv 16 | # mkdir -p $(TEMP_DIR) 17 | # $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_NAME_3}.xo ${KRNL_NAME_3} $(TARGET) $(DEVICE) $(XSA) 18 | 19 | 20 | VIVADO := $(XILINX_VIVADO)/bin/vivado 21 | $(TEMP_DIR)/${KRNL_1}.xo: kernel/network_krnl/network_krnl.xml scripts/package_network_krnl.tcl scripts/gen_xo.tcl kernel/network_krnl/src/hdl/*.sv 22 | mkdir -p $(TEMP_DIR) 23 | $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_1}.xo ${KRNL_1} $(TARGET) $(DEVICE) $(XSA) kernel/network_krnl/network_krnl.xml ./scripts/package_network_krnl.tcl 24 | 25 | $(TEMP_DIR)/${KRNL_2}.xo: kernel/user_krnl/${KRNL_2}/src/hls/*.cpp 26 | mkdir -p $(TEMP_DIR) 27 | $(VPP) $(CLFLAGS) -c -k ${KRNL_2} -o 
$(TEMP_DIR)/${KRNL_2}.xo --input_files kernel/user_krnl/${KRNL_2}/src/hls/*.cpp 28 | 29 | 30 | $(TEMP_DIR)/${KRNL_3}.xo: kernel/cmac_krnl/cmac_krnl.xml scripts/package_cmac_krnl.tcl scripts/gen_xo.tcl kernel/cmac_krnl/src/hdl/*.sv 31 | mkdir -p $(TEMP_DIR) 32 | $(VIVADO) -mode batch -source scripts/gen_xo.tcl -tclargs $(TEMP_DIR)/${KRNL_3}.xo ${KRNL_3} $(TARGET) $(DEVICE) $(XSA) kernel/cmac_krnl/cmac_krnl.xml ./scripts/package_cmac_krnl.tcl 33 | -------------------------------------------------------------------------------- /FPGA/kernel/network_krnl/src/hdl/axis_udp_meta_reg.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | `timescale 1ns / 1ps 28 | `default_nettype none 29 | 30 | // 512 axi stream register slice 31 | module axis_udp_meta_reg ( 32 | input wire aclk, 33 | input wire aresetn, 34 | axis_meta.slave s_axis, 35 | axis_meta.master m_axis 36 | ); 37 | 38 | axis_register_slice_176 slice_inst( 39 | .aclk(aclk), 40 | .aresetn(aresetn), 41 | .s_axis_tvalid(s_axis.valid), 42 | .s_axis_tready(s_axis.ready), 43 | .s_axis_tdata(s_axis.data), 44 | .m_axis_tvalid(m_axis.valid), 45 | .m_axis_tready(m_axis.ready), 46 | .m_axis_tdata(m_axis.data) 47 | ); 48 | 49 | endmodule 50 | `default_nettype wire 51 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/README.md: -------------------------------------------------------------------------------- 1 | # Programs 2 | 3 | First start Terminal 1: run cuda_server first (./run_cuda_server.sh) 4 | 5 | Then start Terminal 2: run client to send data (./run_client_sender.sh) -> Remember to Adjust the Server IP Address 6 | 7 | Correct Results: 8 | 9 | Input Feature Size = 512 -> 68719476736 10 | 11 | Input Feature Size = 1024 -> 137438953472 12 | 13 | ## cuda_server.c 14 | 15 | rm cuda_server 16 | 17 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 18 | 19 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server 20 | 21 | ## multiple_connections_network_client_sender.c 22 | 23 | This 
program simulates an FPGA that opens 4 connections and sends data to the CUDA server. 24 | 25 | Start CUDA server first, then client. 26 | 27 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 28 | 29 | ./multiple_connections_network_client_sender 30 | 31 | 32 | # Other programs (for building the final version) 33 | 34 | ## pthread_test.c 35 | 36 | https://www.geeksforgeeks.org/multithreading-c-2/ 37 | 38 | Pass port info and memory address space to the thread as a structure, and execute that thread. 39 | 40 | gcc pthread_test.c -lpthread 41 | 42 | ./a.out 43 | 44 | ## single_connection_network_server_receiver.c 45 | 46 | Start server first, then client. 47 | 48 | gcc single_connection_network_server_receiver.c -lpthread -o single_connection_network_server_receiver 49 | 50 | ./single_connection_network_server_receiver 51 | 52 | ## single_connection_network_client_sender.c 53 | 54 | Start server first, then client. 55 | 56 | gcc single_connection_network_client_sender.c -lpthread -o single_connection_network_client_sender 57 | 58 | ./single_connection_network_client_sender 59 | 60 | 61 | ## multiple_connections_network_server_receiver.c 62 | 63 | Start server first, then client. 64 | 65 | 4 TCP connections. 66 | 67 | gcc multiple_connections_network_server_receiver.c -lpthread -o multiple_connections_network_server_receiver 68 | 69 | ./multiple_connections_network_server_receiver 70 | 71 | ## multiple_connections_network_client_sender.c 72 | 73 | Start server first, then client.
74 | 75 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 76 | 77 | ./multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/README.md: -------------------------------------------------------------------------------- 1 | # Programs 2 | 3 | First, in Terminal 1, run cuda_server (./run_cuda_server.sh) 4 | 5 | Then, in Terminal 2, run the client to send data (./run_client_sender.sh) -> remember to adjust the server IP address 6 | 7 | Correct Results: 8 | 9 | Input Feature Size = 512 -> 68719476736 10 | 11 | Input Feature Size = 1024 -> 137438953472 12 | 13 | ## cuda_server.c 14 | 15 | rm cuda_server 16 | 17 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 18 | 19 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server 20 | 21 | ## multiple_connections_network_client_sender.c 22 | 23 | This program simulates an FPGA that opens 4 connections and sends data to the CUDA server. 24 | 25 | Start CUDA server first, then client. 26 | 27 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 28 | 29 | ./multiple_connections_network_client_sender 30 | 31 | 32 | # Other programs (for building the final version) 33 | 34 | ## pthread_test.c 35 | 36 | https://www.geeksforgeeks.org/multithreading-c-2/ 37 | 38 | Pass port info and memory address space to the thread as a structure, and execute that thread. 39 | 40 | gcc pthread_test.c -lpthread 41 | 42 | ./a.out 43 | 44 | ## single_connection_network_server_receiver.c 45 | 46 | Start server first, then client. 47 | 48 | gcc single_connection_network_server_receiver.c -lpthread -o single_connection_network_server_receiver 49 | 50 | ./single_connection_network_server_receiver 51 | 52 | ## single_connection_network_client_sender.c 53 | 54 | Start server first, then client.
55 | 56 | gcc single_connection_network_client_sender.c -lpthread -o single_connection_network_client_sender 57 | 58 | ./single_connection_network_client_sender 59 | 60 | 61 | ## multiple_connections_network_server_receiver.c 62 | 63 | Start server first, then client. 64 | 65 | 4 TCP connections. 66 | 67 | gcc multiple_connections_network_server_receiver.c -lpthread -o multiple_connections_network_server_receiver 68 | 69 | ./multiple_connections_network_server_receiver 70 | 71 | ## multiple_connections_network_client_sender.c 72 | 73 | Start server first, then client. 74 | 75 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 76 | 77 | ./multiple_connections_network_client_sender -------------------------------------------------------------------------------- /FPGA/common/utility/check_makefile.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if all examples have correct Makefiles 4 | 5 | echo "-----------------------" 6 | echo "-- CHECKING MAKEFILES --" 7 | echo "-----------------------" 8 | 9 | FAIL=0 10 | 11 | check_file() { 12 | ignore=0 13 | 14 | for i in $IGNORE; do 15 | if [[ $1 =~ ^description.json ]]; then 16 | ignore=1 17 | fi 18 | done 19 | 20 | if [[ $VERBOSE == "true" ]]; then 21 | echo -n "Checking $1 ... " 22 | fi 23 | if [[ $ignore == 1 ]]; then 24 | if [[ $VERBOSE == "true" ]]; then 25 | echo "SKIP" 26 | fi 27 | else 28 | pushd . > /dev/null 29 | jsonDir=$(dirname $(readlink -f $1)) 30 | cd $jsonDir 31 | mv Makefile Makefile.check > /dev/null 2>&1 32 | mv utils.mk utils.mk.check > /dev/null 2>&1 33 | $utilityDir/makefile_gen/makegen.py $1 > /dev/null 2>&1 34 | rc=$? 35 | diff Makefile Makefile.check 2>/dev/null 1>&2 36 | if [[ $rc == 0 && $? 
== 0 ]]; then 37 | #echo 'pass file' 38 | if [[ $VERBOSE == "true" ]]; then 39 | echo "PASS" 40 | fi 41 | else 42 | if [[ $VERBOSE == "true" ]]; then 43 | echo "FAIL" 44 | diff Makefile Makefile.check 45 | else 46 | echo "$1" 47 | fi 48 | (( FAIL += 1 )) 49 | fi 50 | mv Makefile.check Makefile > /dev/null 2>&1 51 | 52 | diff utils.mk utils.mk.check 2>/dev/null 1>&2 53 | if [[ $rc == 0 && $? == 0 ]]; then 54 | #echo 'pass file' 55 | if [[ $VERBOSE == "true" ]]; then 56 | echo "PASS" 57 | fi 58 | else 59 | if [[ $VERBOSE == "true" ]]; then 60 | echo "FAIL" 61 | diff utils.mk utils.mk.check 62 | else 63 | echo "$1" 64 | fi 65 | (( FAIL += 1 )) 66 | fi 67 | mv utils.mk.check utils.mk > /dev/null 2>&1 68 | popd >/dev/null 69 | fi 70 | } 71 | 72 | utilityDir=$(dirname $(readlink -f $0)) 73 | cd $utilityDir 74 | cd .. 75 | VCS_FILES=$(git ls-files) 76 | 77 | for f in $VCS_FILES; do 78 | if [[ ($f == */description.json) ]]; then 79 | if grep -q '"match_ini": "false"' $f; then 80 | echo "Manually Edited ini File ::" $f 81 | fi 82 | if grep -q '"match_makefile": "false"' $f; then 83 | echo "Ignoring ::" $f 84 | else 85 | check_file $(readlink -f $f) 86 | fi 87 | fi 88 | done 89 | 90 | if [[ $FAIL != 0 ]]; then 91 | echo "ERROR: Makefile check failed" 92 | echo "ERROR: please fix the makefile in these files" 93 | fi 94 | 95 | exit $FAIL 96 | -------------------------------------------------------------------------------- /FPGA/common/utility/check_descr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # 4 | # utility that lists down 5 | # all the examples with respective unnecessary keywords mentioned 6 | # 7 | 8 | import glob 9 | import json 10 | import re 11 | import sys 12 | import os 13 | 14 | import os.path 15 | 16 | for dirpath, dirnames, filenames in os.walk("../.././"): 17 | for filename in [f for f in filenames if (f.endswith("description.json") and f not in "../../common/.")]: 18 | 19 | f = 
open(os.path.join(dirpath, filename), "r+") 20 | flag = 0 21 | t = 0 22 | string_check = "" 23 | 24 | for txt in f: 25 | if ("keywords" in txt and flag == 0): 26 | flag = 1 27 | continue 28 | 29 | if (flag): 30 | if('}' in txt or ']' in txt): 31 | break 32 | 33 | else: 34 | c_list = txt.split("\"") 35 | check_flag = 0 36 | for c_dirpath, c_dirnames, c_filenames in os.walk(os.path.join(dirpath)): 37 | for check_filename in [c_f for c_f in c_filenames if (not (c_f.endswith(".md") or c_f.endswith("description.json")))]: 38 | c_f = open(os.path.join(c_dirpath, check_filename), "rb+") 39 | 40 | for check_txt in c_f: 41 | if (c_list[1].encode('utf-8') in check_txt): 42 | check_flag = 1 43 | break 44 | 45 | c_f.close() 46 | 47 | if (check_flag is 0): 48 | string_check = string_check + txt 49 | t = 1 50 | 51 | if (t): 52 | print(os.path.join(dirpath)) 53 | print(string_check) 54 | 55 | f.close() 56 | -------------------------------------------------------------------------------- /FPGA/scripts/network_ultrascale.tcl: -------------------------------------------------------------------------------- 1 | create_ip -name fifo_generator -vendor xilinx.com -library ip -version 13.2 -module_name axis_sync_fifo 2 | set_property -dict [list CONFIG.INTERFACE_TYPE {AXI_STREAM} CONFIG.FIFO_Implementation_axis {Common_Clock_Block_RAM} CONFIG.TDATA_NUM_BYTES {8} CONFIG.TUSER_WIDTH {0} CONFIG.Enable_TLAST {true} CONFIG.HAS_TKEEP {true} CONFIG.Enable_Data_Counts_axis {true} CONFIG.Reset_Type {Asynchronous_Reset} CONFIG.Full_Flags_Reset_Value {1} CONFIG.TSTRB_WIDTH {8} CONFIG.TKEEP_WIDTH {8} CONFIG.FIFO_Implementation_wach {Common_Clock_Distributed_RAM} CONFIG.Full_Threshold_Assert_Value_wach {15} CONFIG.Empty_Threshold_Assert_Value_wach {14} CONFIG.FIFO_Implementation_wrch {Common_Clock_Distributed_RAM} CONFIG.Full_Threshold_Assert_Value_wrch {15} CONFIG.Empty_Threshold_Assert_Value_wrch {14} CONFIG.FIFO_Implementation_rach {Common_Clock_Distributed_RAM} 
CONFIG.Full_Threshold_Assert_Value_rach {15} CONFIG.Empty_Threshold_Assert_Value_rach {14}] [get_ips axis_sync_fifo] 3 | 4 | create_ip -name fifo_generator -vendor xilinx.com -library ip -version 13.2 -module_name cmd_fifo_xgemac_rxif 5 | set_property -dict [list CONFIG.Fifo_Implementation {Common_Clock_Block_RAM} CONFIG.Input_Data_Width {16} CONFIG.Output_Data_Width {16} CONFIG.Reset_Type {Asynchronous_Reset} CONFIG.Full_Flags_Reset_Value {1} CONFIG.Use_Embedded_Registers {false} CONFIG.Full_Threshold_Assert_Value {1022} CONFIG.Full_Threshold_Negate_Value {1021} CONFIG.Enable_Safety_Circuit {false}] [get_ips cmd_fifo_xgemac_rxif] 6 | 7 | create_ip -name fifo_generator -vendor xilinx.com -library ip -version 13.2 -module_name cmd_fifo_xgemac_txif 8 | set_property -dict [list CONFIG.Fifo_Implementation {Common_Clock_Block_RAM} CONFIG.Input_Data_Width {1} CONFIG.Output_Data_Width {1} CONFIG.Reset_Type {Asynchronous_Reset} CONFIG.Full_Flags_Reset_Value {1} CONFIG.Full_Threshold_Assert_Value {1022} CONFIG.Full_Threshold_Negate_Value {1021} CONFIG.Enable_Safety_Circuit {false}] [get_ips cmd_fifo_xgemac_txif] 9 | 10 | #create_ip -name ethernet_frame_padding -vendor ethz.systems.fpga -library hls -version 0.1 -module_name ethernet_frame_padding_ip 11 | 12 | create_ip -name axis_data_fifo -vendor xilinx.com -library ip -version 2.0 -module_name axis_pkg_fifo_512 13 | set_property -dict [list CONFIG.TDATA_NUM_BYTES {64} CONFIG.FIFO_MODE {2} CONFIG.HAS_TKEEP {1} CONFIG.HAS_TLAST {1} CONFIG.Component_Name {axis_pkg_fifo_512}] [get_ips axis_pkg_fifo_512] 14 | 15 | 16 | -------------------------------------------------------------------------------- /FPGA/common/includes/oclHelper/oclHelper.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | **********/ 29 | #ifndef _OCL_HELP_H_ 30 | #define _OCL_HELP_H_ 31 | 32 | #include 33 | 34 | struct oclHardware { 35 | cl_platform_id mPlatform; 36 | cl_context mContext; 37 | cl_device_id mDevice; 38 | cl_command_queue mQueue; 39 | short mMajorVersion; 40 | short mMinorVersion; 41 | }; 42 | 43 | struct oclSoftware { 44 | cl_program mProgram; 45 | cl_kernel mKernel; 46 | char mKernelName[128]; 47 | char mFileName[1024]; 48 | char mCompileOptions[1024]; 49 | }; 50 | 51 | oclHardware getOclHardware(cl_device_type type); 52 | 53 | int getOclSoftware(oclSoftware &software, const oclHardware &hardware); 54 | 55 | void release(oclSoftware& software); 56 | 57 | void release(oclHardware& hardware); 58 | 59 | const char *oclErrorCode(cl_int code); 60 | 61 | #endif 62 | -------------------------------------------------------------------------------- /FPGA/kernel/cmac_krnl/src/hdl/axis_data_reg.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 
16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | `timescale 1ns / 1ps 28 | `default_nettype none 29 | 30 | // 512 axi stream register slice 31 | module axis_data_reg ( 32 | input wire aclk, 33 | input wire aresetn, 34 | axi_stream.slave s_axis, 35 | axi_stream.master m_axis 36 | ); 37 | 38 | axis_register_slice_512 slice_inst( 39 | .aclk(aclk), 40 | .aresetn(aresetn), 41 | .s_axis_tvalid(s_axis.valid), 42 | .s_axis_tready(s_axis.ready), 43 | .s_axis_tdata(s_axis.data), 44 | .s_axis_tkeep(s_axis.keep), 45 | .s_axis_tlast(s_axis.last), 46 | .m_axis_tvalid(m_axis.valid), 47 | .m_axis_tready(m_axis.ready), 48 | .m_axis_tdata(m_axis.data), 49 | .m_axis_tkeep(m_axis.keep), 50 | .m_axis_tlast(m_axis.last) 51 | ); 52 | 53 | endmodule 54 | `default_nettype wire 55 | -------------------------------------------------------------------------------- /FPGA/kernel/network_krnl/src/hdl/axis_data_reg.sv: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 
4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | `timescale 1ns / 1ps 28 | `default_nettype none 29 | 30 | // 512 axi stream register slice 31 | module axis_data_reg ( 32 | input wire aclk, 33 | input wire aresetn, 34 | axi_stream.slave s_axis, 35 | axi_stream.master m_axis 36 | ); 37 | 38 | axis_register_slice_512 slice_inst( 39 | .aclk(aclk), 40 | .aresetn(aresetn), 41 | .s_axis_tvalid(s_axis.valid), 42 | .s_axis_tready(s_axis.ready), 43 | .s_axis_tdata(s_axis.data), 44 | .s_axis_tkeep(s_axis.keep), 45 | .s_axis_tlast(s_axis.last), 46 | .m_axis_tvalid(m_axis.valid), 47 | .m_axis_tready(m_axis.ready), 48 | .m_axis_tdata(m_axis.data), 49 | .m_axis_tkeep(m_axis.keep), 50 | .m_axis_tlast(m_axis.last) 51 | ); 52 | 53 | endmodule 54 | `default_nettype wire 55 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/constant.h: -------------------------------------------------------------------------------- 1 | // Input: (INPUT_FEATURE_LEN, BATCH_SIZE) 2 | // Layer1: W1 * INPUT + B1 3 | // -> W1 (HIDDEN_SIZE1, INPUT_FEATURE_LEN) 4 | // -> B1 (HIDDEN_SIZE1) 5 | // -> Result1 (HIDDEN_SIZE1, BATCH_SIZE) 6 | // Layer2: W2 * Result1 + B2 7 | // -> W2 (HIDDEN_SIZE2, HIDDEN_SIZE1) 8 | // -> B2 (HIDDEN_SIZE2) 9 | // -> Result2 (HIDDEN_SIZE2, BATCH_SIZE) 10 | // Layer3: W3 * Result2 + B3 11 | // -> W3 (HIDDEN_SIZE3, HIDDEN_SIZE2) 12 | // -> B3 (HIDDEN_SIZE3) 13 | // -> Result3 (HIDDEN_SIZE3, BATCH_SIZE) 14 | // Output Layer: W_OUT * Result3 + B_OUT 15 | // -> W_OUT (OUTPUT_FEATURE_LEN, HIDDEN_SIZE3) 16 | // -> B_OUT (OUTPUT_FEATURE_LEN) 17 | // -> Output (OUTPUT_FEATURE_LEN, BATCH_SIZE) 18 | 19 | ///////////// OPTION: small model 384 -> 512 ///////////// 20 | ///////////// OPTION: large model 876 -> 1024 ///////////// 21 | 22 | /// TODO: CHANGE THIS 23 | // #define INPUT_FEATURE_LEN 1024 24 | 25 | #define INPUT_FEATURE_LEN_RECEIVER 3968 26 | #define INPUT_FEATURE_LEN_FPGA_SENDER 1952 27 | #define INPUT_FEATURE_LEN_CPU_SENDER 64 28 | 29 | //
BATCH_SIZE GIVEN IN constant.h, need to revisit the constant definition later 30 | #define HIDDEN_SIZE1 2048 // 1024 31 | #define HIDDEN_SIZE2 512 32 | #define HIDDEN_SIZE3 256 33 | #define OUTPUT_FEATURE_LEN 1 34 | 35 | /* constraint: SHM_DATA_SIZE === 1 GB */ 36 | /* FLOAT_SIZE * BATCH_SIZE * INPUT_DIM * BATCH_NUM_PER_LOOP = 1024 **3 */ 37 | #define FLOAT_SIZE 4 38 | #define BATCH_SIZE 1024 // 1024 39 | #define TOTAL_BATCH_NUM (1 * 1024 * 1024 / BATCH_SIZE) 40 | 41 | // #define BATCH_NUM_PER_THREAD (TOTAL_BATCH_NUM / THREAD_NUM) 42 | 43 | /* matrix (batch * input_dim): 1024 * 1024, float: 4 byte, 1024 batches in queue */ 44 | /* 4 GB in total, (1024 * 1024 * 4 * 1024) */ 45 | 46 | #define BLOCK_ENTRY_NUM_RECEIVER (BATCH_SIZE * INPUT_FEATURE_LEN_RECEIVER) 47 | #define BLOCK_SIZE_RECEIVER (BLOCK_ENTRY_NUM_RECEIVER * FLOAT_SIZE) 48 | 49 | #define BLOCK_ENTRY_NUM_FPGA_SENDER (BATCH_SIZE * INPUT_FEATURE_LEN_FPGA_SENDER) 50 | #define BLOCK_SIZE_FPGA_SENDER (BLOCK_ENTRY_NUM_FPGA_SENDER * FLOAT_SIZE) 51 | 52 | #define BLOCK_ENTRY_NUM_CPU_SENDER (BATCH_SIZE * INPUT_FEATURE_LEN_CPU_SENDER) 53 | #define BLOCK_SIZE_CPU_SENDER (BLOCK_ENTRY_NUM_CPU_SENDER * FLOAT_SIZE) 54 | 55 | #define THREAD_NUM 16 56 | 57 | // Stream 0 port: PORT, Stream 1 port: PORT + 1, ... 58 | #define PORT_CPU_SENDER_0 7080 59 | #define PORT_FPGA_SENDER_0 8080 60 | #define PORT_FPGA_SENDER_1 9080 61 | -------------------------------------------------------------------------------- /FPGA/common/includes/bitmap/bitmap.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | **********/ 29 | #ifndef BITMAP_DOT_H 30 | #define BITMAP_DOT_H 31 | 32 | #include 33 | 34 | class BitmapInterface 35 | { 36 | private: 37 | char* core ; 38 | char* dib ; 39 | const char* filename ; 40 | int* image ; 41 | 42 | // Core header information 43 | unsigned short magicNumber ; 44 | unsigned int fileSize ; 45 | unsigned int offsetOfImage ; 46 | 47 | // DIB information 48 | int sizeOfDIB ; 49 | int sizeOfImage ; 50 | int height ; 51 | int width ; 52 | 53 | public: 54 | BitmapInterface(const char* f) ; 55 | ~BitmapInterface() ; 56 | 57 | bool readBitmapFile() ; 58 | bool writeBitmapFile(int* otherImage = NULL); 59 | 60 | inline int* bitmap() { return image ; } 61 | unsigned int numPixels() { return sizeOfImage/3 ; } 62 | 63 | inline int getHeight() { return height ; } 64 | inline int getWidth() { return width ; } 65 | 66 | } ; 67 | 68 | #endif 69 | -------------------------------------------------------------------------------- /FPGA/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.0) 2 | project(vitis-network) 3 | 4 | # 5 | # Vivado 6 | # 7 | 8 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_SOURCE_DIR}/cmake) 9 | 10 | set(IPREPO_DIR ${CMAKE_BINARY_DIR}/fpga-network-stack/iprepo) 11 | file(MAKE_DIRECTORY ${IPREPO_DIR}) 12 | 13 | # Device 14 | if(FDEV_NAME STREQUAL "u280") 15 | set(FPGA_PART "xcu280-fsvh2892-2L-e" CACHE STRING "FPGA device") 16 | set(NETWORK_BANDWIDTH 100 CACHE STRING "Network bandwidth") 17 | set(NETWORK_INTERFACE 100 CACHE STRING "Network bandwidth") 18 | set(DATA_WIDTH 64 CACHE STRING "Width of data path in bytes") 19 | set(CLOCK_PERIOD 3.2 CACHE STRING "Target clock period in nanoseconds") 20 | set(DEVICE "/opt/xilinx/platforms/xilinx_u280_xdma_201920_3/xilinx_u280_xdma_201920_3.xpfm") 21 | else() 22 | message(FATAL_ERROR "Target device not supported.") 23 | endif() 24 | 25 | # Config 26 | set(ROCE_STACK_EN 0 CACHE BOOL "Enable RDMA stack.") 27 
| set(TCP_STACK_EN 0 CACHE BOOL "Enable TCP/IP stack") 28 | set(UDP_STACK_EN 1 CACHE BOOL "Enable UDP/IP stack") 29 | set(TCP_STACK_RX_DDR_BYPASS_EN 1 CACHE BOOL "Enabling DDR bypass on the RX path") 30 | 31 | 32 | 33 | 34 | # QSFP port 35 | set(QSFP_PORT 1 CACHE STRING "Network traffic route.") 36 | 37 | # 38 | # Network stack 39 | # 40 | 41 | add_subdirectory(fpga-network-stack) 42 | 43 | 44 | #User kernel IPs 45 | add_subdirectory(kernel/user_krnl/scatter_krnl/src/hls) 46 | add_subdirectory(kernel/user_krnl/iperf_krnl/src/hls) 47 | 48 | # 49 | # Find Vivado 50 | # 51 | 52 | find_package(Vivado REQUIRED) 53 | if (NOT VIVADO_FOUND) 54 | message(FATAL_ERROR "Vivado not found.") 55 | endif() 56 | 57 | configure_file(${CMAKE_SOURCE_DIR}/scripts/package_network_krnl.tcl.in ${CMAKE_SOURCE_DIR}/scripts/package_network_krnl.tcl) 58 | configure_file(${CMAKE_SOURCE_DIR}/scripts/package_cmac_krnl.tcl.in ${CMAKE_SOURCE_DIR}/scripts/package_cmac_krnl.tcl) 59 | configure_file(${CMAKE_SOURCE_DIR}/scripts/package_scatter_krnl.tcl.in ${CMAKE_SOURCE_DIR}/scripts/package_scatter_krnl.tcl) 60 | configure_file(${CMAKE_SOURCE_DIR}/scripts/package_iperf_krnl.tcl.in ${CMAKE_SOURCE_DIR}/scripts/package_iperf_krnl.tcl) 61 | 62 | configure_file(${CMAKE_SOURCE_DIR}/scripts/post_sys_link.tcl.in ${CMAKE_SOURCE_DIR}/scripts/post_sys_link.tcl) 63 | configure_file(${CMAKE_SOURCE_DIR}/kernel/common/types/network_types.svh.in ${CMAKE_SOURCE_DIR}/kernel/common/types/network_types.svh) 64 | 65 | #configure_file(${CMAKE_SOURCE_DIR}/Makefile.in ${CMAKE_SOURCE_DIR}/Makefile) 66 | 67 | 68 | #add_custom_target(shell COMMAND ${VIVADO_BINARY} -mode tcl -source ${CMAKE_BINARY_DIR}/shell.tcl) 69 | 70 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/src/hls/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Author: David Sidler (david.sidler@inf.ethz.ch) 2 | 3 | 
cmake_minimum_required(VERSION 3.0) 4 | 5 | set (PROJECT_NAME scatter) 6 | project(${PROJECT_NAME}) 7 | 8 | # Include custom Find.cmake scripts to enable searching for Vivado HLS 9 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/../../../cmake) 10 | 11 | # Without this variable set, CMake will build tests when running install 12 | #set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY ON) 13 | 14 | # Generate Doxygen if available 15 | #find_package(Doxygen) 16 | #if(Doxygen_FOUND) 17 | # configure_file(${CMAKE_SOURCE_DIR}/Doxyfile.in Doxyfile) 18 | # add_custom_target(doxygen ALL 19 | # COMMAND ${DOXYGEN_EXECUTABLE} Doxyfile 20 | # WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) 21 | #endif() 22 | 23 | 24 | set(FPGA_PART "xcu280-fsvh2892-2L-e" CACHE STRING "FPGA device") 25 | set(NETWORK_BANDWIDTH 100 CACHE STRING "Network bandwidth") 26 | set(NETWORK_INTERFACE 100 CACHE STRING "Network bandwidth") 27 | set(DATA_WIDTH 64 CACHE STRING "Width of data path in bytes") 28 | set(CLOCK_PERIOD 3.2 CACHE STRING "Target clock period in nanoseconds") 29 | 30 | 31 | # Find Xilinx Vivado HLS 32 | find_package(VivadoHLS REQUIRED) 33 | if (NOT VIVADO_HLS_FOUND) 34 | message(FATAL_ERROR "Vivado HLS not found.") 35 | endif() 36 | 37 | # Installation directory 38 | if (DEFINED ENV{IPREPO_DIR}) 39 | set(IPREPO_DIR $ENV{IPREPO_DIR}) 40 | elseif(NOT IPREPO_DIR) 41 | set(IPREPO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/iprepo/) 42 | endif() 43 | 44 | 45 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 46 | 47 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/scatter_config.hpp.in scatter_config.hpp) 48 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/make.tcl.in make.tcl) 49 | 50 | 51 | set(EXAMPLE_HLS_DEPENDS 52 | ${CMAKE_CURRENT_SOURCE_DIR}/scatter.cpp 53 | ${CMAKE_CURRENT_SOURCE_DIR}/scatter.hpp 54 | ${CMAKE_CURRENT_SOURCE_DIR}/scatter_config.hpp.in 55 | ${CMAKE_CURRENT_SOURCE_DIR}/test_scatter.cpp) 56 | 57 | 58 | #Setup HLS custom targets 59 | set(HLS_TARGETS synthesis csim ip installip) 60 | 61 | 
foreach (target ${HLS_TARGETS}) 62 | if (NOT TARGET ${target}) 63 | add_custom_target(${target}) 64 | endif() 65 | 66 | add_custom_target(${target}.${PROJECT_NAME} 67 | COMMAND ${VIVADO_HLS_BINARY} -f make.tcl -tclargs ${target} 68 | DEPENDS ${EXAMPLE_HLS_DEPENDS}) 69 | add_dependencies(${target} ${target}.${PROJECT_NAME}) 70 | endforeach() 71 | 72 | #target dependencies 73 | add_dependencies(ip.${PROJECT_NAME} synthesis.${PROJECT_NAME}) 74 | add_dependencies(installip.${PROJECT_NAME} ip.${PROJECT_NAME}) 75 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/README.md: -------------------------------------------------------------------------------- 1 | # Programs 2 | 3 | In Terminal 1, start cuda_server first (./run_cuda_server.sh) 4 | 5 | Then, in Terminal 2, run the client to send data (./run_client_sender.sh) -> remember to adjust the server IP address 6 | 7 | Correct Results: 8 | 9 | Input Feature Size = 512 -> 68719476736 10 | 11 | Input Feature Size = 1024 -> 137438953472 12 | 13 | ## cuda_server.c 14 | 15 | rm cuda_server 16 | 17 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 18 | 19 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server 20 | 21 | ## multiple_connections_network_client_sender.c 22 | 23 | ### NOTE! The sender must send more data than the receiver side requires, because the sender and receiver threads progress at different rates; e.g., receiver connection 1 may still be waiting for its last batch while the sender is already trying to send data through connection 2. 24 | 25 | As a result, the sender sends twice the required amount of data (2 * required). 26 | 27 | This program simulates an FPGA that opens 4 connections and sends data to the CUDA server. 28 | 29 | Start the CUDA server first, then the client.
30 | 31 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 32 | 33 | ./multiple_connections_network_client_sender 34 | 35 | 36 | # Other programs (for building the final version) 37 | 38 | ## pthread_test.c 39 | 40 | https://www.geeksforgeeks.org/multithreading-c-2/ 41 | 42 | Pass port info and memory address space to the thread as a structure, and execute that thread. 43 | 44 | gcc pthread_test.c -lpthread 45 | 46 | ./a.out 47 | 48 | ## single_connection_network_server_receiver.c 49 | 50 | Start server first, then client. 51 | 52 | gcc single_connection_network_server_receiver.c -lpthread -o single_connection_network_server_receiver 53 | 54 | ./single_connection_network_server_receiver 55 | 56 | ## single_connection_network_client_sender.c 57 | 58 | Start server first, then client. 59 | 60 | gcc single_connection_network_client_sender.c -lpthread -o single_connection_network_client_sender 61 | 62 | ./single_connection_network_client_sender 63 | 64 | 65 | ## multiple_connections_network_server_receiver.c 66 | 67 | Start server first, then client. 68 | 69 | 4 TCP connections. 70 | 71 | gcc multiple_connections_network_server_receiver.c -lpthread -o multiple_connections_network_server_receiver 72 | 73 | ./multiple_connections_network_server_receiver 74 | 75 | ## multiple_connections_network_client_sender.c 76 | 77 | Start server first, then client. 
78 | 79 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 80 | 81 | ./multiple_connections_network_client_sender -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/README.md: -------------------------------------------------------------------------------- 1 | # Programs 2 | 3 | In Terminal 1, start cuda_server first (./run_cuda_server.sh) 4 | 5 | Then, in Terminal 2, run the client to send data (./run_client_sender.sh) -> remember to adjust the server IP address 6 | 7 | Correct Results: 8 | 9 | Input Feature Size = 512 -> 68719476736 10 | 11 | Input Feature Size = 1024 -> 137438953472 12 | 13 | ## cuda_server.c 14 | 15 | rm cuda_server 16 | 17 | nvcc -l cublasLt -lpthread cuda_server.c -o cuda_server 18 | 19 | nvprof -f --export-profile timeline.prof --concurrent-kernels on ./cuda_server 20 | 21 | ## multiple_connections_network_client_sender.c 22 | 23 | ### NOTE! The sender must send more data than the receiver side requires, because the sender and receiver threads progress at different rates; e.g., receiver connection 1 may still be waiting for its last batch while the sender is already trying to send data through connection 2. 24 | 25 | As a result, the sender sends twice the required amount of data (2 * required). 26 | 27 | This program simulates an FPGA that opens 4 connections and sends data to the CUDA server. 28 | 29 | Start the CUDA server first, then the client. 30 | 31 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 32 | 33 | ./multiple_connections_network_client_sender 34 | 35 | 36 | # Other programs (for building the final version) 37 | 38 | ## pthread_test.c 39 | 40 | https://www.geeksforgeeks.org/multithreading-c-2/ 41 | 42 | Pass port info and memory address space to the thread as a structure, and execute that thread.
43 | 44 | gcc pthread_test.c -lpthread 45 | 46 | ./a.out 47 | 48 | ## single_connection_network_server_receiver.c 49 | 50 | Start server first, then client. 51 | 52 | gcc single_connection_network_server_receiver.c -lpthread -o single_connection_network_server_receiver 53 | 54 | ./single_connection_network_server_receiver 55 | 56 | ## single_connection_network_client_sender.c 57 | 58 | Start server first, then client. 59 | 60 | gcc single_connection_network_client_sender.c -lpthread -o single_connection_network_client_sender 61 | 62 | ./single_connection_network_client_sender 63 | 64 | 65 | ## multiple_connections_network_server_receiver.c 66 | 67 | Start server first, then client. 68 | 69 | 4 TCP connections. 70 | 71 | gcc multiple_connections_network_server_receiver.c -lpthread -o multiple_connections_network_server_receiver 72 | 73 | ./multiple_connections_network_server_receiver 74 | 75 | ## multiple_connections_network_client_sender.c 76 | 77 | Start server first, then client. 78 | 79 | gcc multiple_connections_network_client_sender.c -lpthread -o multiple_connections_network_client_sender 80 | 81 | ./multiple_connections_network_client_sender -------------------------------------------------------------------------------- /FPGA/scripts/gen_xo.tcl: -------------------------------------------------------------------------------- 1 | # /******************************************************************************* 2 | # Copyright (c) 2018, Xilinx, Inc. 3 | # All rights reserved. 4 | # 5 | # Redistribution and use in source and binary forms, with or without modification, 6 | # are permitted provided that the following conditions are met: 7 | # 8 | # 1. Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 10 | # 11 | # 12 | # 2. 
Redistributions in binary form must reproduce the above copyright notice, 13 | # this list of conditions and the following disclaimer in the documentation 14 | # and/or other materials provided with the distribution. 15 | # 16 | # 17 | # 3. Neither the name of the copyright holder nor the names of its contributors 18 | # may be used to endorse or promote products derived from this software 19 | # without specific prior written permission. 20 | # 21 | # 22 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 23 | # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE IMPLIED 24 | # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 25 | # IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 26 | # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28 | # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 29 | # OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 30 | # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 31 | # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
32 | # 33 | # *******************************************************************************/ 34 | # Packages the kernel IP found in ./packaged_kernel_<suffix> into an .xo file via package_xo. 35 | if { $::argc != 7 } { 36 | puts "ERROR: Program \"$::argv0\" requires 7 arguments!\n" 37 | puts "Usage: $::argv0 <xoname> <krnl_name> <target> <xpfm_path> <device> <xml_path> <package_tcl_path>\n" 38 | exit 1 39 | } 40 | # Positional arguments, in the order listed in the usage message above. 41 | set xoname [lindex $::argv 0] 42 | set krnl_name [lindex $::argv 1] 43 | set target [lindex $::argv 2] 44 | set xpfm_path [lindex $::argv 3] 45 | set device [lindex $::argv 4] 46 | set xml_path [lindex $::argv 5] 47 | set package_tcl_path [lindex $::argv 6] 48 | 49 | set suffix "${krnl_name}_${target}_${device}" 50 | 51 | puts "INFO: ${xoname} ${krnl_name} ${target} ${xpfm_path} ${device}" 52 | # NOTE(review): the sourced script presumably creates ./packaged_kernel_${suffix} - confirm. 53 | source -notrace ${package_tcl_path} 54 | # Remove any stale .xo so package_xo writes a fresh file. 55 | if {[file exists "${xoname}"]} { 56 | file delete -force "${xoname}" 57 | } 58 | 59 | package_xo -xo_path ${xoname} -kernel_name ${krnl_name} -ip_directory ./packaged_kernel_${suffix} -kernel_xml ${xml_path} 60 | -------------------------------------------------------------------------------- /FPGA/common/includes/simplebmp/simplebmp.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | //Read and write uncompressed 24 bit BMP image format image 30 | //based on http://en.wikipedia.org/wiki/BMP_file_formt 31 | //Copyright Xilinx 32 | 33 | 34 | #ifndef __SIMPLE_BMP 35 | #define __SIMPLE_BMP 36 | 37 | struct bmpheader_t{ 38 | //Header 39 | char headerB; 40 | char headerM; 41 | uint32_t headerbmpsize; 42 | uint16_t headerapp0; 43 | uint16_t headerapp1; 44 | uint32_t headerpixelsoffset; 45 | 46 | //DIB header 47 | uint32_t dibheadersize; 48 | uint32_t dibwidth; 49 | uint32_t dibheight; 50 | uint16_t dibplane; 51 | uint16_t dibdepth; 52 | uint32_t dibcompression; 53 | uint32_t dibsize; 54 | uint32_t dibhor; 55 | uint32_t dibver; 56 | uint32_t dibpal; 57 | uint32_t dibimportant; 58 | 59 | }; 60 | 61 | 62 | struct bmp_t{ 63 | struct bmpheader_t header; 64 | uint32_t width; 65 | uint32_t height; 66 | uint32_t *pixels; 67 | }; 68 | 69 | int writebmp(char *filename,struct bmp_t *bitmap); 70 | 71 | int readbmp(char *filename,struct bmp_t *bitmap); 72 | //-1 file access error 73 | //-2 invalid BMP 74 | //-3 memory allocation error 75 | 76 | 77 | #endif 78 | -------------------------------------------------------------------------------- 
/FPGA/kernel/user_krnl/iperf_krnl/src/hls/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Author: David Sidler (david.sidler@inf.ethz.ch) 2 | 3 | cmake_minimum_required(VERSION 3.0) 4 | 5 | set (PROJECT_NAME iperf_client) 6 | project(${PROJECT_NAME}) 7 | 8 | # Include custom Find.cmake scripts to enable searching for Vivado HLS 9 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/../../cmake) 10 | 11 | # Without this variable set, CMake will build tests when running install 12 | #set(CMAKE_SKIP_INSTALL_ALL_DEPENDENCY ON) 13 | 14 | # Generate Doxygen if available 15 | #find_package(Doxygen) 16 | #if(Doxygen_FOUND) 17 | # configure_file(${CMAKE_SOURCE_DIR}/Doxyfile.in Doxyfile) 18 | # add_custom_target(doxygen ALL 19 | # COMMAND ${DOXYGEN_EXECUTABLE} Doxyfile 20 | # WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) 21 | #endif() 22 | 23 | if (NOT hasParent) 24 | if (DEVICE_NAME STREQUAL "u280") 25 | set(FPGA_PART xcu280-fsvh2892-2L-e) 26 | set(FPGA_FAMILY ultraplus) 27 | set(NETWORK_BANDWIDTH 100 CACHE STRING "Network bandwidth") 28 | endif() 29 | endif() 30 | set(DATA_WIDTH 8 CACHE STRING "Width of data path in bytes") 31 | set(CLOCK_PERIOD 6.4 CACHE STRING "Target clock period in nanoseconds") 32 | 33 | 34 | # Find Xilinx Vivado HLS 35 | find_package(VivadoHLS REQUIRED) 36 | if (NOT VIVADO_HLS_FOUND) 37 | message(FATAL_ERROR "Vivado HLS not found.") 38 | endif() 39 | 40 | 41 | include_directories(${CMAKE_CURRENT_BINARY_DIR}) 42 | 43 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/iperf_client_config.hpp.in iperf_client_config.hpp) 44 | configure_file(${CMAKE_CURRENT_SOURCE_DIR}/make.tcl.in make.tcl) 45 | 46 | 47 | set(EXAMPLE_HLS_DEPENDS 48 | ${CMAKE_CURRENT_SOURCE_DIR}/iperf_client.cpp 49 | ${CMAKE_CURRENT_SOURCE_DIR}/iperf_client.hpp 50 | ${CMAKE_CURRENT_SOURCE_DIR}/iperf_client_config.hpp.in 51 | ${CMAKE_CURRENT_SOURCE_DIR}/test_iperf_client.cpp) 52 | 53 | 54 | #Setup HLS custom targets 55 | 
set(HLS_TARGETS synthesis csim ip installip) 56 | 57 | foreach (target ${HLS_TARGETS}) 58 | if (NOT TARGET ${target}) 59 | add_custom_target(${target}) 60 | endif() 61 | 62 | add_custom_target(${target}.${PROJECT_NAME} 63 | COMMAND ${VIVADO_HLS_BINARY} -f make.tcl -tclargs ${target} 64 | DEPENDS ${EXAMPLE_HLS_DEPENDS}) 65 | add_dependencies(${target} ${target}.${PROJECT_NAME}) 66 | endforeach() 67 | 68 | #target dependencies 69 | add_dependencies(ip.${PROJECT_NAME} synthesis.${PROJECT_NAME}) 70 | add_dependencies(installip.${PROJECT_NAME} ip.${PROJECT_NAME}) 71 | 72 | # Installation 73 | if (DEFINED ENV{IPREPO_DIR}) 74 | set(IPREPO_DIR $ENV{IPREPO_DIR}) 75 | elseif(NOT IPREPO_DIR) 76 | set(IPREPO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/iprepo/) 77 | endif() 78 | #install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}_prj/solution1/impl/ip/ 79 | # DESTINATION ${IPREPO_DIR}/${PROJECT_NAME}/) 80 | 81 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_krnl/config_sp_embedding_krnl.txt: -------------------------------------------------------------------------------- 1 | profile_kernel=data:embedding_krnl_1:all:all 2 | 3 | [connectivity] 4 | sp=embedding_krnl_1.table_HBM0:HBM[0] 5 | sp=embedding_krnl_1.table_HBM1:HBM[1] 6 | sp=embedding_krnl_1.table_HBM2:HBM[2] 7 | sp=embedding_krnl_1.table_HBM3:HBM[3] 8 | sp=embedding_krnl_1.table_HBM4:HBM[4] 9 | sp=embedding_krnl_1.table_HBM5:HBM[5] 10 | sp=embedding_krnl_1.table_HBM6:HBM[6] 11 | sp=embedding_krnl_1.table_HBM7:HBM[7] 12 | sp=embedding_krnl_1.table_HBM8:HBM[8] 13 | sp=embedding_krnl_1.table_HBM9:HBM[9] 14 | sp=embedding_krnl_1.table_HBM10:HBM[10] 15 | sp=embedding_krnl_1.table_HBM11:HBM[11] 16 | sp=embedding_krnl_1.table_HBM12:HBM[12] 17 | sp=embedding_krnl_1.table_HBM13:HBM[13] 18 | sp=embedding_krnl_1.table_HBM14:HBM[14] 19 | sp=embedding_krnl_1.table_HBM15:HBM[15] 20 | sp=embedding_krnl_1.table_HBM16:HBM[16] 21 | sp=embedding_krnl_1.table_HBM17:HBM[17] 
22 | sp=embedding_krnl_1.table_HBM18:HBM[18] 23 | sp=embedding_krnl_1.table_HBM19:HBM[19] 24 | sp=embedding_krnl_1.table_HBM20:HBM[20] 25 | sp=embedding_krnl_1.table_HBM21:HBM[21] 26 | sp=embedding_krnl_1.table_HBM22:HBM[22] 27 | sp=embedding_krnl_1.table_HBM23:HBM[23] 28 | sp=embedding_krnl_1.table_HBM24:HBM[24] 29 | sp=embedding_krnl_1.table_HBM25:HBM[25] 30 | sp=embedding_krnl_1.table_HBM26:HBM[26] 31 | sp=embedding_krnl_1.table_HBM27:HBM[27] 32 | sp=embedding_krnl_1.table_DDR0:DDR[0] 33 | sp=embedding_krnl_1.table_DDR1:DDR[1] 34 | 35 | sp=network_krnl_1.m00_axi:HBM[28] 36 | sp=network_krnl_1.m01_axi:HBM[29] 37 | 38 | sc=network_krnl_1.m_axis_udp_rx:embedding_krnl_1.s_axis_udp_rx 39 | sc=network_krnl_1.m_axis_udp_rx_meta:embedding_krnl_1.s_axis_udp_rx_meta 40 | sc=network_krnl_1.m_axis_tcp_port_status:embedding_krnl_1.s_axis_tcp_port_status 41 | sc=network_krnl_1.m_axis_tcp_open_status:embedding_krnl_1.s_axis_tcp_open_status 42 | sc=network_krnl_1.m_axis_tcp_notification:embedding_krnl_1.s_axis_tcp_notification 43 | sc=network_krnl_1.m_axis_tcp_rx_meta:embedding_krnl_1.s_axis_tcp_rx_meta 44 | sc=network_krnl_1.m_axis_tcp_rx_data:embedding_krnl_1.s_axis_tcp_rx_data 45 | sc=network_krnl_1.m_axis_tcp_tx_status:embedding_krnl_1.s_axis_tcp_tx_status 46 | 47 | sc=embedding_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 48 | sc=embedding_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 49 | sc=embedding_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 50 | sc=embedding_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 51 | sc=embedding_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 52 | sc=embedding_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 53 | sc=embedding_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 54 | sc=embedding_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 55 | 56 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 57 | 
sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx -------------------------------------------------------------------------------- /FPGA/utils.mk: -------------------------------------------------------------------------------- 1 | #+------------------------------------------------------------------------------- 2 | # The following parameters are assigned with default values. These parameters can 3 | # be overridden through the make command line 4 | #+------------------------------------------------------------------------------- 5 | 6 | PROFILE := no 7 | 8 | #Generates profile summary report 9 | ifeq ($(PROFILE), yes) 10 | LDCLFLAGS += --profile_kernel data:all:all:all 11 | endif 12 | 13 | DEBUG := no 14 | B_TEMP = `$(ABS_COMMON_REPO)/common/utility/parse_platform_list.py $(DEVICE)` 15 | 16 | #Generates debug summary report 17 | ifeq ($(DEBUG), yes) 18 | LDCLFLAGS += --dk list_ports 19 | endif 20 | 21 | #Setting Platform Path 22 | ifeq ($(findstring xpfm, $(DEVICE)), xpfm) 23 | B_NAME = $(shell dirname $(DEVICE)) 24 | else 25 | B_NAME = $(B_TEMP)/$(DEVICE) 26 | endif 27 | 28 | #Checks for XILINX_VITIS 29 | ifndef XILINX_VITIS 30 | $(error XILINX_VITIS variable is not set, please set correctly and rerun) 31 | endif 32 | 33 | #Checks for Device Family 34 | ifeq ($(HOST_ARCH), aarch32) 35 | DEV_FAM = 7Series 36 | else ifeq ($(HOST_ARCH), aarch64) 37 | DEV_FAM = Ultrascale 38 | endif 39 | 40 | #Checks for XILINX_XRT 41 | check-xrt: 42 | ifndef XILINX_XRT 43 | $(error XILINX_XRT variable is not set, please set correctly and rerun) 44 | endif 45 | 46 | #Checks for Correct architecture 47 | ifneq ($(HOST_ARCH), $(filter $(HOST_ARCH),aarch64 aarch32 x86)) 48 | $(error HOST_ARCH variable not set, please set correctly and rerun) 49 | endif 50 | 51 | #Checks for SYSROOT 52 | ifneq ($(HOST_ARCH), x86) 53 | ifndef SYSROOT 54 | $(error SYSROOT variable is not set, please set correctly and rerun) 55 | endif 56 | endif 57 | 58 | #Checks for g++ 59 | ifeq ($(HOST_ARCH), x86) 
60 | ifneq ($(shell expr $(shell g++ -dumpversion) \>= 5), 1) 61 | ifndef XILINX_VIVADO 62 | $(error [ERROR]: g++ version older. Please use 5.0 or above.) 63 | else 64 | CXX := $(XILINX_VIVADO)/tps/lnx64/gcc-6.2.0/bin/g++ 65 | $(warning [WARNING]: g++ version older. Using g++ provided by the tool : $(CXX)) 66 | endif 67 | endif 68 | else ifeq ($(HOST_ARCH), aarch64) 69 | CXX := $(XILINX_VITIS)/gnu/aarch64/lin/aarch64-linux/bin/aarch64-linux-gnu-g++ 70 | else ifeq ($(HOST_ARCH), aarch32) 71 | CXX := $(XILINX_VITIS)/gnu/aarch32/lin/gcc-arm-linux-gnueabi/bin/arm-linux-gnueabihf-g++ 72 | endif 73 | 74 | check-devices: 75 | ifndef DEVICE 76 | $(error DEVICE not set. Please set the DEVICE properly and rerun. Run "make help" for more details.) 77 | endif 78 | 79 | # device2xsa - create a filesystem friendly name from device name 80 | # $(1) - full name of device 81 | device2xsa = $(strip $(patsubst %.xpfm, % , $(shell basename $(DEVICE)))) 82 | 83 | # Cleaning stuff 84 | RM = rm -f 85 | RMDIR = rm -rf 86 | 87 | ECHO:= @echo 88 | 89 | docs: README.md 90 | 91 | README.md: description.json 92 | $(ABS_COMMON_REPO)/common/utility/readme_gen/readme_gen.py description.json 93 | -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/out: -------------------------------------------------------------------------------- 1 | rm: cannot remove ‘cuda_server’: No such file or directory 2 | cuda_server.c: In function ‘thread_consume’: 3 | cuda_server.c:412:53: warning: passing argument 2 of ‘clock_gettime’ from incompatible pointer type [-Wincompatible-pointer-types] 4 | clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &network_time[iter * BATCH_NUM_PER_LOOP + block_id]); 5 | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 6 | In file included from cuda_server.c:5: 7 | /usr/include/time.h:342:66: note: expected ‘struct timespec *’ but argument is of type ‘timespec *’ {aka ‘struct *’} 8 | extern int clock_gettime 
(clockid_t __clock_id, struct timespec *__tp) __THROW; 9 | ~~~~~~~~~~~~~~~~~^~~~ 10 | cuda_server.c:421:28: warning: returning ‘int’ from a function with return type ‘void *’ makes pointer from integer without a cast [-Wint-conversion] 11 | return -1; 12 | ^ 13 | cuda_server.c:432:50: warning: passing argument 2 of ‘clock_gettime’ from incompatible pointer type [-Wincompatible-pointer-types] 14 | clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &cuda_time[iter * BATCH_NUM_PER_LOOP + block_id]); 15 | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 16 | In file included from cuda_server.c:5: 17 | /usr/include/time.h:342:66: note: expected ‘struct timespec *’ but argument is of type ‘timespec *’ {aka ‘struct *’} 18 | extern int clock_gettime (clockid_t __clock_id, struct timespec *__tp) __THROW; 19 | ~~~~~~~~~~~~~~~~~^~~~ 20 | cuda_server.c: In function ‘main’: 21 | cuda_server.c:537:10: error: redeclaration of ‘memcpy_time_ns’ with no linkage 22 | long memcpy_time_ns[TOTAL_BATCH_NUM]; 23 | ^~~~~~~~~~~~~~ 24 | cuda_server.c:535:10: note: previous declaration of ‘memcpy_time_ns’ was here 25 | long memcpy_time_ns[TOTAL_BATCH_NUM]; 26 | ^~~~~~~~~~~~~~ 27 | cuda_server.c:542:47: error: incompatible type for argument 1 of ‘diff’ 28 | memcpy_timespec[i] = diff(network_time[i], cuda_time[i]); 29 | ~~~~~~~~~~~~^~~ 30 | cuda_server.c:68:24: note: expected ‘timespec’ {aka ‘struct ’} but argument is of type ‘timespec *’ {aka ‘struct *’} 31 | timespec diff(timespec start, timespec end) 32 | ~~~~~~~~~^~~~~ 33 | cuda_server.c:542:61: error: incompatible type for argument 2 of ‘diff’ 34 | memcpy_timespec[i] = diff(network_time[i], cuda_time[i]); 35 | ~~~~~~~~~^~~ 36 | cuda_server.c:68:40: note: expected ‘timespec’ {aka ‘struct ’} but argument is of type ‘timespec *’ {aka ‘struct *’} 37 | timespec diff(timespec start, timespec end) 38 | ~~~~~~~~~^~~ 39 | ======== Error: application not found. 
40 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_47_krnl/config_sp_embedding_47_krnl.txt: -------------------------------------------------------------------------------- 1 | 2 | [connectivity] 3 | slr=cmac_krnl_1:SLR2 4 | 5 | sp=embedding_47_krnl_1.table_HBM0:HBM[0] 6 | sp=embedding_47_krnl_1.table_HBM1:HBM[1] 7 | sp=embedding_47_krnl_1.table_HBM2:HBM[2] 8 | sp=embedding_47_krnl_1.table_HBM3:HBM[3] 9 | sp=embedding_47_krnl_1.table_HBM4:HBM[4] 10 | sp=embedding_47_krnl_1.table_HBM5:HBM[5] 11 | sp=embedding_47_krnl_1.table_HBM6:HBM[6] 12 | sp=embedding_47_krnl_1.table_HBM7:HBM[7] 13 | sp=embedding_47_krnl_1.table_HBM8:HBM[8] 14 | sp=embedding_47_krnl_1.table_HBM9:HBM[9] 15 | sp=embedding_47_krnl_1.table_HBM10:HBM[10] 16 | sp=embedding_47_krnl_1.table_HBM11:HBM[11] 17 | sp=embedding_47_krnl_1.table_HBM12:HBM[12] 18 | sp=embedding_47_krnl_1.table_HBM13:HBM[13] 19 | sp=embedding_47_krnl_1.table_HBM14:HBM[14] 20 | sp=embedding_47_krnl_1.table_HBM15:HBM[15] 21 | sp=embedding_47_krnl_1.table_HBM16:HBM[16] 22 | sp=embedding_47_krnl_1.table_HBM17:HBM[17] 23 | sp=embedding_47_krnl_1.table_HBM18:HBM[18] 24 | sp=embedding_47_krnl_1.table_HBM19:HBM[19] 25 | sp=embedding_47_krnl_1.table_HBM20:HBM[20] 26 | sp=embedding_47_krnl_1.table_HBM21:HBM[21] 27 | sp=embedding_47_krnl_1.table_HBM22:HBM[22] 28 | sp=embedding_47_krnl_1.table_HBM23:HBM[23] 29 | sp=embedding_47_krnl_1.table_HBM24:HBM[24] 30 | sp=embedding_47_krnl_1.table_HBM25:HBM[25] 31 | sp=embedding_47_krnl_1.table_HBM26:HBM[26] 32 | sp=embedding_47_krnl_1.table_HBM27:HBM[27] 33 | sp=embedding_47_krnl_1.table_DDR0:DDR[0] 34 | sp=embedding_47_krnl_1.table_DDR1:DDR[1] 35 | 36 | sp=network_krnl_1.m00_axi:HBM[28] 37 | sp=network_krnl_1.m01_axi:HBM[29] 38 | 39 | sc=network_krnl_1.m_axis_udp_rx:embedding_47_krnl_1.s_axis_udp_rx 40 | sc=network_krnl_1.m_axis_udp_rx_meta:embedding_47_krnl_1.s_axis_udp_rx_meta 41 | 
sc=network_krnl_1.m_axis_tcp_port_status:embedding_47_krnl_1.s_axis_tcp_port_status 42 | sc=network_krnl_1.m_axis_tcp_open_status:embedding_47_krnl_1.s_axis_tcp_open_status 43 | sc=network_krnl_1.m_axis_tcp_notification:embedding_47_krnl_1.s_axis_tcp_notification 44 | sc=network_krnl_1.m_axis_tcp_rx_meta:embedding_47_krnl_1.s_axis_tcp_rx_meta 45 | sc=network_krnl_1.m_axis_tcp_rx_data:embedding_47_krnl_1.s_axis_tcp_rx_data 46 | sc=network_krnl_1.m_axis_tcp_tx_status:embedding_47_krnl_1.s_axis_tcp_tx_status 47 | 48 | sc=embedding_47_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 49 | sc=embedding_47_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 50 | sc=embedding_47_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 51 | sc=embedding_47_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 52 | sc=embedding_47_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 53 | sc=embedding_47_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 54 | sc=embedding_47_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 55 | sc=embedding_47_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 56 | 57 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 58 | sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_98_krnl/config_sp_embedding_98_krnl.txt: -------------------------------------------------------------------------------- 1 | 2 | [connectivity] 3 | slr=cmac_krnl_1:SLR2 4 | 5 | sp=embedding_98_krnl_1.table_HBM0:HBM[0] 6 | sp=embedding_98_krnl_1.table_HBM1:HBM[1] 7 | sp=embedding_98_krnl_1.table_HBM2:HBM[2] 8 | sp=embedding_98_krnl_1.table_HBM3:HBM[3] 9 | sp=embedding_98_krnl_1.table_HBM4:HBM[4] 10 | sp=embedding_98_krnl_1.table_HBM5:HBM[5] 11 | sp=embedding_98_krnl_1.table_HBM6:HBM[6] 12 | sp=embedding_98_krnl_1.table_HBM7:HBM[7] 13 | 
sp=embedding_98_krnl_1.table_HBM8:HBM[8] 14 | sp=embedding_98_krnl_1.table_HBM9:HBM[9] 15 | sp=embedding_98_krnl_1.table_HBM10:HBM[10] 16 | sp=embedding_98_krnl_1.table_HBM11:HBM[11] 17 | sp=embedding_98_krnl_1.table_HBM12:HBM[12] 18 | sp=embedding_98_krnl_1.table_HBM13:HBM[13] 19 | sp=embedding_98_krnl_1.table_HBM14:HBM[14] 20 | sp=embedding_98_krnl_1.table_HBM15:HBM[15] 21 | sp=embedding_98_krnl_1.table_HBM16:HBM[16] 22 | sp=embedding_98_krnl_1.table_HBM17:HBM[17] 23 | sp=embedding_98_krnl_1.table_HBM18:HBM[18] 24 | sp=embedding_98_krnl_1.table_HBM19:HBM[19] 25 | sp=embedding_98_krnl_1.table_HBM20:HBM[20] 26 | sp=embedding_98_krnl_1.table_HBM21:HBM[21] 27 | sp=embedding_98_krnl_1.table_HBM22:HBM[22] 28 | sp=embedding_98_krnl_1.table_HBM23:HBM[23] 29 | sp=embedding_98_krnl_1.table_HBM24:HBM[24] 30 | sp=embedding_98_krnl_1.table_HBM25:HBM[25] 31 | sp=embedding_98_krnl_1.table_HBM26:HBM[26] 32 | sp=embedding_98_krnl_1.table_HBM27:HBM[27] 33 | sp=embedding_98_krnl_1.table_DDR0:DDR[0] 34 | sp=embedding_98_krnl_1.table_DDR1:DDR[1] 35 | 36 | sp=network_krnl_1.m00_axi:HBM[28] 37 | sp=network_krnl_1.m01_axi:HBM[29] 38 | 39 | sc=network_krnl_1.m_axis_udp_rx:embedding_98_krnl_1.s_axis_udp_rx 40 | sc=network_krnl_1.m_axis_udp_rx_meta:embedding_98_krnl_1.s_axis_udp_rx_meta 41 | sc=network_krnl_1.m_axis_tcp_port_status:embedding_98_krnl_1.s_axis_tcp_port_status 42 | sc=network_krnl_1.m_axis_tcp_open_status:embedding_98_krnl_1.s_axis_tcp_open_status 43 | sc=network_krnl_1.m_axis_tcp_notification:embedding_98_krnl_1.s_axis_tcp_notification 44 | sc=network_krnl_1.m_axis_tcp_rx_meta:embedding_98_krnl_1.s_axis_tcp_rx_meta 45 | sc=network_krnl_1.m_axis_tcp_rx_data:embedding_98_krnl_1.s_axis_tcp_rx_data 46 | sc=network_krnl_1.m_axis_tcp_tx_status:embedding_98_krnl_1.s_axis_tcp_tx_status 47 | 48 | sc=embedding_98_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 49 | sc=embedding_98_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 50 | 
sc=embedding_98_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 51 | sc=embedding_98_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 52 | sc=embedding_98_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 53 | sc=embedding_98_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 54 | sc=embedding_98_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 55 | sc=embedding_98_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 56 | 57 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 58 | sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx -------------------------------------------------------------------------------- /FPGA/kernel/cmac_krnl/src/hdl/network_clk_cross.sv: -------------------------------------------------------------------------------- 1 | `include "network_types.svh" 2 | `include "network_intf.svh" 3 | 4 | module network_clk_cross ( 5 | input wire net_clk, 6 | input wire net_aresetn, 7 | input wire pcie_clk, 8 | input wire pcie_aresetn, 9 | 10 | // NCLK 11 | axi_stream.slave m_axis_net_rx_nclk, 12 | axi_stream.master s_axis_net_tx_nclk, 13 | 14 | // ACLK 15 | axi_stream.master m_axis_net_rx_aclk, 16 | axi_stream.slave s_axis_net_tx_aclk 17 | ); 18 | 19 | 20 | reg net_aresetn_reg = 1'b1; 21 | always @ (posedge net_clk) begin 22 | net_aresetn_reg <= net_aresetn; 23 | end 24 | 25 | reg pcie_aresetn_reg = 1'b1; 26 | always @ (posedge pcie_clk) begin 27 | pcie_aresetn_reg <= pcie_aresetn; 28 | end 29 | 30 | // 31 | // Crossings init 32 | // 33 | 34 | axi_stream m_axis_net_rx_nclk_r (); 35 | axi_stream s_axis_net_tx_nclk_r (); 36 | 37 | axi_stream m_axis_net_rx_aclk_r (); 38 | axi_stream s_axis_net_tx_aclk_r (); 39 | 40 | // Might be an overkill 41 | axis_data_reg_array #(.N_STAGES(5)) inst_reg_data_nclk1 (.aclk(net_clk), .aresetn(net_aresetn_reg), .s_axis(m_axis_net_rx_nclk), .m_axis(m_axis_net_rx_nclk_r)); 42 | axis_data_reg_array #(.N_STAGES(5)) 
inst_reg_data_nclk2 (.aclk(net_clk), .aresetn(net_aresetn_reg), .s_axis(s_axis_net_tx_nclk_r), .m_axis(s_axis_net_tx_nclk)); 43 | axis_data_reg_array #(.N_STAGES(5)) inst_reg_data_aclk1 (.aclk(pcie_clk), .aresetn(pcie_aresetn_reg), .s_axis(m_axis_net_rx_aclk_r), .m_axis(m_axis_net_rx_aclk)); 44 | axis_data_reg_array #(.N_STAGES(5)) inst_reg_data_aclk2 (.aclk(pcie_clk), .aresetn(pcie_aresetn_reg), .s_axis(s_axis_net_tx_aclk), .m_axis(s_axis_net_tx_aclk_r)); 45 | 46 | // Data 47 | axis_data_fifo_cc_udp_data inst_cc_udp_data_rx ( 48 | .m_axis_aclk(pcie_clk), 49 | .s_axis_aclk(net_clk), 50 | .s_axis_aresetn(net_aresetn_reg), 51 | .s_axis_tvalid(m_axis_net_rx_nclk_r.valid), 52 | .s_axis_tready(m_axis_net_rx_nclk_r.ready), 53 | .s_axis_tdata(m_axis_net_rx_nclk_r.data), 54 | .s_axis_tlast(m_axis_net_rx_nclk_r.last), 55 | .s_axis_tkeep(m_axis_net_rx_nclk_r.keep), 56 | .m_axis_tvalid(m_axis_net_rx_aclk_r.valid), 57 | .m_axis_tready(m_axis_net_rx_aclk_r.ready), 58 | .m_axis_tdata(m_axis_net_rx_aclk_r.data), 59 | .m_axis_tlast(m_axis_net_rx_aclk_r.last), 60 | .m_axis_tkeep(m_axis_net_rx_aclk_r.keep) 61 | ); 62 | 63 | axis_data_fifo_cc_udp_data inst_cc_udp_data_tx ( 64 | .m_axis_aclk(net_clk), 65 | .s_axis_aclk(pcie_clk), 66 | .s_axis_aresetn(pcie_aresetn_reg), 67 | .s_axis_tvalid(s_axis_net_tx_aclk_r.valid), 68 | .s_axis_tready(s_axis_net_tx_aclk_r.ready), 69 | .s_axis_tdata(s_axis_net_tx_aclk_r.data), 70 | .s_axis_tlast(s_axis_net_tx_aclk_r.last), 71 | .s_axis_tkeep(s_axis_net_tx_aclk_r.keep), 72 | .m_axis_tvalid(s_axis_net_tx_nclk_r.valid), 73 | .m_axis_tready(s_axis_net_tx_nclk_r.ready), 74 | .m_axis_tdata(s_axis_net_tx_nclk_r.data), 75 | .m_axis_tlast(s_axis_net_tx_nclk_r.last), 76 | .m_axis_tkeep(s_axis_net_tx_nclk_r.keep) 77 | ); 78 | 79 | 80 | 81 | endmodule -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/out: -------------------------------------------------------------------------------- 1 
| concurrentKernels: 1 2 | = 1: Concurrent Kernel Execution 3 | asyncEngineCount: 3 4 | > 0: Overlap of Data Transfer and Kernel Execution 5 | = 2: Concurrent Data Transfers 6 | Device 0 has compute capability 7.5. 7 | Before Thread 8 | Printing Port from Thread 8080 9 | Printing Port from Thread 8084 10 | Printing Port from Thread 8088 11 | Successfully built connection. 12 | Successfully built connection. 13 | Successfully built connection. 14 | Consumed time: 0.230000 seconds, INCLUDING Waiting reader proceess 15 | Throughput: 0.008492 GB / secConsumed time: 0.230000 seconds, INCLUDING Waiting reader proceess 16 | Throughput: 0.259001 GB / secConsumed time: 0.210000 seconds, INCLUDING Waiting reader proceess 17 | Throughput: 0.283668 GB / sec0.000000 0.000000 0.000000 0.000000 0.000000 After Thread 18 | i = 0 FPGA0 time = 80380336 ns 19 | i = 0 FPGA1 time = 40231969 ns 20 | i = 0 CPU0 time = 97120297 ns 21 | i = 0 memcpy time = 60306152 ns 22 | i = 1 FPGA0 time = 40014650 ns 23 | i = 1 FPGA1 time = 41016614 ns 24 | i = 1 CPU0 time = 55752916 ns 25 | i = 1 memcpy time = 40515632 ns 26 | i = 2 FPGA0 time = 35439912 ns 27 | i = 2 FPGA1 time = 40787134 ns 28 | i = 2 CPU0 time = 50551573 ns 29 | i = 2 memcpy time = 38113523 ns 30 | i = 3 FPGA0 time = 26818740 ns 31 | i = 3 FPGA1 time = 35218254 ns 32 | i = 3 CPU0 time = 45141873 ns 33 | i = 3 memcpy time = 31018497 ns 34 | i = 4 FPGA0 time = 28601239 ns 35 | i = 4 FPGA1 time = 26737033 ns 36 | i = 4 CPU0 time = 37267314 ns 37 | i = 4 memcpy time = 27669136 ns 38 | i = 5 FPGA0 time = 28733950 ns 39 | i = 5 FPGA1 time = 26872342 ns 40 | i = 5 CPU0 time = 36780408 ns 41 | i = 5 memcpy time = 27803146 ns 42 | i = 6 FPGA0 time = 29757657 ns 43 | i = 6 FPGA1 time = 27512576 ns 44 | i = 6 CPU0 time = 37795140 ns 45 | i = 6 memcpy time = 28635116 ns 46 | i = 7 FPGA0 time = 30751565 ns 47 | i = 7 FPGA1 time = 27698084 ns 48 | i = 7 CPU0 time = 38149920 ns 49 | i = 7 memcpy time = 29224824 ns 50 | i = 8 FPGA0 time = 30113613 
ns 51 | i = 8 FPGA1 time = 27005731 ns 52 | i = 8 CPU0 time = 37479690 ns 53 | i = 8 memcpy time = 28559672 ns 54 | i = 9 FPGA0 time = 29953231 ns 55 | i = 9 FPGA1 time = 26991300 ns 56 | i = 9 CPU0 time = 37454036 ns 57 | i = 9 memcpy time = 28472265 ns 58 | i = 10 FPGA0 time = 30321863 ns 59 | i = 10 FPGA1 time = 27749784 ns 60 | i = 10 CPU0 time = 37802879 ns 61 | i = 10 memcpy time = 29035823 ns 62 | i = 11 FPGA0 time = 30906107 ns 63 | i = 11 FPGA1 time = 27477201 ns 64 | i = 11 CPU0 time = 38108115 ns 65 | i = 11 memcpy time = 29191654 ns 66 | i = 12 FPGA0 time = 29478247 ns 67 | i = 12 FPGA1 time = 25947500 ns 68 | i = 12 CPU0 time = 36493962 ns 69 | i = 12 memcpy time = 27712873 ns 70 | i = 13 FPGA0 time = 22442197 ns 71 | i = 13 FPGA1 time = 19544818 ns 72 | i = 13 CPU0 time = 29770012 ns 73 | i = 13 memcpy time = 20993507 ns 74 | i = 14 FPGA0 time = 13310682 ns 75 | i = 14 FPGA1 time = 9783901 ns 76 | i = 14 CPU0 time = 20882263 ns 77 | i = 14 memcpy time = 11547291 ns 78 | i = 15 FPGA0 time = 6249740 ns 79 | i = 15 FPGA1 time = 4897785 ns 80 | i = 15 CPU0 time = 10993382 ns 81 | i = 15 memcpy time = 5573762 ns 82 | 83 | Average memcpt time per batch: 0.029023 sec = 29.023304 ms = 29023.304688 us = 29023304.000000 ns 84 | -------------------------------------------------------------------------------- /FPGA/host/embedding_377_krnl/host.hpp: -------------------------------------------------------------------------------- 1 | #define CL_HPP_CL_1_2_DEFAULT_BUILD 2 | #define CL_HPP_TARGET_OPENCL_VERSION 120 3 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 4 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 5 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 6 | 7 | //OCL_CHECK doesn't work if call has templatized function call 8 | #define OCL_CHECK(error,call) \ 9 | call; \ 10 | if (error != CL_SUCCESS) { \ 11 | printf("%s:%d Error calling " #call ", error code is: %d\n", \ 12 | __FILE__,__LINE__, error); \ 13 | exit(EXIT_FAILURE); \ 14 | } 15 | 
#include "constants.hpp" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | // template 24 | // struct aligned_allocator 25 | // { 26 | // using value_type = T; 27 | // T* allocate(std::size_t num) 28 | // { 29 | // void* ptr = nullptr; 30 | // if (posix_memalign(&ptr,4096,num*sizeof(T))) 31 | // throw std::bad_alloc(); 32 | // return reinterpret_cast(ptr); 33 | // } 34 | // void deallocate(T* p, std::size_t num) 35 | // { 36 | // free(p); 37 | // } 38 | // }; 39 | 40 | std::vector get_devices(const std::string& vendor_name) { 41 | 42 | size_t i; 43 | cl_int err; 44 | std::vector platforms; 45 | OCL_CHECK(err, err = cl::Platform::get(&platforms)); 46 | cl::Platform platform; 47 | for (i = 0 ; i < platforms.size(); i++){ 48 | platform = platforms[i]; 49 | OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); 50 | if (platformName == vendor_name){ 51 | std::cout << "Found Platform" << std::endl; 52 | std::cout << "Platform Name: " << platformName.c_str() << std::endl; 53 | break; 54 | } 55 | } 56 | if (i == platforms.size()) { 57 | std::cout << "Error: Failed to find Xilinx platform" << std::endl; 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | //Getting ACCELERATOR Devices and selecting 1st such device 62 | std::vector devices; 63 | OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); 64 | return devices; 65 | } 66 | 67 | char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb) 68 | { 69 | std::cout << "INFO: Reading " << xclbin_file_name << std::endl; 70 | 71 | if(access(xclbin_file_name.c_str(), R_OK) != 0) { 72 | printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); 73 | exit(EXIT_FAILURE); 74 | } 75 | //Loading XCL Bin into char buffer 76 | std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; 77 | std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 78 | bin_file.seekg (0, bin_file.end); 79 | nb = bin_file.tellg(); 80 
| bin_file.seekg (0, bin_file.beg); 81 | char *buf = new char [nb]; 82 | bin_file.read(buf, nb); 83 | return buf; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /FPGA/host/embedding_47_krnl/host.hpp: -------------------------------------------------------------------------------- 1 | #define CL_HPP_CL_1_2_DEFAULT_BUILD 2 | #define CL_HPP_TARGET_OPENCL_VERSION 120 3 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 4 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 5 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 6 | 7 | //OCL_CHECK doesn't work if call has templatized function call 8 | #define OCL_CHECK(error,call) \ 9 | call; \ 10 | if (error != CL_SUCCESS) { \ 11 | printf("%s:%d Error calling " #call ", error code is: %d\n", \ 12 | __FILE__,__LINE__, error); \ 13 | exit(EXIT_FAILURE); \ 14 | } 15 | #include "constants.hpp" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | // template 24 | // struct aligned_allocator 25 | // { 26 | // using value_type = T; 27 | // T* allocate(std::size_t num) 28 | // { 29 | // void* ptr = nullptr; 30 | // if (posix_memalign(&ptr,4096,num*sizeof(T))) 31 | // throw std::bad_alloc(); 32 | // return reinterpret_cast(ptr); 33 | // } 34 | // void deallocate(T* p, std::size_t num) 35 | // { 36 | // free(p); 37 | // } 38 | // }; 39 | 40 | std::vector get_devices(const std::string& vendor_name) { 41 | 42 | size_t i; 43 | cl_int err; 44 | std::vector platforms; 45 | OCL_CHECK(err, err = cl::Platform::get(&platforms)); 46 | cl::Platform platform; 47 | for (i = 0 ; i < platforms.size(); i++){ 48 | platform = platforms[i]; 49 | OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); 50 | if (platformName == vendor_name){ 51 | std::cout << "Found Platform" << std::endl; 52 | std::cout << "Platform Name: " << platformName.c_str() << std::endl; 53 | break; 54 | } 55 | } 56 | if (i == platforms.size()) { 57 | std::cout << "Error: Failed 
to find Xilinx platform" << std::endl; 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | //Getting ACCELERATOR Devices and selecting 1st such device 62 | std::vector devices; 63 | OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); 64 | return devices; 65 | } 66 | 67 | char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb) 68 | { 69 | std::cout << "INFO: Reading " << xclbin_file_name << std::endl; 70 | 71 | if(access(xclbin_file_name.c_str(), R_OK) != 0) { 72 | printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); 73 | exit(EXIT_FAILURE); 74 | } 75 | //Loading XCL Bin into char buffer 76 | std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; 77 | std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 78 | bin_file.seekg (0, bin_file.end); 79 | nb = bin_file.tellg(); 80 | bin_file.seekg (0, bin_file.beg); 81 | char *buf = new char [nb]; 82 | bin_file.read(buf, nb); 83 | return buf; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /FPGA/host/embedding_98_krnl/host.hpp: -------------------------------------------------------------------------------- 1 | #define CL_HPP_CL_1_2_DEFAULT_BUILD 2 | #define CL_HPP_TARGET_OPENCL_VERSION 120 3 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 4 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 5 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 6 | 7 | //OCL_CHECK doesn't work if call has templatized function call 8 | #define OCL_CHECK(error,call) \ 9 | call; \ 10 | if (error != CL_SUCCESS) { \ 11 | printf("%s:%d Error calling " #call ", error code is: %d\n", \ 12 | __FILE__,__LINE__, error); \ 13 | exit(EXIT_FAILURE); \ 14 | } 15 | #include "constants.hpp" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | // template 24 | // struct aligned_allocator 25 | // { 26 | // using value_type = T; 27 | // T* allocate(std::size_t num) 28 | // { 29 
| // void* ptr = nullptr; 30 | // if (posix_memalign(&ptr,4096,num*sizeof(T))) 31 | // throw std::bad_alloc(); 32 | // return reinterpret_cast(ptr); 33 | // } 34 | // void deallocate(T* p, std::size_t num) 35 | // { 36 | // free(p); 37 | // } 38 | // }; 39 | 40 | std::vector get_devices(const std::string& vendor_name) { 41 | 42 | size_t i; 43 | cl_int err; 44 | std::vector platforms; 45 | OCL_CHECK(err, err = cl::Platform::get(&platforms)); 46 | cl::Platform platform; 47 | for (i = 0 ; i < platforms.size(); i++){ 48 | platform = platforms[i]; 49 | OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); 50 | if (platformName == vendor_name){ 51 | std::cout << "Found Platform" << std::endl; 52 | std::cout << "Platform Name: " << platformName.c_str() << std::endl; 53 | break; 54 | } 55 | } 56 | if (i == platforms.size()) { 57 | std::cout << "Error: Failed to find Xilinx platform" << std::endl; 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | //Getting ACCELERATOR Devices and selecting 1st such device 62 | std::vector devices; 63 | OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); 64 | return devices; 65 | } 66 | 67 | char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb) 68 | { 69 | std::cout << "INFO: Reading " << xclbin_file_name << std::endl; 70 | 71 | if(access(xclbin_file_name.c_str(), R_OK) != 0) { 72 | printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); 73 | exit(EXIT_FAILURE); 74 | } 75 | //Loading XCL Bin into char buffer 76 | std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; 77 | std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 78 | bin_file.seekg (0, bin_file.end); 79 | nb = bin_file.tellg(); 80 | bin_file.seekg (0, bin_file.beg); 81 | char *buf = new char [nb]; 82 | bin_file.read(buf, nb); 83 | return buf; 84 | } 85 | 86 | -------------------------------------------------------------------------------- 
/FPGA/host/embedding_krnl/host.hpp: -------------------------------------------------------------------------------- 1 | #define CL_HPP_CL_1_2_DEFAULT_BUILD 2 | #define CL_HPP_TARGET_OPENCL_VERSION 120 3 | #define CL_HPP_MINIMUM_OPENCL_VERSION 120 4 | #define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1 5 | #define CL_USE_DEPRECATED_OPENCL_1_2_APIS 6 | 7 | //OCL_CHECK doesn't work if call has templatized function call 8 | #define OCL_CHECK(error,call) \ 9 | call; \ 10 | if (error != CL_SUCCESS) { \ 11 | printf("%s:%d Error calling " #call ", error code is: %d\n", \ 12 | __FILE__,__LINE__, error); \ 13 | exit(EXIT_FAILURE); \ 14 | } 15 | #include "constants.hpp" 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | // template 24 | // struct aligned_allocator 25 | // { 26 | // using value_type = T; 27 | // T* allocate(std::size_t num) 28 | // { 29 | // void* ptr = nullptr; 30 | // if (posix_memalign(&ptr,4096,num*sizeof(T))) 31 | // throw std::bad_alloc(); 32 | // return reinterpret_cast(ptr); 33 | // } 34 | // void deallocate(T* p, std::size_t num) 35 | // { 36 | // free(p); 37 | // } 38 | // }; 39 | 40 | std::vector get_devices(const std::string& vendor_name) { 41 | 42 | size_t i; 43 | cl_int err; 44 | std::vector platforms; 45 | OCL_CHECK(err, err = cl::Platform::get(&platforms)); 46 | cl::Platform platform; 47 | for (i = 0 ; i < platforms.size(); i++){ 48 | platform = platforms[i]; 49 | OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); 50 | if (platformName == vendor_name){ 51 | std::cout << "Found Platform" << std::endl; 52 | std::cout << "Platform Name: " << platformName.c_str() << std::endl; 53 | break; 54 | } 55 | } 56 | if (i == platforms.size()) { 57 | std::cout << "Error: Failed to find Xilinx platform" << std::endl; 58 | exit(EXIT_FAILURE); 59 | } 60 | 61 | //Getting ACCELERATOR Devices and selecting 1st such device 62 | std::vector devices; 63 | OCL_CHECK(err, err = 
platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); 64 | return devices; 65 | } 66 | 67 | char* read_binary_file(const std::string &xclbin_file_name, unsigned &nb) 68 | { 69 | std::cout << "INFO: Reading " << xclbin_file_name << std::endl; 70 | 71 | if(access(xclbin_file_name.c_str(), R_OK) != 0) { 72 | printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); 73 | exit(EXIT_FAILURE); 74 | } 75 | //Loading XCL Bin into char buffer 76 | std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; 77 | std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 78 | bin_file.seekg (0, bin_file.end); 79 | nb = bin_file.tellg(); 80 | bin_file.seekg (0, bin_file.beg); 81 | char *buf = new char [nb]; 82 | bin_file.read(buf, nb); 83 | return buf; 84 | } 85 | 86 | -------------------------------------------------------------------------------- /FPGA/common/utility/makefile_gen/descgen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from sys import argv 3 | import json 4 | import glob 5 | import os 6 | import re 7 | import subprocess 8 | 9 | script, desc_file = argv 10 | desc = open(desc_file, 'r') 11 | data = json.load(desc) 12 | desc.close() 13 | 14 | #top level list : dict_new 15 | dict_new = {} 16 | if 'example' in data: 17 | dict_new['name'] = data.pop('example') 18 | if 'overview' in data: 19 | dict_new['description'] = data.pop('overview') 20 | if 'board' in data: 21 | dict_new['device'] = data.pop('board') 22 | if 'nboard' in data: 23 | dict_new['ndevice'] = data.pop('nboard') 24 | if 'sdx_gui' in data: 25 | dict_new['gui'] = data.pop('sdx_gui') 26 | 27 | 28 | 29 | #host_list = [] 30 | host_dict = {} 31 | if 'host_exe' in data: 32 | host_dict['host_exe'] = data.pop('host_exe') 33 | 34 | #if 'host_srcs' in data: 35 | # srcs = data['host_srcs'].split(" ") 36 | # host_dict['sources'] = srcs 37 | # del data['host_srcs'] 38 | #if 'host_hdrs' in data: 39 | # 
#    hdrs = data['host_hdrs'].split(" ")
#    host_dict['sources'].extend(hdrs)
#    del data['host_hdrs']

#linker_list = []
#linker_dict = {}
#library_paths = []
#library_paths.append("REPO_DIR/common/libs/")
#linker_dict['librarypaths'] = library_paths
#if 'libs' in data:
#    library = []
#    for item in data['libs']:
#        library.append(item["name"])
#    linker_dict['libraries'] = library

# Move the optional top-level 'linker' section under host['linker'].
if 'linker' in data:
    host_dict['linker'] = dict(data.pop('linker'))


# Build host['compiler'] from whichever compiler-related keys are present.
#
# The previous guard, `'libs' or 'compiler' or 'host_srcs' or 'host_hdrs' in data`,
# was always true: `in` binds only to the last operand and the non-empty
# string 'libs' is truthy. Test membership of each key explicitly instead.
# NOTE(review): a description with none of these keys used to get
# host['compiler'] == {'sources': []}; it now gets no 'compiler' entry.
compiler_keys = ('libs', 'compiler', 'host_srcs', 'host_hdrs')
if any(key in data for key in compiler_keys):
    compiler_dict = {}
    srcs = []

    if 'libs' in data:
        # Every library is both an include path and a source entry.
        include_paths = ['REPO_DIR/common/includes/' + item
                         for item in data.pop('libs')]
        srcs.extend(include_paths)
        compiler_dict['includepaths'] = include_paths

    if 'compiler' in data:
        # 'compiler' is copied, not popped: it also stays at the top level
        # of the emitted JSON, exactly as before.
        compiler_dict.update(data['compiler'])

    if 'host_srcs' in data:
        srcs.extend(data.pop('host_srcs').split(" "))

    if 'host_hdrs' in data:
        srcs.extend(data.pop('host_hdrs').split(" "))

    compiler_dict['sources'] = srcs
    host_dict['compiler'] = compiler_dict


# A single launch entry that applies to every flow.
if 'cmd_args' in data:
    dict_new['launch'] = [{
        'name': 'generic launch for all flows',
        'cmd_args': data.pop('cmd_args'),
    }]


#host_list.append(host_dict)
dict_new['host'] = host_dict

dict_new['platform_type'] = "pcie" 107 | 108 | data.update(dict_new) 109 | 110 | target = open('description.json', 'w+') 111 | json.dump(data, target, indent=4, sort_keys=False) 112 | -------------------------------------------------------------------------------- /FPGA/common/includes/logger/logger.h: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #ifndef LOGGER_H_ 30 | #define LOGGER_H_ 31 | 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | 38 | #define ENABLE_LOG_TOFILE 1 39 | #define ENABLE_LOG_TIME 1 40 | 41 | //global logging 42 | #define LogInfo(desc, ...) sda::LogWrapper(0, __FILE__, __LINE__, desc, ##__VA_ARGS__) 43 | #define LogWarn(desc, ...) sda::LogWrapper(1, __FILE__, __LINE__, desc, ##__VA_ARGS__) 44 | #define LogError(desc, ...) sda::LogWrapper(2, __FILE__, __LINE__, desc, ##__VA_ARGS__) 45 | 46 | using namespace std; 47 | 48 | namespace sda { 49 | 50 | enum LOGTYPE {etInfo, etWarning, etError}; 51 | 52 | //string 53 | string& ltrim(string& s); 54 | string& rtrim(string& s); 55 | string& trim(string& s); 56 | string GetFileExt(const string& s); 57 | string GetFileTitleOnly(const string& s); 58 | 59 | string ToLower(const string& s); 60 | string ToUpper(const string& s); 61 | 62 | //time 63 | string GetTimeStamp(); 64 | 65 | //paths 66 | string GetApplicationPath(); 67 | 68 | 69 | //debug 70 | template 71 | void PrintPOD(const vector& pod, size_t display_count = 0, const int precision = 4) { 72 | 73 | size_t count = pod.size(); 74 | if(display_count > 0) 75 | count = std::min(pod.size(), display_count); 76 | 77 | for(size_t i = 0; i < count; i++) { 78 | cout << std::setprecision(precision) << pod[i] << ", "; 79 | } 80 | cout << endl; 81 | } 82 | 83 | //logging 84 | void LogWrapper(int etype, const char* file, int line, 
const char* desc, ...); 85 | 86 | } 87 | 88 | 89 | 90 | #endif /* LOGGER_H_ */ 91 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/hls_test_krnl/src/hls/in_casting_bench.hpp: -------------------------------------------------------------------------------- 1 | /************************************************ 2 | Copyright (c) 2018, Systems Group, ETH Zurich. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | ************************************************/ 29 | #pragma once 30 | 31 | #include "axi_utils.hpp" 32 | #include "toe.hpp" 33 | 34 | const unsigned DATA_WIDTH = 64 * 8; 35 | // #ifndef __SYNTHESIS__ 36 | // static const ap_uint<32> END_TIME = 1000; //1000000; 37 | // static const ap_uint<40> END_TIME_120 = 1000; 38 | 39 | // #else 40 | // static const ap_uint<32> END_TIME = 1546546546;//1501501501; 41 | // static const ap_uint<40> END_TIME_120 = 18750000000; 42 | // #endif 43 | 44 | 45 | void in_casting_bench( hls::stream >& listenPort, 46 | hls::stream& listenPortStatus, 47 | hls::stream& notifications, 48 | hls::stream& readRequest, 49 | hls::stream >& rxMetaData, 50 | hls::stream >& rxData, 51 | hls::stream& openConnection, 52 | hls::stream& openConStatus, 53 | hls::stream >& closeConnection, 54 | hls::stream& txMetaData, 55 | hls::stream >& txData, 56 | hls::stream& txStatus, 57 | ap_uint<1> runExperiment, 58 | ap_uint<16> useConn, 59 | ap_uint<16> useIpAddr, 60 | ap_uint<16> pkgWordCount, 61 | ap_uint<16> regBasePort, 62 | ap_uint<16> usePort, 63 | ap_uint<16> expectedRespInKBPerCon, 64 | ap_uint<1> finishExperiment, 65 | ap_uint<32> delayedCycles, 66 | ap_uint<32> regIpAddress0, 67 | ap_uint<32> regIpAddress1, 68 | ap_uint<32> regIpAddress2, 69 | ap_uint<32> regIpAddress3, 70 | ap_uint<32> regIpAddress4, 71 | ap_uint<32> regIpAddress5, 72 | ap_uint<32> regIpAddress6, 73 | ap_uint<32> regIpAddress7, 74 | ap_uint<32> regIpAddress8, 75 | ap_uint<32> regIpAddress9, 76 | ap_uint<32> regIpAddress10); -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/constant.h: -------------------------------------------------------------------------------- 1 | // Input: (INPUT_FEATURE_LEN, BATCH_SIZE) 2 | // Layer1: W1 * INPUT + B1 3 | // -> W1 (HIDDEN_SIZE1, INPUT_FEATURE_LEN) 4 | // -> B1 (HIDDEN_SIZE1) 5 | // -> Result1 (HIDDEN_SIZE1, BATCH_SIZE) 6 | // Layer2: W2 * Result1 + B2 7 | // -> W2 
(HIDDEN_SIZE2, HIDDEN_SIZE1) 8 | // -> B2 (HIDDEN_SIZE2) 9 | // -> Result2 (HIDDEN_SIZE2, BATCH_SIZE) 10 | // Layer3: W3 * Result2 + B3 11 | // -> W3 (HIDDEN_SIZE3, HIDDEN_SIZE2) 12 | // -> B3 (HIDDEN_SIZE3) 13 | // -> Result3 (HIDDEN_SIZE3, BATCH_SIZE) 14 | // Output Layer: W_OUT * Result3 + B_OUT 15 | // -> W_OUT (OUTPUT_FEATURE_LEN, HIDDEN_SIZE3) 16 | // -> B_OUT (OUTPUT_FEATURE_LEN) 17 | // -> Result_OUT (OUTPUT_FEATURE_LEN, BATCH_SIZE) 18 | 19 | ///////////// OPTION: small model 384 -> 512 ///////////// 20 | ///////////// OPTION: large model 876 -> 1024 ///////////// 21 | 22 | /// TODO: CHANGE THIS 23 | // #define INPUT_FEATURE_LEN 1024 24 | 25 | #define INPUT_FEATURE_LEN_RECEIVER 3968 26 | #define INPUT_FEATURE_LEN_FPGA_SENDER 1952 27 | #define INPUT_FEATURE_LEN_CPU_SENDER 64 28 | 29 | // BATCH_SIZE GIVEN IN constant.h, need to revisit the constant definition later 30 | #define HIDDEN_SIZE1 2048 // 1024 31 | #define HIDDEN_SIZE2 512 32 | #define HIDDEN_SIZE3 256 33 | #define OUTPUT_FEATURE_LEN 1 34 | 35 | /* constraint: SHM_DATA_SIZE === 1 GB */ 36 | /* FLOAT_SIZE * BATCH_SIZE * INPUT_DIM * BATCH_NUM_PER_LOOP = 1024 **3 */ 37 | #define FLOAT_SIZE 4 38 | #define BATCH_SIZE 512 // 256 39 | // maintain the same FIFO memory size 40 | // if batch size = 256, using 1024 as FIFO size, if batch size = 512, FIFO size = 512, etc. 
41 | // #define BATCH_NUM_PER_LOOP ((1024) / (BATCH_SIZE / 256)) // -> should be renamed as FIFO_BATCH_NUM 42 | #define BATCH_NUM_PER_LOOP 16//((1024 * 256) / (BATCH_SIZE)) // -> should be renamed as FIFO_BATCH_NUM 43 | 44 | // LOOP = number of GBs to perform 45 | #define LOOP_NUM 1// 16 46 | 47 | #define TOTAL_BATCH_NUM (BATCH_NUM_PER_LOOP * LOOP_NUM) 48 | /* matrix (batch * input_dim): 1024 * 1024, float: 4 byte, 1024 batches in queue */ 49 | /* 4 GB in total, (1024 * 1024 * 4 * 1024) */ 50 | 51 | /// TODO: CHANGE THIS 52 | // #define BLOCK_ENTRY_NUM (BATCH_SIZE * INPUT_FEATURE_LEN) 53 | // #define BLOCK_SIZE (BLOCK_ENTRY_NUM * FLOAT_SIZE) 54 | 55 | #define BLOCK_ENTRY_NUM_RECEIVER (BATCH_SIZE * INPUT_FEATURE_LEN_RECEIVER) 56 | #define BLOCK_SIZE_RECEIVER (BLOCK_ENTRY_NUM_RECEIVER * FLOAT_SIZE) 57 | 58 | #define BLOCK_ENTRY_NUM_FPGA_SENDER (BATCH_SIZE * INPUT_FEATURE_LEN_FPGA_SENDER) 59 | #define BLOCK_SIZE_FPGA_SENDER (BLOCK_ENTRY_NUM_FPGA_SENDER * FLOAT_SIZE) 60 | 61 | #define BLOCK_ENTRY_NUM_CPU_SENDER (BATCH_SIZE * INPUT_FEATURE_LEN_CPU_SENDER) 62 | #define BLOCK_SIZE_CPU_SENDER (BLOCK_ENTRY_NUM_CPU_SENDER * FLOAT_SIZE) 63 | 64 | // maximum shared memory size: 1 GB 65 | /// TODO: CHANGE THIS 66 | // #define SHM_DATA_SIZE (BLOCK_SIZE * BATCH_NUM_PER_LOOP) // TODO: may need to use smaller BATCH_NUM_PER_LOOP 67 | 68 | #define SHM_DATA_SIZE_FPGA (BLOCK_SIZE_FPGA_SENDER * BATCH_NUM_PER_LOOP) // TODO: may need to use smaller BATCH_NUM_PER_LOOP 69 | #define SHM_DATA_SIZE_CPU (BLOCK_SIZE_CPU_SENDER * BATCH_NUM_PER_LOOP) // TODO: may need to use smaller BATCH_NUM_PER_LOOP 70 | 71 | #define SHM_CONTROL_SIZE 1024 72 | 73 | #define THREAD_NUM 1 74 | 75 | /// TODO: CHANGE THIS 76 | // #define PORT 8080 77 | 78 | // Stream 0 port: PORT, Stream 1 port: PORT + 1, ... 
79 | #define PORT_FPGA_SENDER_0 8080 80 | #define PORT_FPGA_SENDER_1 8084 81 | #define PORT_CPU_SENDER_0 8088 82 | 83 | #define PORT_RECEIVER 8080 -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/src/hls/scatter.hpp: -------------------------------------------------------------------------------- 1 | /************************************************ 2 | Copyright (c) 2018, Systems Group, ETH Zurich. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | ************************************************/ 29 | #pragma once 30 | 31 | #include "scatter_config.hpp" 32 | #include "axi_utils.hpp" 33 | #include "packet.hpp" 34 | #include "toe.hpp" 35 | 36 | // #ifndef __SYNTHESIS__ 37 | // static const ap_uint<32> END_TIME = 1000; //1000000; 38 | // static const ap_uint<40> END_TIME_120 = 1000; 39 | 40 | // #else 41 | // static const ap_uint<32> END_TIME = 1546546546;//1501501501; 42 | // static const ap_uint<40> END_TIME_120 = 18750000000; 43 | // #endif 44 | 45 | 46 | void scatter( hls::stream >& listenPort, 47 | hls::stream& listenPortStatus, 48 | hls::stream& notifications, 49 | hls::stream& readRequest, 50 | hls::stream >& rxMetaData, 51 | hls::stream >& rxData, 52 | hls::stream& openConnection, 53 | hls::stream& openConStatus, 54 | hls::stream >& closeConnection, 55 | hls::stream& txMetaData, 56 | hls::stream >& txData, 57 | hls::stream& txStatus, 58 | ap_uint<1> runExperiment, 59 | ap_uint<16> useConn, 60 | ap_uint<16> useIpAddr, 61 | ap_uint<16> pkgWordCount, 62 | ap_uint<16> regBasePort, 63 | ap_uint<16> usePort, 64 | ap_uint<16> expectedRespInKBPerCon, 65 | ap_uint<1> finishExperiment, 66 | ap_uint<32> delayedCycles, 67 | ap_uint<32> clientPkgNum, 68 | ap_uint<32> regIpAddress0, 69 | ap_uint<32> regIpAddress1, 70 | ap_uint<32> regIpAddress2, 71 | ap_uint<32> regIpAddress3, 72 | ap_uint<32> regIpAddress4, 73 | ap_uint<32> regIpAddress5, 74 | ap_uint<32> regIpAddress6, 75 | 
ap_uint<32> regIpAddress7, 76 | ap_uint<32> regIpAddress8, 77 | ap_uint<32> regIpAddress9, 78 | ap_uint<32> regIpAddress10); -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/embedding_377_krnl/config_sp_embedding_377_krnl.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | [connectivity] 4 | slr=cmac_krnl_1:SLR2 5 | 6 | sp=embedding_377_krnl_1.table_HBM0:HBM[0] 7 | sp=embedding_377_krnl_1.table_HBM1:HBM[1] 8 | sp=embedding_377_krnl_1.table_HBM2:HBM[2] 9 | sp=embedding_377_krnl_1.table_HBM3:HBM[3] 10 | sp=embedding_377_krnl_1.table_HBM4:HBM[4] 11 | sp=embedding_377_krnl_1.table_HBM5:HBM[5] 12 | sp=embedding_377_krnl_1.table_HBM6:HBM[6] 13 | sp=embedding_377_krnl_1.table_HBM7:HBM[7] 14 | sp=embedding_377_krnl_1.table_HBM8:HBM[8] 15 | sp=embedding_377_krnl_1.table_HBM9:HBM[9] 16 | sp=embedding_377_krnl_1.table_HBM10:HBM[10] 17 | sp=embedding_377_krnl_1.table_HBM11:HBM[11] 18 | sp=embedding_377_krnl_1.table_HBM12:HBM[12] 19 | sp=embedding_377_krnl_1.table_HBM13:HBM[13] 20 | sp=embedding_377_krnl_1.table_HBM14:HBM[14] 21 | sp=embedding_377_krnl_1.table_HBM15:HBM[15] 22 | sp=embedding_377_krnl_1.table_HBM16:HBM[16] 23 | sp=embedding_377_krnl_1.table_HBM17:HBM[17] 24 | sp=embedding_377_krnl_1.table_HBM18:HBM[18] 25 | sp=embedding_377_krnl_1.table_HBM19:HBM[19] 26 | sp=embedding_377_krnl_1.table_HBM20:HBM[20] 27 | sp=embedding_377_krnl_1.table_HBM21:HBM[21] 28 | sp=embedding_377_krnl_1.table_HBM22:HBM[22] 29 | sp=embedding_377_krnl_1.table_HBM23:HBM[23] 30 | sp=embedding_377_krnl_1.table_HBM24:HBM[24] 31 | sp=embedding_377_krnl_1.table_HBM25:HBM[25] 32 | sp=embedding_377_krnl_1.table_HBM26:HBM[26] 33 | sp=embedding_377_krnl_1.table_HBM27:HBM[27] 34 | sp=embedding_377_krnl_1.table_DDR0:DDR[0] 35 | sp=embedding_377_krnl_1.table_DDR1:DDR[1] 36 | 37 | sp=network_krnl_1.m00_axi:HBM[28] 38 | sp=network_krnl_1.m01_axi:HBM[29] 39 | 40 | 
sc=network_krnl_1.m_axis_udp_rx:embedding_377_krnl_1.s_axis_udp_rx 41 | sc=network_krnl_1.m_axis_udp_rx_meta:embedding_377_krnl_1.s_axis_udp_rx_meta 42 | sc=network_krnl_1.m_axis_tcp_port_status:embedding_377_krnl_1.s_axis_tcp_port_status 43 | sc=network_krnl_1.m_axis_tcp_open_status:embedding_377_krnl_1.s_axis_tcp_open_status 44 | sc=network_krnl_1.m_axis_tcp_notification:embedding_377_krnl_1.s_axis_tcp_notification 45 | sc=network_krnl_1.m_axis_tcp_rx_meta:embedding_377_krnl_1.s_axis_tcp_rx_meta 46 | sc=network_krnl_1.m_axis_tcp_rx_data:embedding_377_krnl_1.s_axis_tcp_rx_data 47 | sc=network_krnl_1.m_axis_tcp_tx_status:embedding_377_krnl_1.s_axis_tcp_tx_status 48 | 49 | sc=embedding_377_krnl_1.m_axis_udp_tx:network_krnl_1.s_axis_udp_tx 50 | sc=embedding_377_krnl_1.m_axis_udp_tx_meta:network_krnl_1.s_axis_udp_tx_meta 51 | sc=embedding_377_krnl_1.m_axis_tcp_listen_port:network_krnl_1.s_axis_tcp_listen_port 52 | sc=embedding_377_krnl_1.m_axis_tcp_open_connection:network_krnl_1.s_axis_tcp_open_connection 53 | sc=embedding_377_krnl_1.m_axis_tcp_close_connection:network_krnl_1.s_axis_tcp_close_connection 54 | sc=embedding_377_krnl_1.m_axis_tcp_read_pkg:network_krnl_1.s_axis_tcp_read_pkg 55 | sc=embedding_377_krnl_1.m_axis_tcp_tx_meta:network_krnl_1.s_axis_tcp_tx_meta 56 | sc=embedding_377_krnl_1.m_axis_tcp_tx_data:network_krnl_1.s_axis_tcp_tx_data 57 | 58 | sc=cmac_krnl_1.axis_net_rx:network_krnl_1.axis_net_rx 59 | sc=network_krnl_1.axis_net_tx:cmac_krnl_1.axis_net_tx 60 | 61 | 62 | [vivado] 63 | #param=compiler.userPreSysLinkTcl=$(PWD)/tcl/plram.tcl 64 | param=route.enableGlobalHoldIter=true 65 | prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=SSI_SpreadLogic_high 66 | prop=run.impl_1.{STEPS.PHYS_OPT_DESIGN.IS_ENABLED}=true 67 | prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=ExploreWithHoldFix 68 | #prop=run.impl_1.{STEPS.PHYS_OPT_DESIGN.ARGS.MORE OPTIONS}={-hold_fix -slr_crossing_opt} 69 | prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=AlternateCLBRouting 
70 | #prop=run.impl_1.{STEPS.PHYS_OPT_DESIGN.ARGS.MORE OPTIONS}={-hold_fix} 71 | prop=run.impl_1.{STEPS.POST_ROUTE_PHYS_OPT_DESIGN.IS_ENABLED}=true 72 | prop=run.impl_1.{STEPS.POST_ROUTE_PHYS_OPT_DESIGN.ARGS.MORE OPTIONS}={-sll_reg_hold_fix -hold_fix -slr_crossing_opt} 73 | 74 | -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_3_nodes/single_connection_network_client_sender.c: -------------------------------------------------------------------------------- 1 | // Client side C/C++ program to demonstrate Socket programming 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "constant.h" 13 | 14 | 15 | struct Thread_info { 16 | int port; 17 | }; 18 | 19 | // A normal C function that is executed as a thread 20 | void *thread_send_packets(void* vargp) 21 | { 22 | struct Thread_info* t_info = (struct Thread_info*) vargp; 23 | printf("Printing Port from Thread %d\n", t_info -> port); 24 | 25 | 26 | int sock = 0, valread; 27 | struct sockaddr_in serv_addr; 28 | 29 | float array_buf[BLOCK_ENTRY_NUM]; 30 | for (int i = 0; i < BLOCK_ENTRY_NUM; i++) { 31 | array_buf[i] = 1; 32 | } 33 | 34 | if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) 35 | { 36 | printf("\n Socket creation error \n"); 37 | return -1; 38 | } 39 | 40 | serv_addr.sin_family = AF_INET; 41 | serv_addr.sin_port = htons(t_info -> port); 42 | 43 | // Convert IPv4 and IPv6 addresses from text to binary form 44 | if(inet_pton(AF_INET, "127.0.0.1", &serv_addr.sin_addr)<=0) 45 | // if(inet_pton(AF_INET, "10.128.0.11", &serv_addr.sin_addr)<=0) 46 | { 47 | printf("\nInvalid address/ Address not supported \n"); 48 | return -1; 49 | } 50 | 51 | if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr))<0) 52 | { 53 | printf("\nConnection Failed \n"); 54 | return -1; 55 | } 56 | 57 | printf("Start sending data.\n"); 58 | //////////////// Data transfer 
//////////////// 59 | int i = 0; 60 | float total_sent_bytes = 0.0; 61 | 62 | clock_t start = clock(); 63 | 64 | for (int i = 0; i < LOOP_NUM * BATCH_NUM_PER_LOOP; i++) { 65 | 66 | int total_sent_bytes = 0; 67 | 68 | while (total_sent_bytes < BLOCK_SIZE) { 69 | int sent_bytes = send(sock, array_buf + total_sent_bytes, BLOCK_SIZE - total_sent_bytes, 0); 70 | total_sent_bytes += sent_bytes; 71 | if (sent_bytes == -1) { 72 | printf("Sending data UNSUCCESSFUL!\n"); 73 | return -1; 74 | } 75 | } 76 | 77 | if (total_sent_bytes != BLOCK_SIZE) { 78 | printf("Sending error, sending more bytes than a block\n"); 79 | } 80 | } 81 | 82 | clock_t end = clock(); 83 | 84 | // Should wait until the server said all the data was sent correctly, 85 | // otherwise the sender may send packets yet the server did not receive. 86 | char msg[32]; 87 | int recv_bytes = read(sock, msg, 32); 88 | printf("received from server: %s\n", msg); 89 | 90 | float total_size = (float)LOOP_NUM * BATCH_NUM_PER_LOOP * BLOCK_SIZE; 91 | printf("Data sent. 
Packet number:%d\tPacket size:%d bytes\tTotal data:%fGB\n", 92 | LOOP_NUM * BATCH_NUM_PER_LOOP, BLOCK_SIZE, total_size / (1024 * 1024 * 1024)); 93 | float elapsed_time = (end-start) / (float)CLOCKS_PER_SEC; 94 | printf("\nConsumed time: %f seconds\n", elapsed_time); 95 | printf("Transfer Throughput: %f GB / sec\n", total_size / elapsed_time / 1024 / 1024 / 1024); 96 | 97 | return NULL; 98 | } 99 | 100 | int main(int argc, char const *argv[]) 101 | { 102 | 103 | pthread_t thread_id; 104 | printf("Before Thread\n"); 105 | 106 | struct Thread_info t_info_0; 107 | t_info_0.port = PORT; 108 | 109 | pthread_create(&thread_id, NULL, thread_send_packets, (void*) &t_info_0); 110 | // pthread_create(&thread_id, NULL, thread_send_packets, NULL); 111 | pthread_join(thread_id, NULL); 112 | printf("After Thread\n"); 113 | 114 | return 0; 115 | } -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_1_node_no_FIFO_scatter/single_connection_network_client_sender.c: -------------------------------------------------------------------------------- 1 | // Client side C/C++ program to demonstrate Socket programming 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "constant.h" 13 | 14 | 15 | struct Thread_info { 16 | int port; 17 | }; 18 | 19 | // A normal C function that is executed as a thread 20 | void *thread_send_packets(void* vargp) 21 | { 22 | struct Thread_info* t_info = (struct Thread_info*) vargp; 23 | printf("Printing Port from Thread %d\n", t_info -> port); 24 | 25 | 26 | int sock = 0, valread; 27 | struct sockaddr_in serv_addr; 28 | 29 | float array_buf[BLOCK_ENTRY_NUM]; 30 | for (int i = 0; i < BLOCK_ENTRY_NUM; i++) { 31 | array_buf[i] = 1; 32 | } 33 | 34 | if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) 35 | { 36 | printf("\n Socket creation error \n"); 37 | return -1; 38 | } 39 | 40 | serv_addr.sin_family = AF_INET; 41 | 
serv_addr.sin_port = htons(t_info -> port); 42 | 43 | // Convert IPv4 and IPv6 addresses from text to binary form 44 | if(inet_pton(AF_INET, "127.0.0.1", &serv_addr.sin_addr)<=0) 45 | // if(inet_pton(AF_INET, "10.128.0.11", &serv_addr.sin_addr)<=0) 46 | { 47 | printf("\nInvalid address/ Address not supported \n"); 48 | return -1; 49 | } 50 | 51 | if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr))<0) 52 | { 53 | printf("\nConnection Failed \n"); 54 | return -1; 55 | } 56 | 57 | printf("Start sending data.\n"); 58 | //////////////// Data transfer //////////////// 59 | int i = 0; 60 | float total_sent_bytes = 0.0; 61 | 62 | clock_t start = clock(); 63 | 64 | for (int i = 0; i < LOOP_NUM * BATCH_NUM_PER_LOOP; i++) { 65 | 66 | int total_sent_bytes = 0; 67 | 68 | while (total_sent_bytes < BLOCK_SIZE) { 69 | int sent_bytes = send(sock, array_buf + total_sent_bytes, BLOCK_SIZE - total_sent_bytes, 0); 70 | total_sent_bytes += sent_bytes; 71 | if (sent_bytes == -1) { 72 | printf("Sending data UNSUCCESSFUL!\n"); 73 | return -1; 74 | } 75 | } 76 | 77 | if (total_sent_bytes != BLOCK_SIZE) { 78 | printf("Sending error, sending more bytes than a block\n"); 79 | } 80 | } 81 | 82 | clock_t end = clock(); 83 | 84 | // Should wait until the server said all the data was sent correctly, 85 | // otherwise the sender may send packets yet the server did not receive. 86 | char msg[32]; 87 | int recv_bytes = read(sock, msg, 32); 88 | printf("received from server: %s\n", msg); 89 | 90 | float total_size = (float)LOOP_NUM * BATCH_NUM_PER_LOOP * BLOCK_SIZE; 91 | printf("Data sent. 
Packet number:%d\tPacket size:%d bytes\tTotal data:%fGB\n", 92 | LOOP_NUM * BATCH_NUM_PER_LOOP, BLOCK_SIZE, total_size / (1024 * 1024 * 1024)); 93 | float elapsed_time = (end-start) / (float)CLOCKS_PER_SEC; 94 | printf("\nConsumed time: %f seconds\n", elapsed_time); 95 | printf("Transfer Throughput: %f GB / sec\n", total_size / elapsed_time / 1024 / 1024 / 1024); 96 | 97 | return NULL; 98 | } 99 | 100 | int main(int argc, char const *argv[]) 101 | { 102 | 103 | pthread_t thread_id; 104 | printf("Before Thread\n"); 105 | 106 | struct Thread_info t_info_0; 107 | t_info_0.port = PORT; 108 | 109 | pthread_create(&thread_id, NULL, thread_send_packets, (void*) &t_info_0); 110 | // pthread_create(&thread_id, NULL, thread_send_packets, NULL); 111 | pthread_join(thread_id, NULL); 112 | printf("After Thread\n"); 113 | 114 | return 0; 115 | } -------------------------------------------------------------------------------- /GPU/final_network_cublasLt_3_nodes_no_FIFO_scatter/single_connection_network_client_sender.c: -------------------------------------------------------------------------------- 1 | // Client side C/C++ program to demonstrate Socket programming 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "constant.h" 13 | 14 | 15 | struct Thread_info { 16 | int port; 17 | }; 18 | 19 | // A normal C function that is executed as a thread 20 | void *thread_send_packets(void* vargp) 21 | { 22 | struct Thread_info* t_info = (struct Thread_info*) vargp; 23 | printf("Printing Port from Thread %d\n", t_info -> port); 24 | 25 | 26 | int sock = 0, valread; 27 | struct sockaddr_in serv_addr; 28 | 29 | float array_buf[BLOCK_ENTRY_NUM]; 30 | for (int i = 0; i < BLOCK_ENTRY_NUM; i++) { 31 | array_buf[i] = 1; 32 | } 33 | 34 | if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) 35 | { 36 | printf("\n Socket creation error \n"); 37 | return -1; 38 | } 39 | 40 | serv_addr.sin_family = AF_INET; 41 | 
serv_addr.sin_port = htons(t_info -> port); 42 | 43 | // Convert IPv4 and IPv6 addresses from text to binary form 44 | if(inet_pton(AF_INET, "127.0.0.1", &serv_addr.sin_addr)<=0) 45 | // if(inet_pton(AF_INET, "10.128.0.11", &serv_addr.sin_addr)<=0) 46 | { 47 | printf("\nInvalid address/ Address not supported \n"); 48 | return -1; 49 | } 50 | 51 | if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr))<0) 52 | { 53 | printf("\nConnection Failed \n"); 54 | return -1; 55 | } 56 | 57 | printf("Start sending data.\n"); 58 | //////////////// Data transfer //////////////// 59 | int i = 0; 60 | float total_sent_bytes = 0.0; 61 | 62 | clock_t start = clock(); 63 | 64 | for (int i = 0; i < LOOP_NUM * BATCH_NUM_PER_LOOP; i++) { 65 | 66 | int total_sent_bytes = 0; 67 | 68 | while (total_sent_bytes < BLOCK_SIZE) { 69 | int sent_bytes = send(sock, array_buf + total_sent_bytes, BLOCK_SIZE - total_sent_bytes, 0); 70 | total_sent_bytes += sent_bytes; 71 | if (sent_bytes == -1) { 72 | printf("Sending data UNSUCCESSFUL!\n"); 73 | return -1; 74 | } 75 | } 76 | 77 | if (total_sent_bytes != BLOCK_SIZE) { 78 | printf("Sending error, sending more bytes than a block\n"); 79 | } 80 | } 81 | 82 | clock_t end = clock(); 83 | 84 | // Should wait until the server said all the data was sent correctly, 85 | // otherwise the sender may send packets yet the server did not receive. 86 | char msg[32]; 87 | int recv_bytes = read(sock, msg, 32); 88 | printf("received from server: %s\n", msg); 89 | 90 | float total_size = (float)LOOP_NUM * BATCH_NUM_PER_LOOP * BLOCK_SIZE; 91 | printf("Data sent. 
Packet number:%d\tPacket size:%d bytes\tTotal data:%fGB\n", 92 | LOOP_NUM * BATCH_NUM_PER_LOOP, BLOCK_SIZE, total_size / (1024 * 1024 * 1024)); 93 | float elapsed_time = (end-start) / (float)CLOCKS_PER_SEC; 94 | printf("\nConsumed time: %f seconds\n", elapsed_time); 95 | printf("Transfer Throughput: %f GB / sec\n", total_size / elapsed_time / 1024 / 1024 / 1024); 96 | 97 | return NULL; 98 | } 99 | 100 | int main(int argc, char const *argv[]) 101 | { 102 | 103 | pthread_t thread_id; 104 | printf("Before Thread\n"); 105 | 106 | struct Thread_info t_info_0; 107 | t_info_0.port = PORT; 108 | 109 | pthread_create(&thread_id, NULL, thread_send_packets, (void*) &t_info_0); 110 | // pthread_create(&thread_id, NULL, thread_send_packets, NULL); 111 | pthread_join(thread_id, NULL); 112 | printf("After Thread\n"); 113 | 114 | return 0; 115 | } -------------------------------------------------------------------------------- /GPU/measure_network_cuda_cp_latency_single_node/single_connection_network_client_sender.c: -------------------------------------------------------------------------------- 1 | // Client side C/C++ program to demonstrate Socket programming 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "constant.h" 13 | 14 | 15 | struct Thread_info { 16 | int port; 17 | }; 18 | 19 | // A normal C function that is executed as a thread 20 | void *thread_send_packets(void* vargp) 21 | { 22 | struct Thread_info* t_info = (struct Thread_info*) vargp; 23 | printf("Printing Port from Thread %d\n", t_info -> port); 24 | 25 | 26 | int sock = 0, valread; 27 | struct sockaddr_in serv_addr; 28 | 29 | float array_buf[BLOCK_ENTRY_NUM]; 30 | for (int i = 0; i < BLOCK_ENTRY_NUM; i++) { 31 | array_buf[i] = 1; 32 | } 33 | 34 | if ((sock = socket(AF_INET, SOCK_STREAM, 0)) < 0) 35 | { 36 | printf("\n Socket creation error \n"); 37 | return -1; 38 | } 39 | 40 | serv_addr.sin_family = AF_INET; 41 | 
serv_addr.sin_port = htons(t_info -> port); 42 | 43 | // Convert IPv4 and IPv6 addresses from text to binary form 44 | if(inet_pton(AF_INET, "127.0.0.1", &serv_addr.sin_addr)<=0) 45 | // if(inet_pton(AF_INET, "10.128.0.11", &serv_addr.sin_addr)<=0) 46 | { 47 | printf("\nInvalid address/ Address not supported \n"); 48 | return -1; 49 | } 50 | 51 | if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr))<0) 52 | { 53 | printf("\nConnection Failed \n"); 54 | return -1; 55 | } 56 | 57 | printf("Start sending data.\n"); 58 | //////////////// Data transfer //////////////// 59 | int i = 0; 60 | float total_sent_bytes = 0.0; 61 | 62 | clock_t start = clock(); 63 | 64 | for (int i = 0; i < LOOP_NUM * BATCH_NUM_PER_LOOP; i++) { 65 | 66 | int total_sent_bytes = 0; 67 | 68 | while (total_sent_bytes < BLOCK_SIZE) { 69 | int sent_bytes = send(sock, array_buf + total_sent_bytes, BLOCK_SIZE - total_sent_bytes, 0); 70 | total_sent_bytes += sent_bytes; 71 | if (sent_bytes == -1) { 72 | printf("Sending data UNSUCCESSFUL!\n"); 73 | return -1; 74 | } 75 | } 76 | 77 | if (total_sent_bytes != BLOCK_SIZE) { 78 | printf("Sending error, sending more bytes than a block\n"); 79 | } 80 | } 81 | 82 | clock_t end = clock(); 83 | 84 | // Should wait until the server said all the data was sent correctly, 85 | // otherwise the sender may send packets yet the server did not receive. 86 | char msg[32]; 87 | int recv_bytes = read(sock, msg, 32); 88 | printf("received from server: %s\n", msg); 89 | 90 | float total_size = (float)LOOP_NUM * BATCH_NUM_PER_LOOP * BLOCK_SIZE; 91 | printf("Data sent. 
Packet number:%d\tPacket size:%d bytes\tTotal data:%fGB\n", 92 | LOOP_NUM * BATCH_NUM_PER_LOOP, BLOCK_SIZE, total_size / (1024 * 1024 * 1024)); 93 | float elapsed_time = (end-start) / (float)CLOCKS_PER_SEC; 94 | printf("\nConsumed time: %f seconds\n", elapsed_time); 95 | printf("Transfer Throughput: %f GB / sec\n", total_size / elapsed_time / 1024 / 1024 / 1024); 96 | 97 | return NULL; 98 | } 99 | 100 | int main(int argc, char const *argv[]) 101 | { 102 | 103 | pthread_t thread_id; 104 | printf("Before Thread\n"); 105 | 106 | struct Thread_info t_info_0; 107 | t_info_0.port = PORT; 108 | 109 | pthread_create(&thread_id, NULL, thread_send_packets, (void*) &t_info_0); 110 | // pthread_create(&thread_id, NULL, thread_send_packets, NULL); 111 | pthread_join(thread_id, NULL); 112 | printf("After Thread\n"); 113 | 114 | return 0; 115 | } -------------------------------------------------------------------------------- /FPGA/common/includes/bitmap/bitmap.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 
18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | **********/ 29 | #include 30 | #include 31 | #include 32 | #include 33 | 34 | #include 35 | 36 | #include "bitmap.h" 37 | 38 | BitmapInterface::BitmapInterface(const char *f) : filename(f) { 39 | core = NULL; 40 | dib = NULL; 41 | image = NULL; 42 | 43 | magicNumber = 0; 44 | fileSize = 0; 45 | offsetOfImage = 0; 46 | 47 | sizeOfDIB = 0; 48 | sizeOfImage = 0; 49 | 50 | height = -1; 51 | width = -1; 52 | } 53 | 54 | BitmapInterface::~BitmapInterface() { 55 | if (core != NULL) 56 | delete[] core; 57 | if (dib != NULL) 58 | delete[] dib; 59 | if (image != NULL) 60 | delete[] image; 61 | } 62 | 63 | bool BitmapInterface::readBitmapFile() { 64 | // First, open the bitmap file 65 | int fd; 66 | unsigned int fileSize; 67 | 68 | fd = open(filename, O_RDONLY); 69 | if (fd < 0) { 70 | std::cerr << "Cannot read image file " << filename << std::endl; 71 | return false; 72 | } 73 | 74 | core = new char[14]; 75 | read(fd, core, 14); 76 | magicNumber = (*(unsigned short *)(&(core[0]))); 77 | fileSize = (*(unsigned int *)(&(core[2]))); 78 | offsetOfImage = (*(unsigned int *)(&(core[10]))); 79 | 80 | // Just read in the DIB, but don't process it 81 | sizeOfDIB = offsetOfImage - 14; 82 | 
dib = new char[sizeOfDIB]; 83 | read(fd, dib, sizeOfDIB); 84 | 85 | width = (*(int *)(&(dib[4]))); 86 | height = (*(int *)(&(dib[8]))); 87 | 88 | sizeOfImage = fileSize - 14 - sizeOfDIB; 89 | int numPixels = sizeOfImage / 3; // RGB 90 | 91 | image = new int[numPixels]; 92 | 93 | for (int i = 0; i < numPixels; ++i) { 94 | // Use an integer for every pixel even though we might not need that 95 | // much space (padding 0 bits in the rest of the integer) 96 | image[i] = 0; 97 | read(fd, &(image[i]), 3); 98 | } 99 | 100 | return true; 101 | } 102 | 103 | bool BitmapInterface::writeBitmapFile(int *otherImage) { 104 | int fd; 105 | fd = open("output.bmp", O_WRONLY | O_CREAT, 0644); 106 | 107 | if (fd < 0) { 108 | std::cerr << "Cannot open output.bmp for writing!" << std::endl; 109 | return false; 110 | } 111 | 112 | write(fd, core, 14); 113 | write(fd, dib, sizeOfDIB); 114 | 115 | int numPixels = sizeOfImage / 3; 116 | 117 | int *outputImage = otherImage != NULL ? otherImage : image; 118 | 119 | for (int i = 0; i < numPixels; ++i) { 120 | write(fd, &(outputImage[i]), 3); 121 | } 122 | 123 | return true; 124 | } 125 | -------------------------------------------------------------------------------- /FPGA/common/includes/xcl2/xcl2.cpp: -------------------------------------------------------------------------------- 1 | /********** 2 | Copyright (c) 2019, Xilinx, Inc. 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without modification, 6 | are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. 
Neither the name of the copyright holder nor the names of its contributors 16 | may be used to endorse or promote products derived from this software 17 | without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 21 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 23 | INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 24 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 | HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 27 | EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | **********/ 29 | 30 | #include "xcl2.hpp" 31 | #include 32 | #include 33 | #include 34 | 35 | namespace xcl { 36 | std::vector get_devices(const std::string &vendor_name) { 37 | size_t i; 38 | cl_int err; 39 | std::vector platforms; 40 | OCL_CHECK(err, err = cl::Platform::get(&platforms)); 41 | cl::Platform platform; 42 | for (i = 0; i < platforms.size(); i++) { 43 | platform = platforms[i]; 44 | OCL_CHECK(err, 45 | std::string platformName = 46 | platform.getInfo(&err)); 47 | if (platformName == vendor_name) { 48 | std::cout << "Found Platform" << std::endl; 49 | std::cout << "Platform Name: " << platformName.c_str() << std::endl; 50 | break; 51 | } 52 | } 53 | if (i == platforms.size()) { 54 | std::cout << "Error: Failed to find Xilinx platform" << std::endl; 55 | exit(EXIT_FAILURE); 56 | } 57 | //Getting ACCELERATOR Devices and selecting 1st such device 58 | std::vector devices; 59 | OCL_CHECK(err, 60 | err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices)); 61 | return devices; 62 | } 63 | 64 | std::vector get_xil_devices() { return get_devices("Xilinx"); } 65 | 66 | std::vector 67 | read_binary_file(const std::string &xclbin_file_name) { 68 | std::cout << "INFO: Reading " << xclbin_file_name << std::endl; 69 | 70 | if (access(xclbin_file_name.c_str(), R_OK) != 0) { 71 | printf("ERROR: %s xclbin not available please build\n", 72 | xclbin_file_name.c_str()); 73 | exit(EXIT_FAILURE); 74 | } 75 | //Loading XCL Bin into char buffer 76 | std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; 77 | std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); 78 | bin_file.seekg(0, bin_file.end); 79 | auto nb = bin_file.tellg(); 80 | bin_file.seekg(0, bin_file.beg); 81 | std::vector buf; 82 | buf.resize(nb); 83 | bin_file.read(reinterpret_cast(buf.data()), nb); 84 | return buf; 85 | } 86 | 87 | bool is_emulation() { 88 | bool ret = false; 89 | char *xcl_mode = getenv("XCL_EMULATION_MODE"); 90 | if (xcl_mode != NULL) { 91 | ret = 
true; 92 | } 93 | return ret; 94 | } 95 | 96 | bool is_hw_emulation() { 97 | bool ret = false; 98 | char *xcl_mode = getenv("XCL_EMULATION_MODE"); 99 | if ((xcl_mode != NULL) && !strcmp(xcl_mode, "hw_emu")) { 100 | ret = true; 101 | } 102 | return ret; 103 | } 104 | 105 | bool is_xpr_device(const char *device_name) { 106 | const char *output = strstr(device_name, "xpr"); 107 | 108 | if (output == NULL) { 109 | return false; 110 | } else { 111 | return true; 112 | } 113 | } 114 | }; // namespace xcl 115 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/iperf_krnl/src/hls/packet.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | #ifndef PACKET_HPP 28 | #define PACKET_HPP 29 | 30 | #include "stdint.h" 31 | #include "axi_utils.hpp" 32 | 33 | using namespace hls; 34 | 35 | template 36 | class packetHeader { 37 | public: 38 | bool ready; 39 | uint16_t idx; 40 | ap_uint header; 41 | 42 | public: 43 | packetHeader() 44 | :ready(false), idx(0) {} 45 | packetHeader& operator=(const packetHeader& other) 46 | { 47 | ready = other.ready; 48 | idx = other.idx; 49 | header = other.header; 50 | return *this; 51 | } 52 | 53 | void parseWord(ap_uint& w) 54 | { 55 | if (ready) 56 | return; 57 | 58 | if (idx*W+W < HEADER_SIZE) 59 | { 60 | header(idx*W+W-1, idx*W) = w; 61 | } 62 | else //(idx*W+W >= HEADER_SIZE) 63 | { 64 | header(HEADER_SIZE-1, idx*W) = w; 65 | ready = true; 66 | } 67 | idx++; 68 | /*(header(idx*W+W-1, idx*W) = w; 69 | if (idx*W+W >= HEADER_SIZE) 70 | { 71 | ready = true; 72 | }*/ 73 | } 74 | ap_uint<8> consumeWord(ap_uint& w) 75 | { 76 | if ((idx+1)*W <= HEADER_SIZE) 77 | { 78 | w = header(((idx+1)*W)-1, idx*W); 79 | idx++; 80 | return ((HEADER_SIZE - (idx*W)) / 8); 81 | } 82 | else if (idx*W < HEADER_SIZE) 83 | { 84 | w((HEADER_SIZE%W)-1, 0) = header(HEADER_SIZE-1, idx*W); 85 | idx++; 86 | return 0;//(HEADER_SIZE - (idx*W)); 87 | } 88 | return 0; 89 | } 90 | /*bool consumeWord(ap_uint& w) 91 | { 92 | if ((idx+2)*W <= HEADER_SIZE) 93 | { 94 | w = header(((idx+1)*W)-1, idx*W); 95 | idx++; 96 | return false; 97 | /*if ((idx+1)*W > 
HEADER_SIZE) 98 | { 99 | return true; 100 | } 101 | else 102 | { 103 | return false; 104 | }*//* 105 | } 106 | else if ((idx+1)*W <= HEADER_SIZE) 107 | { 108 | w = header(((idx+1)*W)-1, idx*W); 109 | idx++; 110 | return true; 111 | } 112 | return true; 113 | }*/ 114 | /*void consumePartialWord(ap_uint& w) 115 | { 116 | if (idx*W < HEADER_SIZE) 117 | { 118 | w((HEADER_SIZE%AXI_WIDTH)-1, 0) = header(HEADER_SIZE-1, idx*W); 119 | idx++; 120 | } 121 | //return true; 122 | } 123 | /*bool consumeWord(ap_uint& w) 124 | { 125 | if ((idx+1)*W <= HEADER_SIZE) 126 | { 127 | w = header(((idx+1)*W)-1, idx*W); 128 | idx++; 129 | return true; 130 | } 131 | return false; 132 | } 133 | bool consumePartialWord(ap_uint& w) 134 | { 135 | if (idx*W < HEADER_SIZE) 136 | { 137 | w((HEADER_SIZE%AXI_WIDTH)-1, 0) = header(HEADER_SIZE-1, idx*W); 138 | idx++; 139 | return true; 140 | } 141 | return false; 142 | }*/ 143 | void setRawHeader(ap_uint h) 144 | { 145 | header = h; 146 | } 147 | ap_uint getRawHeader() 148 | { 149 | return header; 150 | } 151 | bool isReady() 152 | { 153 | return ready; 154 | } 155 | 156 | void clear() 157 | { 158 | #pragma HLS pipeline II=1 159 | //header = 0; 160 | ready = false; 161 | idx = 0; 162 | } 163 | }; 164 | 165 | #endif 166 | -------------------------------------------------------------------------------- /FPGA/kernel/user_krnl/scatter_krnl/src/hls/packet.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2019, Systems Group, ETH Zurich 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without modification, 6 | * are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, 9 | * this list of conditions and the following disclaimer. 10 | * 2. 
Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 3. Neither the name of the copyright holder nor the names of its contributors 14 | * may be used to endorse or promote products derived from this software 15 | * without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 19 | * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 | * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 | * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 23 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 25 | * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | #ifndef PACKET_HPP 28 | #define PACKET_HPP 29 | 30 | #include "stdint.h" 31 | #include "axi_utils.hpp" 32 | 33 | using namespace hls; 34 | 35 | template 36 | class packetHeader { 37 | public: 38 | bool ready; 39 | uint16_t idx; 40 | ap_uint header; 41 | 42 | public: 43 | packetHeader() 44 | :ready(false), idx(0) {} 45 | packetHeader& operator=(const packetHeader& other) 46 | { 47 | ready = other.ready; 48 | idx = other.idx; 49 | header = other.header; 50 | return *this; 51 | } 52 | 53 | void parseWord(ap_uint& w) 54 | { 55 | if (ready) 56 | return; 57 | 58 | if (idx*W+W < HEADER_SIZE) 59 | { 60 | header(idx*W+W-1, idx*W) = w; 61 | } 62 | else //(idx*W+W >= HEADER_SIZE) 63 | { 64 | header(HEADER_SIZE-1, idx*W) = w; 65 | ready = true; 66 | } 67 | idx++; 68 | /*(header(idx*W+W-1, idx*W) = w; 69 | if (idx*W+W >= HEADER_SIZE) 70 | { 71 | ready = true; 72 | }*/ 73 | } 74 | ap_uint<8> consumeWord(ap_uint& w) 75 | { 76 | if ((idx+1)*W <= HEADER_SIZE) 77 | { 78 | w = header(((idx+1)*W)-1, idx*W); 79 | idx++; 80 | return ((HEADER_SIZE - (idx*W)) / 8); 81 | } 82 | else if (idx*W < HEADER_SIZE) 83 | { 84 | w((HEADER_SIZE%W)-1, 0) = header(HEADER_SIZE-1, idx*W); 85 | idx++; 86 | return 0;//(HEADER_SIZE - (idx*W)); 87 | } 88 | return 0; 89 | } 90 | /*bool consumeWord(ap_uint& w) 91 | { 92 | if ((idx+2)*W <= HEADER_SIZE) 93 | { 94 | w = header(((idx+1)*W)-1, idx*W); 95 | idx++; 96 | return false; 97 | /*if ((idx+1)*W > HEADER_SIZE) 98 | { 99 | return true; 100 | } 101 | else 102 | { 103 | return false; 104 | }*//* 105 | } 106 | else if ((idx+1)*W <= HEADER_SIZE) 107 | { 108 | w = header(((idx+1)*W)-1, idx*W); 109 | idx++; 110 | return true; 111 | } 112 | return true; 113 | }*/ 114 | /*void consumePartialWord(ap_uint& w) 115 | { 116 | if (idx*W < HEADER_SIZE) 117 | { 118 | w((HEADER_SIZE%AXI_WIDTH)-1, 0) = header(HEADER_SIZE-1, idx*W); 119 | idx++; 120 | } 121 | //return true; 122 | } 123 | /*bool consumeWord(ap_uint& w) 124 | { 125 | if ((idx+1)*W <= HEADER_SIZE) 126 
def get_testcases(dir):
    """Return the path of every ``description.json`` found below *dir*."""
    found = []
    for parent, _subdirs, files in os.walk(dir):
        found.extend(os.path.join(parent, match)
                     for match in fnmatch.filter(files, 'description.json'))
    return found


def get_drives(dir):
    """Split a path into its components, outermost first.

    Components are peeled off the right-hand end; peeling stops at the first
    empty or "." component, so anything to its left is not returned.
    """
    parts = []
    remainder, tail = os.path.split(dir)
    while tail not in ("", "."):
        parts.append(tail)
        remainder, tail = os.path.split(remainder)
    return parts[::-1]


def get_immediate_subdirectories(dir):
    """Return the names of the directories located directly inside *dir*."""
    subdirs = []
    for entry in os.listdir(dir):
        if os.path.isdir(os.path.join(dir, entry)):
            subdirs.append(entry)
    return subdirs
def gen_category(dir, outfile, subdircount):
    """Write one Markdown table row per example found below *dir*.

    Every ``description.json`` under *dir* whose path has more than
    *subdircount* components produces a row of the form
    ``[link][]|description|key concepts and keywords`` on *outfile*.

    Args:
        dir: directory that is searched recursively for examples.
        outfile: writable text file object receiving the rows.
        subdircount: paths with this many components or fewer are skipped.

    Returns:
        The Markdown link reference definitions (one per line) for *dir*
        itself and for every example that was written.
    """
    links = "[" + dir + "]:" + dir + "\n"
    for testcase in sorted(get_testcases(dir)):
        drives = get_drives(testcase)
        if len(drives) <= subdircount:
            continue
        # Directory part of the path, with a trailing slash on every component.
        link = "".join(drive + "/" for drive in drives
                       if drive != "description.json")
        links += "[" + link + "]:" + link + "\n"

        # Parse first, inside a context manager, so the handle is released
        # even when the JSON is malformed and no partial row is emitted.
        with open(testcase, 'r') as desc:
            data = json.load(desc)

        outfile.write("[" + link + "][]")
        outfile.write("|")
        outfile.write('\n'.join(data["description"]))
        outfile.write("|")
        if 'key_concepts' in data:
            outfile.write("__Key__ __Concepts__")
            for kc in data["key_concepts"]:
                outfile.write("<br>")
                outfile.write(" - ")
                outfile.write(kc)
            outfile.write("<br>")
        if 'keywords' in data:
            outfile.write("__Keywords__")
            for kw in data["keywords"]:
                outfile.write("<br>")
                outfile.write(" - ")
                outfile.write(kw)
        outfile.write("\n")
    return links
") 65 | outfile.write(" - ") 66 | outfile.write(kw) 67 | outfile.write("\n") 68 | desc.close() 69 | return links 70 | 71 | def genReadMe(dir): 72 | desc = open(os.path.join(dir,"summary.json"),'r') 73 | data = json.load(desc) 74 | outfile = open(os.path.join(dir, "README.md"), "w") 75 | outfile.write(('\n').join((data["description"]))) 76 | outfile.write("\n") 77 | outfile.write("==================================\n") 78 | outfile.write(('\n').join((data["description"]))) 79 | outfile.write("\n") 80 | if 'subdirs' in data: 81 | subDirs = data['subdirs']; 82 | else: 83 | subDirs = get_immediate_subdirectories(dir); 84 | subDirs.sort(); 85 | outfile.write("\nS.No. | Category | Description \n") 86 | outfile.write("--------|-----------|-----------------------------------------\n") 87 | counter = 1; 88 | links = "" 89 | 90 | for subDir in subDirs: 91 | desc_file = os.path.join(subDir,"summary.json") 92 | if os.path.exists(desc_file): 93 | subDirDesc = open(os.path.join(subDir,"summary.json"),'r') 94 | subDirData = json.load(subDirDesc) 95 | outfile.write(str(counter)); 96 | outfile.write(" | [" +subDir +"][] |") 97 | outfile.write(('\n').join(subDirData["description"])) 98 | outfile.write("\n") 99 | counter = counter + 1; 100 | 101 | outfile.write("\n __Examples Table__ \n") 102 | table_header = """ 103 | Example | Description | Key Concepts / Keywords 104 | ---------------|-----------------------|------------------------- 105 | """ 106 | outfile.write(table_header) 107 | for subDir in subDirs: 108 | links = links + gen_category(subDir,outfile,2); 109 | 110 | outfile.write("\n") 111 | outfile.write(links) 112 | outfile.close(); 113 | 114 | def genReadMe2(dir): 115 | desc = open(os.path.join(dir,"summary.json"),'r') 116 | data = json.load(desc) 117 | outfile = open(os.path.join(dir, "README.md"), "w") 118 | outfile.write(('\n').join((data["overview"]))) 119 | outfile.write("\n") 120 | outfile.write("==================================\n") 121 | 
outfile.write(('\n').join((data["description"]))) 122 | outfile.write("\n") 123 | outfile.write("\n __Examples Table__ \n") 124 | table_header = """ 125 | Example | Description | Key Concepts / Keywords 126 | ---------------|-----------------------|------------------------- 127 | """ 128 | outfile.write(table_header) 129 | links = gen_category(dir,outfile,1) 130 | outfile.write("\n") 131 | outfile.write(links) 132 | outfile.close(); 133 | 134 | --------------------------------------------------------------------------------