├── data
│   └── .keep
├── host_srcs
│   ├── inaccel
│   │   ├── runtime-api.h
│   │   ├── runtime-api.cpp
│   │   ├── runtime.h
│   │   └── runtime.cpp
│   ├── common
│   │   ├── INcl.h
│   │   └── INcl.cpp
│   └── LogisticRegression.cpp
├── Makefile
├── README.md
├── kernel_srcs
│   ├── Gradients_0.cpp
│   ├── Gradients_1.cpp
│   ├── Gradients_2.cpp
│   └── Gradients_3.cpp
└── LICENSE
--------------------------------------------------------------------------------
/data/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/host_srcs/inaccel/runtime-api.h:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef RUNTIME_API_H
#define RUNTIME_API_H

#include "runtime.h"

// InAccel function calls.
class InAccel {

public:
  // Creates the world.
  static cl_world create_world(int device_id);

  // Allocates a new buffer.
  static void *malloc(cl_world world, size_t size, int memory_id);

  // Transfers data to a previously allocated buffer.
  static void memcpy_to(cl_world world, void *dst_ptr, size_t offset,
                        void *src_ptr, size_t size);

  // Creates a new program.
  static void create_program(cl_world world, const char *bitstream_name);

  // Creates a new engine.
  static cl_engine create_engine(cl_world world, const char *kernel_name);

  // Sets an engine argument using a buffer.
  static void set_engine_arg(cl_engine engine, int index, void *buffer);

  // Sets an engine argument using an int value.
  static void set_engine_arg(cl_engine engine, int index, int value);

  // Sets an engine argument using a long value.
  static void set_engine_arg(cl_engine engine, int index, long value);

  // Sets an engine argument using a float value.
  static void set_engine_arg(cl_engine engine, int index, float value);

  // Sets an engine argument using a double value.
  static void set_engine_arg(cl_engine engine, int index, double value);

  // Runs an engine.
  static void run_engine(cl_engine engine);

  // Awaits an engine.
  static void await_engine(cl_engine engine);

  // Releases an engine.
  static void release_engine(cl_engine engine);

  // Releases a program.
  static void release_program(cl_world world);

  // Transfers data from a previously allocated buffer.
  static void memcpy_from(cl_world world, void *src_ptr, size_t offset,
                          void *dst_ptr, size_t size);

  // Frees a buffer.
  static void free(cl_world world, void *ptr);

  // Releases the world.
  static void release_world(cl_world world);
};

#endif
--------------------------------------------------------------------------------
/host_srcs/inaccel/runtime-api.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include "runtime-api.h"

// Creates the world.
cl_world InAccel::create_world(int device_id) {
  cl_world world = CreateWorld();

  GetPlatformID(world);

  GetDeviceID(world, (cl_uint)device_id);

  CreateContext(world);

  return world;
}

// Allocates a new buffer.
void *InAccel::malloc(cl_world world, size_t size, int memory_id) {
  return CreateBuffer(world, size, (cl_uint)memory_id);
}

// Transfers data to a previously allocated buffer.
void InAccel::memcpy_to(cl_world world, void *dst_ptr, size_t offset,
                        void *src_ptr, size_t size) {
  cl_command_queue command_queue = CreateCommandQueue(world);

  EnqueueMemcpyTo(command_queue, dst_ptr, offset, src_ptr, size);

  ReleaseCommandQueue(command_queue);
}

// Creates a new program.
void InAccel::create_program(cl_world world, const char *bitstream_name) {
  CreateProgram(world, bitstream_name);
}

// Creates a new engine.
cl_engine InAccel::create_engine(cl_world world, const char *kernel_name) {
  return CreateEngine(world, kernel_name);
}

// Sets an engine argument using a buffer.
void InAccel::set_engine_arg(cl_engine engine, int index, void *buffer) {
  SetEngineArgPointer(engine, (cl_uint)index, buffer);
}

// Sets an engine argument using an int value.
void InAccel::set_engine_arg(cl_engine engine, int index, int value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(int), &value);
}

// Sets an engine argument using a long value.
void InAccel::set_engine_arg(cl_engine engine, int index, long value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(long), &value);
}

// Sets an engine argument using a float value.
void InAccel::set_engine_arg(cl_engine engine, int index, float value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(float), &value);
}

// Sets an engine argument using a double value.
void InAccel::set_engine_arg(cl_engine engine, int index, double value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(double), &value);
}

// Runs an engine.
void InAccel::run_engine(cl_engine engine) { EnqueueEngine(engine); }

// Awaits an engine.
void InAccel::await_engine(cl_engine engine) { BlockEngine(engine); }

// Releases an engine.
void InAccel::release_engine(cl_engine engine) { ReleaseEngine(engine); }

// Releases a program.
92 | void InAccel::release_program(cl_world world) { ReleaseProgram(world); } 93 | 94 | // Transfers data from a previously allocated buffer. 95 | void InAccel::memcpy_from(cl_world world, void *src_ptr, size_t offset, 96 | void *dst_ptr, size_t size) { 97 | cl_command_queue command_queue = CreateCommandQueue(world); 98 | 99 | EnqueueMemcpyFrom(command_queue, src_ptr, offset, dst_ptr, size); 100 | 101 | ReleaseCommandQueue(command_queue); 102 | } 103 | 104 | // Frees a buffer. 105 | void InAccel::free(cl_world world, void *ptr) { ReleaseBuffer(world, ptr); } 106 | 107 | // Releases the world. 108 | void InAccel::release_world(cl_world world) { 109 | ReleaseContext(world); 110 | 111 | ReleaseWorld(world); 112 | } 113 | -------------------------------------------------------------------------------- /host_srcs/inaccel/runtime.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2019 InAccel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | #ifndef RUNTIME_H 18 | #define RUNTIME_H 19 | 20 | #include "common/INcl.h" 21 | 22 | // Packs a world struct. 23 | cl_world PackWorld(_cl_world *_world); 24 | 25 | // Unpacks a world struct. 26 | _cl_world *UnpackWorld(cl_world world); 27 | 28 | // Packs an engine struct. 29 | cl_engine PackEngine(_cl_engine *_engine); 30 | 31 | // Unpacks an engine struct. 32 | _cl_engine *UnpackEngine(cl_engine engine); 33 | 34 | // Transforms an engine to the world. 35 | cl_world EngineToWorld(cl_engine engine); 36 | 37 | // Creates the world struct. 38 | cl_world CreateWorld(); 39 | 40 | // Obtains the platform id. 41 | void GetPlatformID(cl_world world); 42 | 43 | // Obtains the specified device id. 44 | void GetDeviceID(cl_world world, cl_uint id); 45 | 46 | // Creates the context. 47 | void CreateContext(cl_world world); 48 | 49 | // Creates a program with the specified name. 50 | void CreateProgram(cl_world world, const char *bitstream_name); 51 | 52 | // Creates a command queue. 53 | cl_command_queue CreateCommandQueue(cl_world world); 54 | 55 | // Blocks until all tasks in a command queue have been completed. 56 | void BlockCommandQueue(cl_command_queue command_queue); 57 | 58 | // Releases a command queue. 59 | void ReleaseCommandQueue(cl_command_queue command_queue); 60 | 61 | // Allocates a memory buffer. 62 | void *CreateBuffer(cl_world world, size_t size, cl_uint memory); 63 | 64 | // Enqueues a memory copy operation to device. 65 | void EnqueueMemcpyTo(cl_command_queue command_queue, void *dst_ptr, size_t offset, void *src_ptr, size_t size); 66 | 67 | // Enqueues a memory copy operation from device. 68 | void EnqueueMemcpyFrom(cl_command_queue command_queue, void *src_ptr, size_t offset, void *dst_ptr, size_t size); 69 | 70 | // Frees a memory buffer. 71 | void ReleaseBuffer(cl_world world, void *ptr); 72 | 73 | // Creates a kernel with the specified name. 74 | cl_kernel CreateKernel(cl_world world, const char *kernel_name); 75 | 76 | // Sets a pointer kernel argument. 
77 | void SetKernelArgPointer(cl_kernel kernel, cl_uint arg_index, const void *arg_value); 78 | 79 | // Sets a scalar kernel argument. 80 | void SetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value); 81 | 82 | // Enqueues a kernel operation (Task mode). 83 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel); 84 | 85 | // Enqueues a kernel operation (NDRangeKernel mode). 86 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel, const size_t *global_work_size, const size_t *local_work_size); 87 | 88 | // Releases a kernel. 89 | void ReleaseKernel(cl_kernel kernel); 90 | 91 | // Creates an engine struct with the specified name. 92 | cl_engine CreateEngine(cl_world world, const char *kernel_name); 93 | 94 | // Blocks until all tasks in an engine struct have been completed. 95 | void BlockEngine(cl_engine engine); 96 | 97 | // Sets a pointer engine struct argument. 98 | void SetEngineArgPointer(cl_engine engine, cl_uint arg_index, const void *arg_value); 99 | 100 | // Sets a scalar engine struct argument. 101 | void SetEngineArg(cl_engine engine, cl_uint arg_index, size_t arg_size, const void *arg_value); 102 | 103 | // Enqueues an engine struct operation (Task mode). 104 | void EnqueueEngine(cl_engine engine); 105 | 106 | // Enqueues an engine struct operation (NDRangeKernel mode). 107 | void EnqueueEngine(cl_engine engine, const size_t *global_work_size, const size_t *local_work_size); 108 | 109 | // Releases an engine struct. 110 | void ReleaseEngine(cl_engine engine); 111 | 112 | // Releases a program. 113 | void ReleaseProgram(cl_world world); 114 | 115 | // Releases the context. 116 | void ReleaseContext(cl_world world); 117 | 118 | // Releases the world struct. 119 | void ReleaseWorld(cl_world world); 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef XILINX_SDX 2 | $(error XILINX_SDX is not set) 3 | endif 4 | 5 | ifndef AWS_PLATFORM 6 | $(error AWS_PLATFORM is not set) 7 | endif 8 | 9 | # Host compiler global settings 10 | CC = g++ -O3 -Wno-deprecated-declarations 11 | 12 | CLCC = xocc 13 | 14 | BITSTREAM_NAME = Gradients 15 | HOST_EXE = ${BITSTREAM_NAME} 16 | 17 | PLATFORM = ${AWS_PLATFORM} 18 | 19 | HOST_DIR = host_srcs 20 | KERNEL_DIR = kernel_srcs 21 | KERNEL_TYPE = cpp 22 | 23 | # Host and Kernel sources 24 | HOST_SRCS = $(wildcard $(HOST_DIR)/*/*.cpp) $(wildcard $(HOST_DIR)/*.cpp) 25 | KERNEL_SRCS_CPP = $(wildcard $(KERNEL_DIR)/*.cpp) 26 | 27 | HOST_OBJECTS := $(HOST_SRCS:.cpp=.o) 28 | KERNEL_OBJECTS := $(KERNEL_SRCS_CPP:.cpp=.xo) 29 | ESTIMATE_OBJCTS := $(KERNEL_SRCS_CPP:.cpp=.estimate) 30 | 31 | # Include Libraries 32 | HOST_CFLAGS = -O3 -Wall -I${XILINX_SDX}/runtime/include/1_2 -Ihost_srcs 33 | HOST_LFLAGS = -L${XILINX_XRT}/lib -lxilinxopencl 34 | 35 | # Connecting kernels to specific memory banks 36 | BANKS = --sp Gradients_0_1.m_axi_gmem0:bank0 --sp Gradients_0_1.m_axi_gmem1:bank0 --sp Gradients_0_1.m_axi_gmem2:bank0 --sp Gradients_0_1.m_axi_gmem3:bank0 --sp Gradients_1_1.m_axi_gmem0:bank1 --sp Gradients_1_1.m_axi_gmem1:bank1 --sp Gradients_1_1.m_axi_gmem2:bank1 --sp Gradients_1_1.m_axi_gmem3:bank1 --sp Gradients_2_1.m_axi_gmem0:bank2 --sp Gradients_2_1.m_axi_gmem1:bank2 --sp Gradients_2_1.m_axi_gmem2:bank2 --sp Gradients_2_1.m_axi_gmem3:bank2 --sp Gradients_3_1.m_axi_gmem0:bank3 --sp Gradients_3_1.m_axi_gmem1:bank3 --sp 
Gradients_3_1.m_axi_gmem2:bank3 --sp Gradients_3_1.m_axi_gmem3:bank3

# Additional Vivado options
VIVADO_OPTS = --xp misc:enableGlobalHoldIter="True" --xp vivado_prop:run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=NoTimingRelaxation

SDA_FLOW = sw_emu
ifeq (${SDA_FLOW},sw_emu)
    TARGET = -t sw_emu
else ifeq (${SDA_FLOW},hw_emu)
    TARGET = -t hw_emu
else ifeq (${SDA_FLOW},hw)
    TARGET = -t hw
endif

all:
	make _TEST_="-D _TEST_" host

host: ${HOST_EXE}

xbin_sw_em:
	@+make SDA_FLOW=sw_emu xbin

xbin_hw_em:
	@+make SDA_FLOW=hw_emu xbin

xbin_hw:
	@+make SDA_FLOW=hw xbin

run_sw_em:
	@+make SDA_FLOW=sw_emu run_sem

run_hw_em:
	@+make SDA_FLOW=hw_emu run_hem

run_sem: xconfig host xbin
	XCL_EMULATION_MODE=sw_emu ./${HOST_EXE} 1

run_hem: xconfig host xbin
	XCL_EMULATION_MODE=hw_emu ./${HOST_EXE} 1

xconfig:
	emconfigutil --platform ${PLATFORM} --od . --nd 1

# Building host
${HOST_EXE}: ${HOST_OBJECTS}
	${CC} ${HOST_OBJECTS} ${HOST_LFLAGS} -o $@
	${RM} -rf ${HOST_OBJECTS}

xbin: ${KERNEL_OBJECTS}
	${CLCC} ${TARGET} --link -s --platform ${PLATFORM} ${VIVADO_OPTS} ${BANKS} ${KERNEL_OBJECTS} -o ${BITSTREAM_NAME}.xclbin
	${RM} -rf ${KERNEL_OBJECTS}

estimate: ${ESTIMATE_OBJCTS}
	${RM} -rf $(patsubst %.estimate,%.xo,$(ESTIMATE_OBJCTS))

%.o: %.cpp
	${CC} ${_TEST_} ${HOST_CFLAGS} -c $< -o $@

# Building kernel
%.xo: %.cpp
	${CLCC} ${TARGET} --save-temps --platform ${PLATFORM} --kernel $(notdir $(basename $<)) -c $< -o $@

%.estimate: %.${KERNEL_TYPE}
	${CLCC} --target hw_emu --report_level estimate --save-temps --platform ${PLATFORM} --kernel $(notdir $(basename $<)) -c $< -o $(basename $<).xo

clean:
	${RM} -rf ${HOST_EXE} $(patsubst %.estimate,%.xo,$(ESTIMATE_OBJCTS)) ${KERNEL_OBJECTS} ${HOST_OBJECTS} emconfig.json *.log *.dir *.xml *.dcp *.dat _sds iprepo *.tcl xilinx_aws-vu9p-f1_dynamic_5_0.hpfm .Xil sdaccel_* system_estimate.xtxt _x top_sp.ltx

cleanall: clean
	${RM} -rf ${BITSTREAM_NAME}*

help:
	@echo "Compile and run CPU emulation"
	@echo "make run_sw_em"
	@echo ""
	@echo "Compile and run hardware emulation"
	@echo "make run_hw_em"
	@echo ""
	@echo "Compile host executable only"
	@echo "make host"
	@echo ""
	@echo "Compile host executable only for SW version"
	@echo "make"
	@echo ""
	@echo "Compile .xclbin file for system run only"
	@echo "make xbin_hw"
	@echo ""
	@echo "Compile .xclbin file for sw emulation"
	@echo "make xbin_sw_em"
	@echo ""
	@echo "Compile .xclbin file for hw emulation"
	@echo "make xbin_hw_em"
	@echo ""
	@echo "Clean working directory"
	@echo "make clean"
	@echo ""
	@echo "Clean working directory and bitstream files"
	@echo "make cleanall"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------


InAccel

# Logistic Regression IP core

This is an FPGA-accelerated solution for the Logistic Regression BGD algorithm. It can provide up to **70x** speedup compared to a single-threaded execution and up to **12x** compared to an 8-threaded Intel Xeon CPU execution.

## Specifications

| Classes  | Features   |
| :------: | :--------: |
| up to 64 | up to 2047 |

## Supported Platforms

| Board                       |
| :-------------------------: |
| [Xilinx Alveo U200](https://www.xilinx.com/products/boards-and-kits/alveo/u200.html) |
| [Xilinx Alveo U250](https://www.xilinx.com/products/boards-and-kits/alveo/u250.html) |
| [AWS VU9P (F1 instances)](https://aws.amazon.com/ec2/instance-types/f1/) |
| Alibaba VU9P (F3 instances) |
| Any other Xilinx platform with at least the same amount of VU9P resources |

## Design Files

- The application code is located in the host_srcs directory. Accelerator kernel files are located under the kernel_srcs directory, while any accelerator binaries will be compiled to the current directory.
- The Makefile will help you generate the host executable and the accelerator _.xclbin_ files.

A listing of all the files in this repository is shown below:

- Makefile
- host_srcs/
  - LogisticRegression.cpp
  - common/
    - INcl.cpp (OpenCL wrapper functions)
    - INcl.h
  - inaccel/
    - runtime-api.cpp (InAccel runtime abstraction layer)
    - runtime-api.h
    - runtime.cpp (InAccel runtime abstraction layer)
    - runtime.h
- kernel_srcs/
  - Gradients_0.cpp (Accelerated kernel)
  - Gradients_1.cpp (Accelerated kernel)
  - Gradients_2.cpp (Accelerated kernel)
  - Gradients_3.cpp (Accelerated kernel)
- data/

## Preparation

**!** Before invoking any of the Makefile targets make sure you have sourced the Xilinx **XRT** setup script.
**!** Make sure you have set the **XILINX_SDX** environment variable pointing to the SDx installation directory.

As far as the **platform** (or board) is concerned, the Makefile uses the **AWS_PLATFORM** environment variable as the target platform for the kernel compilation. If you are running this on AWS, make sure the AWS_PLATFORM environment variable is present and points to the platform DSA files¹. Otherwise you can set the Makefile `PLATFORM` variable to point to your platform DSA files.

1. To obtain the AWS platform DSA files make sure you have cloned the aws-fpga GitHub repository.

Download the NIST letters train and test datasets to the data directory. Navigate to the data directory and execute the following commands:

``` bash
wget https://s3.amazonaws.com/inaccel-demo/data/nist/letters_csv_train.dat
wget https://s3.amazonaws.com/inaccel-demo/data/nist/letters_csv_test.dat
```

## Compiling the kernels

To compile the kernels for the hardware target you just need to execute `make xbin_hw`, while for software and hardware emulation you must execute `make xbin_sw_em` and `make xbin_hw_em` respectively.
A full list of all the available Makefile targets can be printed with the `make help` command.

## Single-thread - Single-application Execution

To test the generated xclbin file you can simply run the `make host` command to create the host application. The host application takes only one input argument, the number of iterations.
Example execution: `./Gradients 100`
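
The host source itself (`LogisticRegression.cpp`) is not reproduced in this listing, but the call sequence it has to follow is fully determined by the `InAccel` wrapper in `host_srcs/inaccel/runtime-api.h` and the kernel signature `Gradients_*(_labels, _data, _weights, _gradients, numClasses, numFeatures, chunkSize)`. The sketch below shows that flow for a single `Gradients_0` engine; the dataset dimensions, the `Gradients.xclbin` file name and the omitted host-side arrays are illustrative assumptions, not code taken from this repository:

```cpp
#include "inaccel/runtime-api.h"

int main() {
  // Illustrative sizes: 26 classes (letters), 784 features, 1024 points
  // (chunkSize must be a multiple of 8, the kernel's batch size).
  const int numClasses = 26, numFeatures = 784, chunkSize = 1024;

  // One 512-bit "float16" word packs 16 floats (64 bytes per word).
  size_t wordsPerPoint = ((size_t)numFeatures + 1 + 15) / 16;
  size_t dataBytes     = (size_t)chunkSize * wordsPerPoint * 64;
  size_t labelsBytes   = (size_t)chunkSize * 4;  // one 32-bit label per point
  size_t modelBytes    = (size_t)numClasses * wordsPerPoint * 64;

  cl_world world = InAccel::create_world(0);           // device 0
  InAccel::create_program(world, "Gradients.xclbin");  // bitstream name assumed

  // Gradients_0's AXI ports are all wired to bank 0 (see the Makefile --sp switches).
  void *labels    = InAccel::malloc(world, labelsBytes, 0);
  void *data      = InAccel::malloc(world, dataBytes, 0);
  void *weights   = InAccel::malloc(world, modelBytes, 0);
  void *gradients = InAccel::malloc(world, modelBytes, 0);

  // InAccel::memcpy_to(world, labels, 0, host_labels, labelsBytes);  // etc.

  cl_engine engine = InAccel::create_engine(world, "Gradients_0");
  InAccel::set_engine_arg(engine, 0, labels);
  InAccel::set_engine_arg(engine, 1, data);
  InAccel::set_engine_arg(engine, 2, weights);
  InAccel::set_engine_arg(engine, 3, gradients);
  InAccel::set_engine_arg(engine, 4, numClasses);
  InAccel::set_engine_arg(engine, 5, numFeatures);
  InAccel::set_engine_arg(engine, 6, chunkSize);

  InAccel::run_engine(engine);
  InAccel::await_engine(engine);

  // InAccel::memcpy_from(world, gradients, 0, host_gradients, modelBytes);

  InAccel::release_engine(engine);
  InAccel::release_program(world);
  InAccel::free(world, labels);
  InAccel::free(world, data);
  InAccel::free(world, weights);
  InAccel::free(world, gradients);
  InAccel::release_world(world);
}
```

Because the Makefile binds each `Gradients_N` kernel to its own DDR bank, the same pattern extends naturally to four engines working on independent dataset chunks in parallel, which is the reason the design ships four identical kernels.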

## Scaling Up and Out with InAccel Coral

InAccel Coral

The above example application spawns a single thread and can train a model using a single FPGA device, which **is not viable for datacenter-scale needs**. Data scientists rely on frameworks like Scikit-learn and Apache Spark to create and test their machine learning pipelines.
**InAccel Coral** FPGA resource manager is able to automatically **scale** and **schedule** any acceleration requests to a **cluster of FPGAs**, perform **load balancing**, **reconfigure** the FPGA devices, perform **memory management** etc., while providing a simple-to-use **high-level API** in Java, C++ and Python.
We also offer ready-to-use **integrations** with broadly used open source frameworks like Apache Spark to seamlessly accelerate your pipelines.
Finally, Coral is fully compatible with **Kubernetes**: using InAccel's device plugin you can set up a Kubernetes cluster that is aware of hardware-accelerated resources, or take advantage of a **serverless architecture** and provide accelerated serverless solutions to your own customers.

* You can **create a free InAccel Coral license** [here](https://www.inaccel.com/license/).
* You can **download** the InAccel Coral docker image from [dockerhub](https://hub.docker.com/r/inaccel/coral).
* You can find **full documentation** as well as a **quick starting guide** in [InAccel Docs](https://docs.inaccel.com/).
--------------------------------------------------------------------------------
/host_srcs/common/INcl.h:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef INCL_H
#define INCL_H

#include <CL/opencl.h>
#include <stdint.h>

// InAccelCL world struct (Type).
typedef struct {
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_context context;
  cl_program program;
} _cl_world;

// InAccelCL world struct (API Type).
typedef uintptr_t cl_world;

// InAccelCL engine struct (Type).
typedef struct {
  cl_world world;

  cl_command_queue command_queue;
  cl_kernel kernel;
} _cl_engine;

// InAccelCL engine struct (API Type).
typedef uintptr_t cl_engine;

// Builds a program executable from the program binary.
void INclBuildProgram(cl_program program);

// Creates a buffer object.
cl_mem INclCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr);

// Creates a command queue on a specific device.
cl_command_queue INclCreateCommandQueue(cl_context context, cl_device_id device);

// Creates an OpenCL context.
cl_context INclCreateContext(const cl_device_id device);

// Creates a kernel object.
58 | cl_kernel INclCreateKernel(cl_program program, const char *kernel_name); 59 | 60 | // Creates a program object for a context, and loads specified binary data into the program object. 61 | cl_program INclCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id *device_list, const char *binary_name); 62 | 63 | // Enqueues a command to map a region of the buffer object given by buffer into the host address space and returns a pointer to this mapped region. 64 | void *INclEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_map_flags map_flags, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 65 | 66 | // Enqueues a command to indicate which device a set of memory objects should be associated with. 67 | void INclEnqueueMigrateMemObjects(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem *mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 68 | 69 | // Enqueues a command to execute a kernel on a device. 70 | void INclEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 71 | 72 | // Enqueue commands to read from a buffer object to host memory. 73 | void INclEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 74 | 75 | // Enqueues a command to execute a kernel on a device. 76 | void INclEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 77 | 78 | // Enqueue commands to write to a buffer object from host memory. 79 | void INclEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 80 | 81 | // Blocks until all previously queued OpenCL commands in a command-queue are issued to the associated device and have completed. 82 | void INclFinish(cl_command_queue command_queue); 83 | 84 | // Issues all previously queued OpenCL commands in a command-queue to the device associated with the command-queue. 85 | void INclFlush(cl_command_queue command_queue); 86 | 87 | // Obtain specified device, if available. 88 | cl_device_id INclGetDeviceID(cl_platform_id platform, cl_uint id); 89 | 90 | // Obtain the list of devices available on a platform. 91 | void INclGetDeviceIDs(cl_platform_id platform, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); 92 | 93 | // Get specific information about the OpenCL device. 94 | void INclGetDeviceInfo(cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); 95 | 96 | // Obtain platform, if available. 97 | cl_platform_id INclGetPlatformID(); 98 | 99 | // Obtain the list of platforms available. 100 | void INclGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms); 101 | 102 | // Get specific information about the OpenCL platform. 103 | void INclGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); 104 | 105 | // Decrements the command_queue reference count. 
void INclReleaseCommandQueue(cl_command_queue command_queue);

// Decrements the context reference count.
void INclReleaseContext(cl_context context);

// Decrements the event reference count.
void INclReleaseEvent(cl_event event);

// Decrements the kernel reference count.
void INclReleaseKernel(cl_kernel kernel);

// Decrements the memory object reference count.
void INclReleaseMemObject(cl_mem memobj);

// Decrements the program reference count.
void INclReleaseProgram(cl_program program);

// Sets the argument value for a specific argument of a kernel.
void INclSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);

// Waits on the host thread for commands identified by event objects to complete.
void INclWaitForEvents(cl_uint num_events, const cl_event *event_list);

// Returns a message related to the error code.
const char *INclCheckErrorCode(cl_int errcode);

#endif
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_0.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.
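// Data layout note: a "float16" is an ap_int<512> word that packs 16 IEEE-754
// single-precision values (a "float8" packs 8), so every AXI beat moves a full
// 64-byte vector. The .range((t + 1) * 32 - 1, t * 32) slices below extract
// one 32-bit lane from such a word, and the int/float unions (converter1,
// converter2) reinterpret those raw bits as floats without a numeric
// conversion.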
36 | 37 | extern "C" { 38 | void Gradients_0(float8 *_labels, float16 *_data, float16 *_weights, 39 | float16 *_gradients, int numClasses, int numFeatures, 40 | int chunkSize) { 41 | 42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0 43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1 44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2 45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3 46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control 47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control 48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control 49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control 50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control 51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control 52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control 53 | #pragma HLS INTERFACE s_axilite port = return bundle = control 54 | 55 | float16 features[chunk][numFeaturesPlusOneMax], 56 | weights[numClassesMax][numFeaturesPlusOneMax], 57 | gradients[numClassesMax][numFeaturesPlusOneMax]; 58 | float lin[numClassesMax][chunk * vectorSize]; 59 | float prd[chunk][numClassesMax]; 60 | 61 | // Using URAMs for features, weights and gradients buffers 62 | #pragma HLS resource variable = features core = XPM_MEMORY uram 63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram 64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram 65 | 66 | // Partitioning the local arrays 67 | #pragma HLS array_partition variable = features complete dim = 1 68 | #pragma HLS array_partition variable = lin complete dim = 2 69 | #pragma HLS array_partition variable = prd complete dim = 1 70 | 71 | // Compute the number of features iterations for float16 input data 72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 
13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32); 168 | converter2.asFloat += prd[c][k] * converter1.asFloat; 169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt; 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | // Write back gradients 177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 178 | kj++, j++) { 179 | #pragma HLS pipeline II = 1 180 | if (j == numFeaturesPlusOne) { 181 | j = 0; 182 | k++; 183 | } 184 | _gradients[kj] = 
gradients[k][j];
  }
}
}
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_1.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.

extern "C" {
void Gradients_1(float8 *_labels, float16 *_data, float16 *_weights,
                 float16 *_gradients, int numClasses, int numFeatures,
                 int chunkSize) {

#pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
#pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
#pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
#pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
#pragma HLS INTERFACE s_axilite port = _labels bundle = control
#pragma HLS INTERFACE s_axilite port = _data bundle = control
#pragma HLS INTERFACE s_axilite port = _weights bundle = control
#pragma HLS INTERFACE s_axilite port = _gradients bundle = control
#pragma HLS INTERFACE s_axilite port = numClasses bundle = control
#pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
#pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control

  float16 features[chunk][numFeaturesPlusOneMax],
      weights[numClassesMax][numFeaturesPlusOneMax],
      gradients[numClassesMax][numFeaturesPlusOneMax];
  float lin[numClassesMax][chunk * vectorSize];
  float prd[chunk][numClassesMax];

// Using URAMs for features, weights and gradients buffers
#pragma HLS resource variable = features core = XPM_MEMORY uram
#pragma HLS resource variable = weights core = XPM_MEMORY uram
#pragma HLS resource variable = gradients core = XPM_MEMORY uram

// Partitioning the local arrays
#pragma HLS array_partition variable = features complete dim = 1
#pragma HLS array_partition variable = lin complete dim = 2
#pragma HLS array_partition variable = prd complete dim = 1

  // Compute the number of features iterations for float16 input data
  // (e.g.
numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32); 168 | converter2.asFloat += prd[c][k] * converter1.asFloat; 169 | gradients[k][j].range((t 
+ 1) * 32 - 1, t * 32) = converter2.asInt;
          }
        }
      }
    }
  }

  // Write back gradients
  for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
       kj++, j++) {
#pragma HLS pipeline II = 1
    if (j == numFeaturesPlusOne) {
      j = 0;
      k++;
    }
    _gradients[kj] = gradients[k][j];
  }
}
}
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_2.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.

extern "C" {
void Gradients_2(float8 *_labels, float16 *_data, float16 *_weights,
                 float16 *_gradients, int numClasses, int numFeatures,
                 int chunkSize) {

#pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
#pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
#pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
#pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
#pragma HLS INTERFACE s_axilite port = _labels bundle = control
#pragma HLS INTERFACE s_axilite port = _data bundle = control
#pragma HLS INTERFACE s_axilite port = _weights bundle = control
#pragma HLS INTERFACE s_axilite port = _gradients bundle = control
#pragma HLS INTERFACE s_axilite port = numClasses bundle = control
#pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
#pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control

  float16 features[chunk][numFeaturesPlusOneMax],
      weights[numClassesMax][numFeaturesPlusOneMax],
      gradients[numClassesMax][numFeaturesPlusOneMax];
  float lin[numClassesMax][chunk * vectorSize];
  float prd[chunk][numClassesMax];

// Using URAMs for features, weights and gradients buffers
#pragma HLS resource variable = features core = XPM_MEMORY uram
#pragma HLS resource variable = weights core = XPM_MEMORY uram
#pragma HLS resource variable = gradients core = XPM_MEMORY uram

// Partitioning the local arrays
#pragma HLS array_partition variable = features complete dim = 1
#pragma HLS array_partition variable = lin complete dim = 2
#pragma HLS array_partition variable = prd
complete dim = 1 70 | 71 | // Compute the number of features iterations for float16 input data 72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) 
* 32 - 1, t * 32);
            converter2.asFloat += prd[c][k] * converter1.asFloat;
            gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt;
          }
        }
      }
    }
  }

  // Write back gradients
  for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
       kj++, j++) {
#pragma HLS pipeline II = 1
    if (j == numFeaturesPlusOne) {
      j = 0;
      k++;
    }
    _gradients[kj] = gradients[k][j];
  }
}
}
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_3.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.
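// Note: Gradients_0 through Gradients_3 are four identical replicas of this
// kernel. Per the --sp switches in the Makefile, each replica's AXI masters
// are wired to a different DDR bank (bank0 through bank3), so the host can
// run all four engines concurrently on independent chunks of the dataset.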
36 | 37 | extern "C" { 38 | void Gradients_3(float8 *_labels, float16 *_data, float16 *_weights, 39 | float16 *_gradients, int numClasses, int numFeatures, 40 | int chunkSize) { 41 | 42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0 43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1 44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2 45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3 46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control 47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control 48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control 49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control 50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control 51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control 52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control 53 | #pragma HLS INTERFACE s_axilite port = return bundle = control 54 | 55 | float16 features[chunk][numFeaturesPlusOneMax], 56 | weights[numClassesMax][numFeaturesPlusOneMax], 57 | gradients[numClassesMax][numFeaturesPlusOneMax]; 58 | float lin[numClassesMax][chunk * vectorSize]; 59 | float prd[chunk][numClassesMax]; 60 | 61 | // Using URAMs for features, weights and gradients buffers 62 | #pragma HLS resource variable = features core = XPM_MEMORY uram 63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram 64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram 65 | 66 | // Partitioning the local arrays 67 | #pragma HLS array_partition variable = features complete dim = 1 68 | #pragma HLS array_partition variable = lin complete dim = 2 69 | #pragma HLS array_partition variable = prd complete dim = 1 70 | 71 | // Compute the number of features iterations for float16 input data 72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 
13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32); 168 | converter2.asFloat += prd[c][k] * converter1.asFloat; 169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt; 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | // Write back gradients 177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 178 | kj++, j++) { 179 | #pragma HLS pipeline II = 1 180 | if (j == numFeaturesPlusOne) { 181 | j = 0; 182 | k++; 183 | } 184 | _gradients[kj] = 
gradients[k][j];
185 | }
186 | }
187 | }
188 |
-------------------------------------------------------------------------------- /host_srcs/inaccel/runtime.cpp: --------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include "runtime.h"
18 |
19 | // Packs a world struct.
20 | cl_world PackWorld(_cl_world *_world) { return (cl_world)_world; }
21 |
22 | // Unpacks a world struct.
23 | _cl_world *UnpackWorld(cl_world world) { return (_cl_world *)world; }
24 |
25 | // Packs an engine struct.
26 | cl_engine PackEngine(_cl_engine *_engine) { return (cl_engine)_engine; }
27 |
28 | // Unpacks an engine struct.
29 | _cl_engine *UnpackEngine(cl_engine engine) { return (_cl_engine *)engine; }
30 |
31 | // Returns the world associated with an engine.
32 | cl_world EngineToWorld(cl_engine engine) { return UnpackEngine(engine)->world; }
33 |
34 | // Creates the world struct.
35 | cl_world CreateWorld() {
36 | _cl_world *_world = (_cl_world *)malloc(sizeof(_cl_world));
37 |
38 | return PackWorld(_world);
39 | }
40 |
41 | // Obtains the platform id.
42 | void GetPlatformID(cl_world world) {
43 | _cl_world *_world = UnpackWorld(world);
44 |
45 | _world->platform_id = INclGetPlatformID();
46 | }
47 |
48 | // Obtains the specified device id.
49 | void GetDeviceID(cl_world world, cl_uint id) {
50 | _cl_world *_world = UnpackWorld(world);
51 |
52 | _world->device_id = INclGetDeviceID(_world->platform_id, id);
53 | }
54 |
55 | // Creates the context.
56 | void CreateContext(cl_world world) {
57 | _cl_world *_world = UnpackWorld(world);
58 |
59 | _world->context = INclCreateContext(_world->device_id);
60 | }
61 |
62 | // Creates a program with the specified name.
63 | void CreateProgram(cl_world world, const char *bitstream_name) {
64 | _cl_world *_world = UnpackWorld(world);
65 |
66 | _world->program = INclCreateProgramWithBinary(
67 | _world->context, 1, &_world->device_id, bitstream_name);
68 |
69 | INclBuildProgram(_world->program);
70 | }
71 |
72 | // Creates a command queue.
73 | cl_command_queue CreateCommandQueue(cl_world world) {
74 | _cl_world *_world = UnpackWorld(world);
75 |
76 | return INclCreateCommandQueue(_world->context, _world->device_id);
77 | }
78 |
79 | // Blocks until all tasks in a command queue have been completed.
80 | void BlockCommandQueue(cl_command_queue command_queue) {
81 | INclFlush(command_queue);
82 | INclFinish(command_queue);
83 | }
84 |
85 | // Releases a command queue.
86 | void ReleaseCommandQueue(cl_command_queue command_queue) {
87 | BlockCommandQueue(command_queue);
88 |
89 | INclReleaseCommandQueue(command_queue);
90 | }
91 |
92 | // Allocates a memory buffer.
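// The implementation below relies on the Xilinx OpenCL memory extensions: a
// locally re-declared cl_mem_ext_ptr_t struct is passed as the host_ptr
// argument, and CL_MEM_EXT_PTR (bit 31) is added to the buffer flags so the
// runtime treats that pointer as an extension descriptor rather than as host
// memory. The (1 << memory) value selects the DDR bank the buffer lives in,
// mirroring the XCL_MEM_DDR_BANKn flags of the vendor's cl_ext.h -- a sketch
// of the mapping, assuming the standard XRT definitions:
//
//   memory = 0  ->  flags = 1 << 0 = 0x1  (XCL_MEM_DDR_BANK0)
//   memory = 3  ->  flags = 1 << 3 = 0x8  (XCL_MEM_DDR_BANK3)
//
// This is how the host code pins each kernel's buffers to a separate bank
// (InAccel::malloc(world, size, i) with i = 0..3).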
93 | void *CreateBuffer(cl_world world, size_t size, cl_uint memory) { 94 | _cl_world *_world = UnpackWorld(world); 95 | 96 | cl_uint CL_MEM_EXT_PTR = 1 << 31; 97 | 98 | typedef struct { 99 | unsigned flags; 100 | void *obj; 101 | void *param; 102 | } cl_mem_ext_ptr_t; 103 | 104 | cl_uint CL_MEMORY = 1 << memory; 105 | 106 | cl_mem_ext_ptr_t buffer; 107 | buffer.flags = CL_MEMORY; 108 | buffer.obj = NULL; 109 | buffer.param = 0; 110 | 111 | return (void *)INclCreateBuffer( 112 | _world->context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR, size, &buffer); 113 | } 114 | 115 | // Enqueues a memory copy operation to device. 116 | void EnqueueMemcpyTo(cl_command_queue command_queue, void *dst_ptr, 117 | size_t offset, void *src_ptr, size_t size) { 118 | INclEnqueueWriteBuffer(command_queue, (cl_mem)dst_ptr, offset, size, src_ptr, 119 | 0, NULL, NULL); 120 | } 121 | 122 | // Enqueues a memory copy operation from device. 123 | void EnqueueMemcpyFrom(cl_command_queue command_queue, void *src_ptr, 124 | size_t offset, void *dst_ptr, size_t size) { 125 | INclEnqueueReadBuffer(command_queue, (cl_mem)src_ptr, offset, size, dst_ptr, 126 | 0, NULL, NULL); 127 | } 128 | 129 | // Frees a memory buffer. 130 | void ReleaseBuffer(cl_world world, void *ptr) { 131 | INclReleaseMemObject((cl_mem)ptr); 132 | } 133 | 134 | // Creates a kernel with the specified name. 135 | cl_kernel CreateKernel(cl_world world, const char *kernel_name) { 136 | _cl_world *_world = UnpackWorld(world); 137 | 138 | return INclCreateKernel(_world->program, kernel_name); 139 | } 140 | 141 | // Sets a pointer kernel argument. 142 | void SetKernelArgPointer(cl_kernel kernel, cl_uint arg_index, 143 | const void *arg_value) { 144 | INclSetKernelArg(kernel, arg_index, sizeof(cl_mem), &arg_value); 145 | } 146 | 147 | // Sets a scalar kernel argument. 148 | void SetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, 149 | const void *arg_value) { 150 | INclSetKernelArg(kernel, arg_index, arg_size, arg_value); 151 | } 152 | 153 | // Enqueues a kernel operation (Task mode). 154 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel) { 155 | INclEnqueueTask(command_queue, kernel, 0, NULL, NULL); 156 | } 157 | 158 | // Enqueues a kernel operation (NDRangeKernel mode). 159 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel, 160 | const size_t *global_work_size, 161 | const size_t *local_work_size) { 162 | INclEnqueueNDRangeKernel(command_queue, kernel, 3, global_work_size, 163 | local_work_size, 0, NULL, NULL); 164 | } 165 | 166 | // Releases a kernel. 167 | void ReleaseKernel(cl_kernel kernel) { INclReleaseKernel(kernel); } 168 | 169 | // Creates an engine struct with the specified name. 170 | cl_engine CreateEngine(cl_world world, const char *kernel_name) { 171 | _cl_engine *_engine = (_cl_engine *)malloc(sizeof(_cl_engine)); 172 | 173 | _engine->world = world; 174 | 175 | _engine->command_queue = CreateCommandQueue(world); 176 | _engine->kernel = CreateKernel(world, kernel_name); 177 | 178 | return PackEngine(_engine); 179 | } 180 | 181 | // Blocks until all tasks in an engine struct have been completed. 182 | void BlockEngine(cl_engine engine) { 183 | _cl_engine *_engine = UnpackEngine(engine); 184 | 185 | BlockCommandQueue(_engine->command_queue); 186 | } 187 | 188 | // Sets a pointer engine struct argument. 
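// Note that for buffer arguments OpenCL expects the *address of* the cl_mem
// handle, which is why SetKernelArgPointer above forwards sizeof(cl_mem) and
// &arg_value to INclSetKernelArg. A sketch of the equivalent direct call,
// assuming mem is a cl_mem returned by CreateBuffer:
//
//   INclSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);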
189 | void SetEngineArgPointer(cl_engine engine, cl_uint arg_index, 190 | const void *arg_value) { 191 | _cl_engine *_engine = UnpackEngine(engine); 192 | 193 | SetKernelArgPointer(_engine->kernel, arg_index, arg_value); 194 | } 195 | 196 | // Sets a scalar engine struct argument. 197 | void SetEngineArg(cl_engine engine, cl_uint arg_index, size_t arg_size, 198 | const void *arg_value) { 199 | _cl_engine *_engine = UnpackEngine(engine); 200 | 201 | SetKernelArg(_engine->kernel, arg_index, arg_size, arg_value); 202 | } 203 | 204 | // Enqueues an engine struct operation (Task mode). 205 | void EnqueueEngine(cl_engine engine) { 206 | _cl_engine *_engine = UnpackEngine(engine); 207 | 208 | EnqueueKernel(_engine->command_queue, _engine->kernel); 209 | } 210 | 211 | // Enqueues an engine struct operation (NDRangeKernel mode). 212 | void EnqueueEngine(cl_engine engine, const size_t *global_work_size, 213 | const size_t *local_work_size) { 214 | _cl_engine *_engine = UnpackEngine(engine); 215 | 216 | EnqueueKernel(_engine->command_queue, _engine->kernel, global_work_size, 217 | local_work_size); 218 | } 219 | 220 | // Releases an engine struct. 221 | void ReleaseEngine(cl_engine engine) { 222 | _cl_engine *_engine = UnpackEngine(engine); 223 | 224 | ReleaseCommandQueue(_engine->command_queue); 225 | ReleaseKernel(_engine->kernel); 226 | 227 | free(_engine); 228 | } 229 | 230 | // Releases a program. 231 | void ReleaseProgram(cl_world world) { 232 | _cl_world *_world = UnpackWorld(world); 233 | 234 | INclReleaseProgram(_world->program); 235 | } 236 | 237 | // Releases the context. 238 | void ReleaseContext(cl_world world) { 239 | _cl_world *_world = UnpackWorld(world); 240 | 241 | INclReleaseContext(_world->context); 242 | } 243 | 244 | // Releases the world struct. 245 | void ReleaseWorld(cl_world world) { 246 | _cl_world *_world = UnpackWorld(world); 247 | 248 | free(_world); 249 | } 250 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
-------------------------------------------------------------------------------- /host_srcs/LogisticRegression.cpp: --------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #ifndef _TEST_
18 | #define _accel_ 1
19 | #else
20 | #define _accel_ 0
21 | #endif
22 |
23 | #include <fstream>
24 | #include <iostream>
25 | #include <malloc.h>
26 | #include <math.h>
27 | #include <sstream>
28 | #include <stdio.h>
29 | #include <stdlib.h>
30 | #include <string>
31 | #include <sys/time.h>
32 | #include <vector>
33 | #include "inaccel/runtime-api.h"
34 |
35 | using namespace std;
36 |
37 | // Dataset specific options
38 | // Change below definitions according to your input dataset
39 | #define NUMCLASSES 26
40 | #define NUMFEATURES 784
41 | #define NUMEXAMPLES 124800
42 | #define NUM_KERNELS 4
43 |
44 | // Function to allocate an aligned memory buffer
45 | void *INalligned_malloc(size_t size) {
46 | void *ptr = memalign(4096, size);
47 | if (!ptr) {
48 | printf("Error: alligned_malloc\n");
49 | exit(EXIT_FAILURE);
50 | }
51 |
52 | return ptr;
53 | }
54 |
55 | // Function to split a string on a set of delimiter characters
56 | vector<string> split(const string &s) {
57 | vector<string> elements;
58 | stringstream ss(s);
59 | string item;
60 |
61 | while (getline(ss, item)) {
62 | size_t prev = 0;
63 | size_t pos;
64 |
65 | while ((pos = item.find_first_of(" (,[])=", prev)) != std::string::npos) {
66 | if (pos > prev)
67 | elements.push_back(item.substr(prev, pos - prev));
68 | prev = pos + 1;
69 | }
70 |
71 | if (prev < item.length())
72 | elements.push_back(item.substr(prev, std::string::npos));
73 | }
74 |
75 | return elements;
76 | }
77 |
78 | // Reads the input dataset and sets features and labels buffers accordingly
79 | void read_input(string filename, float *features, int *labels, int numFeatures,
80 | int numExamples) {
81 | ifstream train;
82 | train.open(filename.c_str());
83 |
84 | string line;
85 | int i;
86 | int n = 0;
87 |
88 | while (getline(train, line) && (n < numExamples)) {
89 | if (line.length()) {
90 | vector<string> tokens = split(line);
91 | features[n * (16 + numFeatures) + numFeatures] = 1.0;
92 | labels[n] = atoi(tokens[0].c_str());
93 | for (i = 0; i < numFeatures; i++) {
94 | features[n * (16 + numFeatures) + i] = atof(tokens[i + 1].c_str());
95 | }
96 | n++;
97 | }
98 | }
99 |
100 | train.close();
101 | }
102 |
103 | // Writes a trained model to the specified filename
104 | void write_output(string filename, float *weights, int numClasses,
105 | int numFeatures) {
106 |
107 | ofstream results;
108 | results.open(filename.c_str());
109 |
110 | for (int k = 0; k < numClasses; k++) {
111 | results << weights[k * (16 + numFeatures)];
112 | for (int j = 1; j < (16 + numFeatures); j++) {
113 | results << "," << weights[k * (16 + numFeatures) + j];
114 | }
115 | results << endl;
116 | }
117 |
118 | results.close();
119 | }
120 |
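// Note on the memory layout used throughout this file: every example (and
// every class row of the weights/gradients buffers) is stored with a stride of
// (16 + numFeatures) floats. read_input() writes the bias term (a constant
// 1.0) at index numFeatures of each row, and the remaining slots pad the row
// so the FPGA kernels can stream it as whole float16 (16-float) words. A small
// worked example, assuming numFeatures = 784 as defined above:
//
//   row stride           = 16 + 784 = 800 floats (50 float16 words)
//   feature j of point n = features[n * 800 + j]
//   bias term of point n = features[n * 800 + 784]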
121 | // A simple classifier. Given a point, it predicts the class with the greatest
122 | // probability
123 | int classify(float *features, float *weights, int numClasses, int numFeatures) {
124 | float prob = -1.0;
125 | int prediction = -1;
126 |
127 | for (int k = 0; k < numClasses; k++) {
128 | float dot = weights[k * (16 + numFeatures) + numFeatures];
129 |
130 | for (int j = 0; j < numFeatures; j++) {
131 | dot += features[j] * weights[k * (16 + numFeatures) + j];
132 | }
133 |
134 | if (1.0 / (1.0 + exp(-dot)) > prob) {
135 | prob = 1.0 / (1.0 + exp(-dot));
136 | prediction = k;
137 | }
138 | }
139 |
140 | return prediction;
141 | }
142 |
143 | // A simple prediction function to evaluate the accuracy of a trained model
144 | void predict(string filename, float *weights, int numClasses, int numFeatures) {
145 | cout << " * LogisticRegression Testing *" << endl;
146 |
147 | float tr = 0.0;
148 | float fls = 0.0;
149 | float example[numFeatures];
150 | string line;
151 | ifstream test;
152 |
153 | test.open(filename.c_str());
154 |
155 | while (getline(test, line)) {
156 | if (line.length()) {
157 | if (line[0] != '#' && line[0] != ' ') {
158 | vector<string> tokens = split(line);
159 |
160 | int label = (int)atof(tokens[0].c_str());
161 | for (int j = 1; j < (1 + numFeatures); j++) {
162 | example[j - 1] = atof(tokens[j].c_str());
163 | }
164 |
165 | int prediction = classify(example, weights, numClasses, numFeatures);
166 |
167 | if (prediction == label)
168 | tr++;
169 | else
170 | fls++;
171 | }
172 | }
173 | }
174 |
175 | test.close();
176 |
177 | printf(" # accuracy: %1.3f (%i/%i)\n", (tr / (tr + fls)), (int)tr,
178 | (int)(tr + fls));
179 | printf(" # true: %i\n", (int)tr);
180 | printf(" # false: %i\n", (int)fls);
181 | }
182 |
183 | // CPU implementation of Logistic Regression gradients calculation
184 | void gradients_sw(int *labels, float *features, float *weights,
185 | float *gradients, int numClasses, int numFeatures,
186 | int numExamples) {
187 | for (int k = 0; k < numClasses; k++) {
188 | for (int j = 0; j < (16 + numFeatures); j++) {
189 | gradients[k * (16 + numFeatures) + j] = 0.0;
190 | }
191 | }
192 |
193 | for (int i = 0; i < numExamples; i++) {
194 | for (int k = 0; k < numClasses; k++) {
195 | float dot = weights[k * (16 + numFeatures) + numFeatures];
196 |
197 | for (int j = 0; j < numFeatures; j++) {
198 | dot += weights[k * (16 + numFeatures) + j] *
199 | features[i * (16 + numFeatures) + j];
200 | }
201 |
202 | float dif = 1.0 / (1.0 + exp(-dot));
203 | if (labels[i] == k)
204 | dif -= 1;
205 |
206 | for (int j = 0; j < (16 + numFeatures); j++) {
207 | gradients[k * (16 + numFeatures) + j] +=
208 | dif * features[i * (16 + numFeatures) + j];
209 | }
210 | }
211 | }
212 | }
213 |
214 | int main(int argc, char *argv[]) {
215 | if (argc != 2) {
216 | cout << "Usage: " << argv[0] << " <iterations>" << endl;
217 | exit(-1);
218 | }
219 |
220 | struct timeval start, end;
221 |
222 | float alpha = 0.3f;
223 | float gamma = 0.95f;
224 | int iter = atoi(argv[1]);
225 |
226 | // Set up the specifications of the model to be trained
227 | int numClasses = NUMCLASSES;
228 | int numFeatures = NUMFEATURES;
229 | int numExamples = NUMEXAMPLES;
230 |
231 | // Split the dataset among the available kernels
232 | int chunkSize = numExamples / NUM_KERNELS;
233 |
234 | // Allocate host buffers for the labels and features of the dataset, the
235 | // weights and gradients of the model to be trained and, lastly, a velocity
236 | // buffer for the momentum update that improves model accuracy
237 | int *labels = (int *)INalligned_malloc(numExamples *
sizeof(int));
238 | float *features = (float *)INalligned_malloc(
239 | numExamples * (16 + numFeatures) * sizeof(float));
240 | float *weights = (float *)INalligned_malloc(numClasses * (16 + numFeatures) *
241 | sizeof(float));
242 | float *gradients = (float *)INalligned_malloc(
243 | numClasses * (16 + numFeatures) * sizeof(float));
244 | float *velocity = (float *)INalligned_malloc(numClasses * (1 + numFeatures) *
245 | sizeof(float));
246 |
247 | // Specify train and test input files as well as output model file
248 | string trainFile = "data/letters_csv_train.dat";
249 | string testFile = "data/letters_csv_test.dat";
250 | string modelFile = "data/weights.out";
251 |
252 | // Read the input dataset
253 | cout << "! Reading train file..." << endl;
254 | read_input(trainFile, features, labels, numFeatures, numExamples);
255 |
256 | // Initialize model weights to zero
257 | for (int i = 0; i < numClasses * (16 + numFeatures); i++)
258 | weights[i] = 0.0;
259 | for (int i = 0; i < numClasses * (1 + numFeatures); i++) velocity[i] = 0.0; // the velocity buffer must also start at zero (memalign does not zero memory)
260 | if (_accel_) {
261 | // Invoke the hardware accelerated implementation of the algorithm
262 |
263 | cl_engine engine[NUM_KERNELS];
264 | float *ffeatures[NUM_KERNELS], *fweights[NUM_KERNELS];
265 | float *fgradients[NUM_KERNELS], *grads[NUM_KERNELS];
266 | int *flabels[NUM_KERNELS];
267 |
268 | size_t labels_size = chunkSize * sizeof(int);
269 | size_t features_size = chunkSize * (numFeatures + 16) * sizeof(float);
270 | size_t weights_size = numClasses * (numFeatures + 16) * sizeof(float);
271 |
272 | // Initialize the FPGA world
273 | cl_world world = InAccel::create_world(0);
274 | // Program the FPGA device using the provided bitstream
275 | InAccel::create_program(world, "Gradients.xclbin");
276 |
277 | // Instantiate the kernels of the bitstream. Each engine holds a kernel
278 | // along with its command queue
279 | engine[0] = InAccel::create_engine(world, "Gradients_0");
280 | engine[1] = InAccel::create_engine(world, "Gradients_1");
281 | engine[2] = InAccel::create_engine(world, "Gradients_2");
282 | engine[3] = InAccel::create_engine(world, "Gradients_3");
283 |
284 | // Memcpy to each memory bank the corresponding part of the input dataset
285 | for (int i = 0; i < NUM_KERNELS; i++) {
286 | flabels[i] = (int *)InAccel::malloc(world, labels_size, i);
287 | InAccel::memcpy_to(world, flabels[i], 0, labels + i * chunkSize,
288 | labels_size);
289 | ffeatures[i] = (float *)InAccel::malloc(world, features_size, i);
290 | InAccel::memcpy_to(world, ffeatures[i], 0,
291 | features + (i * chunkSize * (16 + numFeatures)),
292 | features_size);
293 |
294 | fweights[i] = (float *)InAccel::malloc(world, weights_size, i);
295 |
296 | fgradients[i] = (float *)InAccel::malloc(world, weights_size, i);
297 | grads[i] = (float *)INalligned_malloc(weights_size);
298 | }
299 |
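// Every training iteration below follows the same five-step pattern: copy the
// current weights to every memory bank, launch the four Gradients kernels,
// wait for all of them to finish, read back the partial gradients, and reduce
// them on the host before the momentum update. In outline:
//
//   for each iteration t:
//     InAccel::memcpy_to(fweights[i])     // refresh the weights, per bank
//     InAccel::run_engine(engine[i])      // all four kernels run in parallel
//     InAccel::await_engine(engine[i])    // barrier
//     InAccel::memcpy_from(fgradients[i]) // partial gradients, one per kernel
//     aggregate + momentum update         // on the host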
300 | gettimeofday(&start, NULL);
301 | // Start the iterative part for the training of the algorithm
302 | for (int t = 0; t < iter; t++) {
303 | for (int i = 0; i < NUM_KERNELS; i++) {
304 | // Memcpy to DDR the weights of the model
305 | InAccel::memcpy_to(world, fweights[i], 0, weights, weights_size);
306 |
307 | // Set the kernel arguments
308 | InAccel::set_engine_arg(engine[i], 0, flabels[i]);
309 | InAccel::set_engine_arg(engine[i], 1, ffeatures[i]);
310 | InAccel::set_engine_arg(engine[i], 2, fweights[i]);
311 | InAccel::set_engine_arg(engine[i], 3, fgradients[i]);
312 | InAccel::set_engine_arg(engine[i], 4, numClasses);
313 | InAccel::set_engine_arg(engine[i], 5, numFeatures);
314 | InAccel::set_engine_arg(engine[i], 6, chunkSize);
315 |
316 | // Invoke the kernel execution
317 | InAccel::run_engine(engine[i]);
318 | }
319 |
320 | // Wait for the kernels to finish
321 | for (int i = 0; i < NUM_KERNELS; i++) {
322 | InAccel::await_engine(engine[i]);
323 | }
324 |
325 | // Get the gradients as computed by the kernels
326 | for (int i = 0; i < NUM_KERNELS; i++) {
327 | InAccel::memcpy_from(world, fgradients[i], 0, grads[i], weights_size);
328 | }
329 |
330 | // Aggregate the gradients from all kernels
331 | for (int j = 0; j < numClasses * (16 + numFeatures); j++) {
332 | gradients[j] = grads[0][j];
333 | for (int i = 1; i < NUM_KERNELS; i++) {
334 | gradients[j] += grads[i][j];
335 | }
336 | }
337 |
338 | // Compute the new weights of the model, applying a momentum (velocity)
339 | // update for faster convergence and better model accuracy
340 | for (int k = 0; k < numClasses; k++) {
341 | for (int j = 0; j < (1 + numFeatures); j++) {
342 | velocity[k * (1 + numFeatures) + j] =
343 | gamma * velocity[k * (1 + numFeatures) + j] +
344 | (alpha / numExamples) * gradients[k * (16 + numFeatures) + j];
345 | weights[k * (16 + numFeatures) + j] -=
346 | velocity[k * (1 + numFeatures) + j];
347 | }
348 | }
349 | }
350 |
351 | gettimeofday(&end, NULL);
352 |
353 | // Free any allocated buffers for the FPGA device and release the allocated
354 | // kernels and command queues
355 | for (int i = 0; i < NUM_KERNELS; i++) {
356 | free(grads[i]);
357 | InAccel::free(world, fgradients[i]);
358 | InAccel::free(world, fweights[i]);
359 | InAccel::free(world, ffeatures[i]);
360 | InAccel::free(world, flabels[i]);
361 | InAccel::release_engine(engine[i]);
362 | }
363 |
364 | // Release the FPGA program
365 | InAccel::release_program(world);
366 | // Release the FPGA world
367 | InAccel::release_world(world);
368 | } else {
369 | // Invoke the software implementation of the algorithm
370 | gettimeofday(&start, NULL);
371 | for (int t = 0; t < iter; t++) {
372 | gradients_sw(labels, features, weights, gradients, numClasses,
373 | numFeatures, numExamples);
374 | for (int k = 0; k < numClasses; k++) {
375 | for (int j = 0; j < (1 + numFeatures); j++) {
376 | velocity[k * (1 + numFeatures) + j] =
377 | gamma * velocity[k * (1 + numFeatures) + j] +
378 | (alpha / numExamples) * gradients[k * (16 + numFeatures) + j];
379 | weights[k * (16 + numFeatures) + j] -=
380 | velocity[k * (1 + numFeatures) + j];
381 | }
382 | }
383 | }
384 | gettimeofday(&end, NULL);
385 | }
386 |
387 | float time_us = ((end.tv_sec * 1000000) + end.tv_usec) -
388 | ((start.tv_sec * 1000000) + start.tv_usec);
389 | float time_s = (end.tv_sec - start.tv_sec);
390 |
391 | cout << "! Time running Gradients Kernel: " << time_us / 1000 << " msec, "
392 | << time_s << " sec " << endl;
393 |
394 | // Compute the accuracy of the trained model on a given test dataset.
395 | predict(testFile, weights, numClasses, numFeatures);
396 |
397 | // Save the model to the specified user file
398 | write_output(modelFile, weights, numClasses, numFeatures);
399 |
400 | // Free any host allocated buffers
401 | free(labels);
402 | free(features);
403 | free(weights);
404 | free(gradients);
405 | free(velocity);
406 |
407 | return 0;
408 | }
409 |
--------------------------------------------------------------------------------
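// A note on the update step in LogisticRegression.cpp above: the velocity
// buffer is stored densely with stride (1 + numFeatures) -- one slot per real
// weight plus the bias -- while weights and gradients keep the FPGA-padded
// stride (16 + numFeatures), so the update only ever touches the first
// (1 + numFeatures) entries of each padded row. Per scalar weight w the rule
// is the classic momentum form, with gamma the momentum factor (0.95) and
// alpha the learning rate (0.3):
//
//   v <- gamma * v + (alpha / numExamples) * g
//   w <- w - v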
/host_srcs/common/INcl.cpp: --------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <stdio.h>
18 | #include <stdlib.h>
19 | #include <string.h>
20 | #include "INcl.h"
21 |
22 | // Builds a program executable from the program binary.
23 | void INclBuildProgram(cl_program program) {
24 | cl_int errcode_ret = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
25 | if (errcode_ret != CL_SUCCESS) {
26 | fprintf(stderr, "Error: clBuildProgram %s (%d)\n",
27 | INclCheckErrorCode(errcode_ret), errcode_ret);
28 | throw EXIT_FAILURE;
29 | }
30 | }
31 |
32 | // Creates a buffer object.
33 | cl_mem INclCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
34 | void *host_ptr) {
35 | cl_int errcode_ret;
36 | cl_mem mem = clCreateBuffer(context, flags, size, host_ptr, &errcode_ret);
37 | if (errcode_ret != CL_SUCCESS || !mem) {
38 | fprintf(stderr, "Error: clCreateBuffer %s (%d)\n",
39 | INclCheckErrorCode(errcode_ret), errcode_ret);
40 | throw EXIT_FAILURE;
41 | }
42 |
43 | return mem;
44 | }
45 |
46 | // Creates a command-queue on a specific device.
47 | cl_command_queue INclCreateCommandQueue(cl_context context,
48 | cl_device_id device) {
49 | cl_int errcode_ret;
50 | cl_command_queue command_queue =
51 | clCreateCommandQueue(context, device, 0, &errcode_ret);
52 | if (errcode_ret != CL_SUCCESS || !command_queue) {
53 | fprintf(stderr, "Error: clCreateCommandQueue %s (%d)\n",
54 | INclCheckErrorCode(errcode_ret), errcode_ret);
55 | throw EXIT_FAILURE;
56 | }
57 |
58 | return command_queue;
59 | }
60 |
61 | // Creates an OpenCL context.
62 | cl_context INclCreateContext(cl_device_id device) {
63 | cl_int errcode_ret;
64 | cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &errcode_ret);
65 | if (errcode_ret != CL_SUCCESS || !context) {
66 | fprintf(stderr, "Error: clCreateContext %s (%d)\n",
67 | INclCheckErrorCode(errcode_ret), errcode_ret);
68 | throw EXIT_FAILURE;
69 | }
70 |
71 | return context;
72 | }
73 |
74 | // Creates a kernel object.
75 | cl_kernel INclCreateKernel(cl_program program, const char *kernel_name) {
76 | cl_int errcode_ret;
77 | cl_kernel kernel = clCreateKernel(program, kernel_name, &errcode_ret);
78 | if (errcode_ret != CL_SUCCESS || !kernel) {
79 | fprintf(stderr, "Error: clCreateKernel %s (%d)\n",
80 | INclCheckErrorCode(errcode_ret), errcode_ret);
81 | throw EXIT_FAILURE;
82 | }
83 |
84 | return kernel;
85 | }
86 |
87 | // Creates a program object for a context, and loads specified binary data into
88 | // the program object.
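// The binary is read from disk in full (fopen/fseek/ftell/fread) and handed to
// clCreateProgramWithBinary as a single device binary. A sketch of a typical
// call sequence, assuming context and device were obtained through the
// wrappers above and that the bitstream sits in the working directory:
//
//   cl_program program =
//       INclCreateProgramWithBinary(context, 1, &device, "Gradients.xclbin");
//   INclBuildProgram(program);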
89 | cl_program INclCreateProgramWithBinary(cl_context context, cl_uint num_devices, 90 | const cl_device_id *device_list, 91 | const char *binary_name) { 92 | FILE *file = fopen(binary_name, "rb"); 93 | if (!file) { 94 | fprintf(stderr, "Error: fopen\n"); 95 | throw EXIT_FAILURE; 96 | } 97 | 98 | fseek(file, 0, SEEK_END); 99 | size_t size = ftell(file); 100 | fseek(file, 0, SEEK_SET); 101 | 102 | char *temp = (char *)malloc((size + 1) * sizeof(char)); 103 | if (!temp) { 104 | fprintf(stderr, "Error: malloc\n"); 105 | throw EXIT_FAILURE; 106 | } 107 | 108 | if (size != fread(temp, sizeof(char), size, file)) { 109 | free(temp); 110 | 111 | fprintf(stderr, "Error: fread\n"); 112 | throw EXIT_FAILURE; 113 | } 114 | 115 | fclose(file); 116 | temp[size] = 0; 117 | 118 | char *binary = temp; 119 | 120 | cl_int errcode_ret; 121 | cl_program program = clCreateProgramWithBinary( 122 | context, num_devices, device_list, &size, (const unsigned char **)&binary, 123 | NULL, &errcode_ret); 124 | if (errcode_ret != CL_SUCCESS || !program) { 125 | fprintf(stderr, "Error: clCreateProgramWithBinary %s (%d)\n", 126 | INclCheckErrorCode(errcode_ret), errcode_ret); 127 | throw EXIT_FAILURE; 128 | } 129 | 130 | free(temp); 131 | 132 | return program; 133 | } 134 | 135 | // Enqueues a command to map a region of the buffer object given by buffer into 136 | // the host address space and returns a pointer to this mapped region. 137 | void *INclEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, 138 | cl_map_flags map_flags, size_t cb, 139 | cl_uint num_events_in_wait_list, 140 | const cl_event *event_wait_list, cl_event *event) { 141 | cl_int errcode_ret; 142 | void *ptr = clEnqueueMapBuffer(command_queue, buffer, CL_FALSE, map_flags, 0, 143 | cb, num_events_in_wait_list, event_wait_list, 144 | event, &errcode_ret); 145 | if (errcode_ret != CL_SUCCESS || !ptr) { 146 | fprintf(stderr, "Error: clEnqueueMapBuffer %s (%d)\n", 147 | INclCheckErrorCode(errcode_ret), errcode_ret); 148 | throw EXIT_FAILURE; 149 | } 150 | 151 | return ptr; 152 | } 153 | 154 | // Enqueues a command to indicate which device a set of memory objects should be 155 | // associated with. 156 | void INclEnqueueMigrateMemObjects(cl_command_queue command_queue, 157 | cl_uint num_mem_objects, 158 | const cl_mem *mem_objects, 159 | cl_mem_migration_flags flags, 160 | cl_uint num_events_in_wait_list, 161 | const cl_event *event_wait_list, 162 | cl_event *event) { 163 | cl_int errcode_ret = clEnqueueMigrateMemObjects( 164 | command_queue, num_mem_objects, mem_objects, flags, 165 | num_events_in_wait_list, event_wait_list, event); 166 | if (errcode_ret != CL_SUCCESS) { 167 | fprintf(stderr, "Error: clEnqueueMigrateMemObjects %s (%d)\n", 168 | INclCheckErrorCode(errcode_ret), errcode_ret); 169 | throw EXIT_FAILURE; 170 | } 171 | } 172 | 173 | // Enqueues a command to execute a kernel on a device. 
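// Unlike the Task-mode path (INclEnqueueTask, a single work-item), the NDRange
// wrapper below forwards work_dim and hardcodes a NULL global offset; note
// that the higher-level EnqueueKernel in runtime.cpp always calls it with
// work_dim = 3, so three-element size arrays are expected there. The host code
// in this repository only exercises the Task-mode path.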
174 | void INclEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, 175 | cl_uint work_dim, const size_t *global_work_size, 176 | const size_t *local_work_size, 177 | cl_uint num_events_in_wait_list, 178 | const cl_event *event_wait_list, 179 | cl_event *event) { 180 | cl_int errcode_ret = clEnqueueNDRangeKernel( 181 | command_queue, kernel, work_dim, NULL, global_work_size, local_work_size, 182 | num_events_in_wait_list, event_wait_list, event); 183 | if (errcode_ret != CL_SUCCESS) { 184 | fprintf(stderr, "Error: clEnqueueNDRangeKernel %s (%d)\n", 185 | INclCheckErrorCode(errcode_ret), errcode_ret); 186 | throw EXIT_FAILURE; 187 | } 188 | } 189 | 190 | // Enqueue commands to read from a buffer object to host memory. 191 | void INclEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, 192 | size_t offset, size_t cb, void *ptr, 193 | cl_uint num_events_in_wait_list, 194 | const cl_event *event_wait_list, cl_event *event) { 195 | cl_int errcode_ret = 196 | clEnqueueReadBuffer(command_queue, buffer, CL_FALSE, offset, cb, ptr, 197 | num_events_in_wait_list, event_wait_list, event); 198 | if (errcode_ret != CL_SUCCESS) { 199 | fprintf(stderr, "Error: clEnqueueReadBuffer %s (%d)\n", 200 | INclCheckErrorCode(errcode_ret), errcode_ret); 201 | throw EXIT_FAILURE; 202 | } 203 | } 204 | 205 | // Enqueues a command to execute a kernel on a device. 206 | void INclEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, 207 | cl_uint num_events_in_wait_list, 208 | const cl_event *event_wait_list, cl_event *event) { 209 | cl_int errcode_ret = clEnqueueTask( 210 | command_queue, kernel, num_events_in_wait_list, event_wait_list, event); 211 | if (errcode_ret != CL_SUCCESS) { 212 | fprintf(stderr, "Error: clEnqueueTask %s (%d)\n", 213 | INclCheckErrorCode(errcode_ret), errcode_ret); 214 | throw EXIT_FAILURE; 215 | } 216 | } 217 | 218 | // Enqueue commands to write to a buffer object from host memory. 219 | void INclEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, 220 | size_t offset, size_t cb, const void *ptr, 221 | cl_uint num_events_in_wait_list, 222 | const cl_event *event_wait_list, cl_event *event) { 223 | cl_int errcode_ret = 224 | clEnqueueWriteBuffer(command_queue, buffer, CL_FALSE, offset, cb, ptr, 225 | num_events_in_wait_list, event_wait_list, event); 226 | if (errcode_ret != CL_SUCCESS) { 227 | fprintf(stderr, "Error: clEnqueueWriteBuffer %s (%d)\n", 228 | INclCheckErrorCode(errcode_ret), errcode_ret); 229 | throw EXIT_FAILURE; 230 | } 231 | } 232 | 233 | // Blocks until all previously queued OpenCL commands in a command-queue are 234 | // issued to the associated device and have completed. 235 | void INclFinish(cl_command_queue command_queue) { 236 | cl_int errcode_ret = clFinish(command_queue); 237 | if (errcode_ret != CL_SUCCESS) { 238 | fprintf(stderr, "Error: clFinish %s (%d)\n", 239 | INclCheckErrorCode(errcode_ret), errcode_ret); 240 | throw EXIT_FAILURE; 241 | } 242 | } 243 | 244 | // Issues all previously queued OpenCL commands in a command-queue to the device 245 | // associated with the command-queue. 246 | void INclFlush(cl_command_queue command_queue) { 247 | cl_int errcode_ret = clFlush(command_queue); 248 | if (errcode_ret != CL_SUCCESS) { 249 | fprintf(stderr, "Error: clFlush %s (%d)\n", INclCheckErrorCode(errcode_ret), 250 | errcode_ret); 251 | throw EXIT_FAILURE; 252 | } 253 | } 254 | 255 | // Obtain specified device, if available. 
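// Device lookup follows the usual two-call OpenCL pattern: query the number of
// devices first, then fetch all IDs and keep the one at the requested index.
// For example, InAccel::create_world(0) ends up here asking for device 0 of
// the Xilinx platform.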
256 | cl_device_id INclGetDeviceID(cl_platform_id platform, cl_uint id) {
257 | cl_device_id device_id = (cl_device_id)malloc(sizeof(cl_device_id));
258 | if (!device_id) {
259 | fprintf(stderr, "Error: malloc\n");
260 | throw EXIT_FAILURE;
261 | }
262 |
263 | cl_uint num_devices;
264 | INclGetDeviceIDs(platform, 0, NULL, &num_devices);
265 |
266 | cl_device_id *devices =
267 | (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
268 | if (!devices) {
269 | fprintf(stderr, "Error: malloc\n");
270 | throw EXIT_FAILURE;
271 | }
272 |
273 | INclGetDeviceIDs(platform, num_devices, devices, NULL);
274 |
275 | cl_uint i;
276 | for (i = 0; i < num_devices; i++) {
277 | if (i == id) {
278 | free(device_id); device_id = devices[i]; // free the placeholder allocation before storing the real handle
279 | break;
280 | }
281 | }
282 |
283 | free(devices);
284 |
285 | if (i == num_devices) {
286 | fprintf(stderr, "Error: clGetDeviceID\n");
287 | throw EXIT_FAILURE;
288 | }
289 |
290 | return device_id;
291 | }
292 |
293 | // Obtain the list of devices available on a platform.
294 | void INclGetDeviceIDs(cl_platform_id platform, cl_uint num_entries,
295 | cl_device_id *devices, cl_uint *num_devices) {
296 | cl_int errcode_ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_entries,
297 | devices, num_devices);
298 | if (errcode_ret != CL_SUCCESS) {
299 | fprintf(stderr, "Error: clGetDeviceIDs %s (%d)\n",
300 | INclCheckErrorCode(errcode_ret), errcode_ret);
301 | throw EXIT_FAILURE;
302 | }
303 | }
304 |
305 | // Get specific information about the OpenCL device.
306 | void INclGetDeviceInfo(cl_device_id device, cl_device_info param_name,
307 | size_t param_value_size, void *param_value,
308 | size_t *param_value_size_ret) {
309 | cl_int errcode_ret = clGetDeviceInfo(device, param_name, param_value_size,
310 | param_value, param_value_size_ret);
311 | if (errcode_ret != CL_SUCCESS) {
312 | fprintf(stderr, "Error: clGetDeviceInfo %s (%d)\n",
313 | INclCheckErrorCode(errcode_ret), errcode_ret);
314 | throw EXIT_FAILURE;
315 | }
316 | }
317 |
318 | // Obtain platform, if available.
319 | cl_platform_id INclGetPlatformID() {
320 | cl_platform_id platform_id = (cl_platform_id)malloc(sizeof(cl_platform_id));
321 | if (!platform_id) {
322 | fprintf(stderr, "Error: malloc\n");
323 | throw EXIT_FAILURE;
324 | }
325 |
326 | cl_uint num_platforms;
327 | INclGetPlatformIDs(0, NULL, &num_platforms);
328 |
329 | cl_platform_id *platforms =
330 | (cl_platform_id *)malloc(num_platforms * sizeof(cl_platform_id));
331 | if (!platforms) {
332 | fprintf(stderr, "Error: malloc\n");
333 | throw EXIT_FAILURE;
334 | }
335 |
336 | INclGetPlatformIDs(num_platforms, platforms, NULL);
337 |
338 | cl_uint i;
339 | for (i = 0; i < num_platforms; i++) {
340 | size_t platform_name_size;
341 | INclGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL,
342 | &platform_name_size);
343 |
344 | char *platform_name = (char *)malloc(platform_name_size * sizeof(char));
345 | if (!platform_name) {
346 | fprintf(stderr, "Error: malloc\n");
347 | throw EXIT_FAILURE;
348 | }
349 |
350 | INclGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_size,
351 | platform_name, NULL);
352 |
353 | if (strstr(platform_name, "Xilinx")) {
354 | free(platform_name);
355 |
356 | free(platform_id); platform_id = platforms[i]; // free the placeholder allocation before storing the real handle
357 | break;
358 | }
359 |
360 | free(platform_name);
361 | }
362 |
363 | free(platforms);
364 |
365 | if (i == num_platforms) {
366 | fprintf(stderr, "Error: clGetPlatformID\n");
367 | throw EXIT_FAILURE;
368 | }
369 |
370 | return platform_id;
371 | }
372 |
373 | // Obtain the list of platforms available.
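// The same count-then-fetch pattern is used for platforms; INclGetPlatformID
// above additionally walks the list and picks the first platform whose
// CL_PLATFORM_NAME contains "Xilinx". A sketch of the raw two-call pattern
// against the OpenCL C API:
//
//   cl_uint n;
//   clGetPlatformIDs(0, NULL, &n);   // first call: count only
//   cl_platform_id *ids = (cl_platform_id *)malloc(n * sizeof(cl_platform_id));
//   clGetPlatformIDs(n, ids, NULL);  // second call: fill the array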
374 | void INclGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, 375 | cl_uint *num_platforms) { 376 | cl_int errcode_ret = clGetPlatformIDs(num_entries, platforms, num_platforms); 377 | if (errcode_ret != CL_SUCCESS) { 378 | fprintf(stderr, "Error: clGetPlatformIDs %s (%d)\n", 379 | INclCheckErrorCode(errcode_ret), errcode_ret); 380 | throw EXIT_FAILURE; 381 | } 382 | } 383 | 384 | // Get specific information about the OpenCL platform. 385 | void INclGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, 386 | size_t param_value_size, void *param_value, 387 | size_t *param_value_size_ret) { 388 | cl_int errcode_ret = clGetPlatformInfo(platform, param_name, param_value_size, 389 | param_value, param_value_size_ret); 390 | if (errcode_ret != CL_SUCCESS) { 391 | fprintf(stderr, "Error: clGetPlatformInfo %s (%d)\n", 392 | INclCheckErrorCode(errcode_ret), errcode_ret); 393 | throw EXIT_FAILURE; 394 | } 395 | } 396 | 397 | // Decrements the command_queue reference count. 398 | void INclReleaseCommandQueue(cl_command_queue command_queue) { 399 | cl_int errcode_ret = clReleaseCommandQueue(command_queue); 400 | if (errcode_ret != CL_SUCCESS) { 401 | fprintf(stderr, "Error: clReleaseCommandQueue %s (%d)\n", 402 | INclCheckErrorCode(errcode_ret), errcode_ret); 403 | throw EXIT_FAILURE; 404 | } 405 | } 406 | 407 | // Decrement the context reference count. 408 | void INclReleaseContext(cl_context context) { 409 | cl_int errcode_ret = clReleaseContext(context); 410 | if (errcode_ret != CL_SUCCESS) { 411 | fprintf(stderr, "Error: clReleaseContext %s (%d)\n", 412 | INclCheckErrorCode(errcode_ret), errcode_ret); 413 | throw EXIT_FAILURE; 414 | } 415 | } 416 | 417 | // Decrements the event reference count. 418 | void INclReleaseEvent(cl_event event) { 419 | cl_int errcode_ret = clReleaseEvent(event); 420 | if (errcode_ret != CL_SUCCESS) { 421 | fprintf(stderr, "Error: clReleaseEvent %s (%d)\n", 422 | INclCheckErrorCode(errcode_ret), errcode_ret); 423 | throw EXIT_FAILURE; 424 | } 425 | } 426 | 427 | // Decrements the kernel reference count. 428 | void INclReleaseKernel(cl_kernel kernel) { 429 | cl_int errcode_ret = clReleaseKernel(kernel); 430 | if (errcode_ret != CL_SUCCESS) { 431 | fprintf(stderr, "Error: clReleaseKernel %s (%d)\n", 432 | INclCheckErrorCode(errcode_ret), errcode_ret); 433 | throw EXIT_FAILURE; 434 | } 435 | } 436 | 437 | // Decrements the memory object reference count. 438 | void INclReleaseMemObject(cl_mem memobj) { 439 | cl_int errcode_ret = clReleaseMemObject(memobj); 440 | if (errcode_ret != CL_SUCCESS) { 441 | fprintf(stderr, "Error: clReleaseMemObject %s (%d)\n", 442 | INclCheckErrorCode(errcode_ret), errcode_ret); 443 | throw EXIT_FAILURE; 444 | } 445 | } 446 | 447 | // Decrements the program reference count. 448 | void INclReleaseProgram(cl_program program) { 449 | cl_int errcode_ret = clReleaseProgram(program); 450 | if (errcode_ret != CL_SUCCESS) { 451 | fprintf(stderr, "Error: clReleaseProgram %s (%d)\n", 452 | INclCheckErrorCode(errcode_ret), errcode_ret); 453 | throw EXIT_FAILURE; 454 | } 455 | } 456 | 457 | // Used to set the argument value for a specific argument of a kernel. 
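// Scalar and buffer arguments both go through this entry point; only the
// (size, pointer) pair differs. For instance, the host-side call
// InAccel::set_engine_arg(engine, 4, numClasses) bottoms out here as:
//
//   INclSetKernelArg(kernel, 4, sizeof(int), &numClasses);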
458 | void INclSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, 459 | const void *arg_value) { 460 | cl_int errcode_ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); 461 | if (errcode_ret != CL_SUCCESS) { 462 | fprintf(stderr, "Error: clSetKernelArg %s (%d)\n", 463 | INclCheckErrorCode(errcode_ret), errcode_ret); 464 | throw EXIT_FAILURE; 465 | } 466 | } 467 | 468 | // Waits on the host thread for commands identified by event objects to 469 | // complete. 470 | void INclWaitForEvents(cl_uint num_events, const cl_event *event_list) { 471 | cl_int errcode_ret = clWaitForEvents(num_events, event_list); 472 | if (errcode_ret != CL_SUCCESS) { 473 | fprintf(stderr, "Error: clWaitForEvents %s (%d)\n", 474 | INclCheckErrorCode(errcode_ret), errcode_ret); 475 | throw EXIT_FAILURE; 476 | } 477 | } 478 | 479 | // Returns a message related to the error code. 480 | const char *INclCheckErrorCode(cl_int errcode) { 481 | switch (errcode) { 482 | case -1: 483 | return "CL_DEVICE_NOT_FOUND"; 484 | case -2: 485 | return "CL_DEVICE_NOT_AVAILABLE"; 486 | case -3: 487 | return "CL_COMPILER_NOT_AVAILABLE"; 488 | case -4: 489 | return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; 490 | case -5: 491 | return "CL_OUT_OF_RESOURCES"; 492 | case -6: 493 | return "CL_OUT_OF_HOST_MEMORY"; 494 | case -7: 495 | return "CL_PROFILING_INFO_NOT_AVAILABLE"; 496 | case -8: 497 | return "CL_MEM_COPY_OVERLAP"; 498 | case -9: 499 | return "CL_IMAGE_FORMAT_MISMATCH"; 500 | case -10: 501 | return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; 502 | case -11: 503 | return "CL_BUILD_PROGRAM_FAILURE"; 504 | case -12: 505 | return "CL_MAP_FAILURE"; 506 | case -13: 507 | return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; 508 | case -14: 509 | return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; 510 | case -15: 511 | return "CL_COMPILE_PROGRAM_FAILURE"; 512 | case -16: 513 | return "CL_LINKER_NOT_AVAILABLE"; 514 | case -17: 515 | return "CL_LINK_PROGRAM_FAILURE"; 516 | case -18: 517 | return "CL_DEVICE_PARTITION_FAILED"; 518 | case -19: 519 | return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; 520 | case -30: 521 | return "CL_INVALID_VALUE"; 522 | case -31: 523 | return "CL_INVALID_DEVICE_TYPE"; 524 | case -32: 525 | return "CL_INVALID_PLATFORM"; 526 | case -33: 527 | return "CL_INVALID_DEVICE"; 528 | case -34: 529 | return "CL_INVALID_CONTEXT"; 530 | case -35: 531 | return "CL_INVALID_QUEUE_PROPERTIES"; 532 | case -36: 533 | return "CL_INVALID_COMMAND_QUEUE"; 534 | case -37: 535 | return "CL_INVALID_HOST_PTR"; 536 | case -38: 537 | return "CL_INVALID_MEM_OBJECT"; 538 | case -39: 539 | return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 540 | case -40: 541 | return "CL_INVALID_IMAGE_SIZE"; 542 | case -41: 543 | return "CL_INVALID_SAMPLER"; 544 | case -42: 545 | return "CL_INVALID_BINARY"; 546 | case -43: 547 | return "CL_INVALID_BUILD_OPTIONS"; 548 | case -44: 549 | return "CL_INVALID_PROGRAM"; 550 | case -45: 551 | return "CL_INVALID_PROGRAM_EXECUTABLE"; 552 | case -46: 553 | return "CL_INVALID_KERNEL_NAME"; 554 | case -47: 555 | return "CL_INVALID_KERNEL_DEFINITION"; 556 | case -48: 557 | return "CL_INVALID_KERNEL"; 558 | case -49: 559 | return "CL_INVALID_ARG_INDEX"; 560 | case -50: 561 | return "CL_INVALID_ARG_VALUE"; 562 | case -51: 563 | return "CL_INVALID_ARG_SIZE"; 564 | case -52: 565 | return "CL_INVALID_KERNEL_ARGS"; 566 | case -53: 567 | return "CL_INVALID_WORK_DIMENSION"; 568 | case -54: 569 | return "CL_INVALID_WORK_GROUP_SIZE"; 570 | case -55: 571 | return "CL_INVALID_WORK_ITEM_SIZE"; 572 | case -56: 573 | return "CL_INVALID_GLOBAL_OFFSET"; 
574 | case -57: 575 | return "CL_INVALID_EVENT_WAIT_LIST"; 576 | case -58: 577 | return "CL_INVALID_EVENT"; 578 | case -59: 579 | return "CL_INVALID_OPERATION"; 580 | case -60: 581 | return "CL_INVALID_GL_OBJECT"; 582 | case -61: 583 | return "CL_INVALID_BUFFER_SIZE"; 584 | case -62: 585 | return "CL_INVALID_MIP_LEVEL"; 586 | case -63: 587 | return "CL_INVALID_GLOBAL_WORK_SIZE"; 588 | case -64: 589 | return "CL_INVALID_PROPERTY"; 590 | case -65: 591 | return "CL_INVALID_IMAGE_DESCRIPTOR"; 592 | case -66: 593 | return "CL_INVALID_COMPILER_OPTIONS"; 594 | case -67: 595 | return "CL_INVALID_LINKER_OPTIONS"; 596 | case -68: 597 | return "CL_INVALID_DEVICE_PARTITION_COUNT"; 598 | case -69: 599 | return "CL_INVALID_PIPE_SIZE"; 600 | case -70: 601 | return "CL_INVALID_DEVICE_QUEUE"; 602 | default: 603 | return "CL_INVALID_ERROR_CODE"; 604 | } 605 | } 606 | --------------------------------------------------------------------------------