├── data
│   └── .keep
├── host_srcs
│   ├── inaccel
│   │   ├── runtime-api.h
│   │   ├── runtime-api.cpp
│   │   ├── runtime.h
│   │   └── runtime.cpp
│   ├── common
│   │   ├── INcl.h
│   │   └── INcl.cpp
│   └── LogisticRegression.cpp
├── Makefile
├── README.md
├── kernel_srcs
│   ├── Gradients_0.cpp
│   ├── Gradients_1.cpp
│   ├── Gradients_2.cpp
│   └── Gradients_3.cpp
└── LICENSE
/data/.keep:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/host_srcs/inaccel/runtime-api.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #ifndef RUNTIME_API_H
18 | #define RUNTIME_API_H
19 |
20 | #include "runtime.h"
21 |
22 | // InAccel function calls.
23 | class InAccel {
24 |
25 | public:
26 | // Creates the world.
27 | static cl_world create_world(int device_id);
28 |
29 | // Allocates a new buffer.
30 | static void *malloc(cl_world world, size_t size, int memory_id);
31 |
32 | // Transfers data to a previously allocated buffer.
33 | static void memcpy_to(cl_world world, void *dst_ptr, size_t offset,
34 | void *src_ptr, size_t size);
35 |
36 | // Creates a new program.
37 | static void create_program(cl_world world, const char *bitstream_name);
38 |
39 | // Creates a new engine.
40 | static cl_engine create_engine(cl_world world, const char *kernel_name);
41 |
42 | // Sets an engine argument using a buffer.
43 | static void set_engine_arg(cl_engine engine, int index, void *buffer);
44 |
45 | // Sets an engine argument using an int value.
46 | static void set_engine_arg(cl_engine engine, int index, int value);
47 |
48 | // Sets an engine argument using a long value.
49 | static void set_engine_arg(cl_engine engine, int index, long value);
50 |
51 | // Sets an engine argument using a float value.
52 | static void set_engine_arg(cl_engine engine, int index, float value);
53 |
54 | // Sets an engine argument using a double value.
55 | static void set_engine_arg(cl_engine engine, int index, double value);
56 |
57 | // Runs an engine.
58 | static void run_engine(cl_engine engine);
59 |
60 | // Awaits an engine.
61 | static void await_engine(cl_engine engine);
62 |
63 | // Releases an engine.
64 | static void release_engine(cl_engine engine);
65 |
66 | // Releases a program.
67 | static void release_program(cl_world world);
68 |
69 | // Transfers data from a previously allocated buffer.
70 | static void memcpy_from(cl_world world, void *src_ptr, size_t offset,
71 | void *dst_ptr, size_t size);
72 |
73 | // Frees a buffer.
74 | static void free(cl_world world, void *ptr);
75 |
76 | // Releases the world.
77 | static void release_world(cl_world world);
78 | };
79 |
80 | #endif
81 |
--------------------------------------------------------------------------------
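The calls declared in runtime-api.h follow a create/use/release lifecycle. Below is a minimal host-side sketch of that sequence; the buffer size, memory bank and single engine argument are illustrative only (the Gradients kernels listed later actually take four buffers plus three scalar arguments):

```cpp
#include "inaccel/runtime-api.h"

int main() {
  // Acquire device 0 and load the bitstream produced by the Makefile.
  cl_world world = InAccel::create_world(0);
  InAccel::create_program(world, "Gradients.xclbin");

  // Allocate a buffer on memory bank 0 and copy the input data to it.
  float host_data[1024] = {0};
  void *buffer = InAccel::malloc(world, sizeof(host_data), 0);
  InAccel::memcpy_to(world, buffer, 0, host_data, sizeof(host_data));

  // Create an engine for one kernel, launch it and wait for completion.
  cl_engine engine = InAccel::create_engine(world, "Gradients_0");
  InAccel::set_engine_arg(engine, 0, buffer);
  InAccel::run_engine(engine);
  InAccel::await_engine(engine);

  // Copy the results back and release everything in reverse order.
  InAccel::memcpy_from(world, buffer, 0, host_data, sizeof(host_data));
  InAccel::release_engine(engine);
  InAccel::release_program(world);
  InAccel::free(world, buffer);
  InAccel::release_world(world);
}
```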
/host_srcs/inaccel/runtime-api.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include "runtime-api.h"
18 |
19 | // Creates the world.
20 | cl_world InAccel::create_world(int device_id) {
21 | cl_world world = CreateWorld();
22 |
23 | GetPlatformID(world);
24 |
25 | GetDeviceID(world, (cl_uint)device_id);
26 |
27 | CreateContext(world);
28 |
29 | return world;
30 | }
31 |
32 | // Allocates a new buffer.
33 | void *InAccel::malloc(cl_world world, size_t size, int memory_id) {
34 | return CreateBuffer(world, size, (cl_uint)memory_id);
35 | }
36 |
37 | // Transfers data to a previously allocated buffer.
38 | void InAccel::memcpy_to(cl_world world, void *dst_ptr, size_t offset,
39 | void *src_ptr, size_t size) {
40 | cl_command_queue command_queue = CreateCommandQueue(world);
41 |
42 | EnqueueMemcpyTo(command_queue, dst_ptr, offset, src_ptr, size);
43 |
44 | ReleaseCommandQueue(command_queue);
45 | }
46 |
47 | // Creates a new program.
48 | void InAccel::create_program(cl_world world, const char *bitstream_name) {
49 | CreateProgram(world, bitstream_name);
50 | }
51 |
52 | // Creates a new engine.
53 | cl_engine InAccel::create_engine(cl_world world, const char *kernel_name) {
54 | return CreateEngine(world, kernel_name);
55 | }
56 |
57 | // Sets an engine argument using a buffer.
58 | void InAccel::set_engine_arg(cl_engine engine, int index, void *buffer) {
59 | SetEngineArgPointer(engine, (cl_uint)index, buffer);
60 | }
61 |
62 | // Sets an engine argument using an int value.
63 | void InAccel::set_engine_arg(cl_engine engine, int index, int value) {
64 | SetEngineArg(engine, (cl_uint)index, sizeof(int), &value);
65 | }
66 |
67 | // Sets an engine argument using a long value.
68 | void InAccel::set_engine_arg(cl_engine engine, int index, long value) {
69 | SetEngineArg(engine, (cl_uint)index, sizeof(long), &value);
70 | }
71 |
72 | // Sets an engine argument using a float value.
73 | void InAccel::set_engine_arg(cl_engine engine, int index, float value) {
74 | SetEngineArg(engine, (cl_uint)index, sizeof(float), &value);
75 | }
76 |
77 | // Sets an engine argument using a double value.
78 | void InAccel::set_engine_arg(cl_engine engine, int index, double value) {
79 | SetEngineArg(engine, (cl_uint)index, sizeof(double), &value);
80 | }
81 |
82 | // Runs an engine.
83 | void InAccel::run_engine(cl_engine engine) { EnqueueEngine(engine); }
84 |
85 | // Awaits an engine.
86 | void InAccel::await_engine(cl_engine engine) { BlockEngine(engine); }
87 |
88 | // Releases an engine.
89 | void InAccel::release_engine(cl_engine engine) { ReleaseEngine(engine); }
90 |
91 | // Releases a program.
92 | void InAccel::release_program(cl_world world) { ReleaseProgram(world); }
93 |
94 | // Transfers data from a previously allocated buffer.
95 | void InAccel::memcpy_from(cl_world world, void *src_ptr, size_t offset,
96 | void *dst_ptr, size_t size) {
97 | cl_command_queue command_queue = CreateCommandQueue(world);
98 |
99 | EnqueueMemcpyFrom(command_queue, src_ptr, offset, dst_ptr, size);
100 |
101 | ReleaseCommandQueue(command_queue);
102 | }
103 |
104 | // Frees a buffer.
105 | void InAccel::free(cl_world world, void *ptr) { ReleaseBuffer(world, ptr); }
106 |
107 | // Releases the world.
108 | void InAccel::release_world(cl_world world) {
109 | ReleaseContext(world);
110 |
111 | ReleaseWorld(world);
112 | }
113 |
--------------------------------------------------------------------------------
/host_srcs/inaccel/runtime.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #ifndef RUNTIME_H
18 | #define RUNTIME_H
19 |
20 | #include "common/INcl.h"
21 |
22 | // Packs a world struct.
23 | cl_world PackWorld(_cl_world *_world);
24 |
25 | // Unpacks a world struct.
26 | _cl_world *UnpackWorld(cl_world world);
27 |
28 | // Packs an engine struct.
29 | cl_engine PackEngine(_cl_engine *_engine);
30 |
31 | // Unpacks an engine struct.
32 | _cl_engine *UnpackEngine(cl_engine engine);
33 |
34 | // Returns the world an engine belongs to.
35 | cl_world EngineToWorld(cl_engine engine);
36 |
37 | // Creates the world struct.
38 | cl_world CreateWorld();
39 |
40 | // Obtains the platform id.
41 | void GetPlatformID(cl_world world);
42 |
43 | // Obtains the specified device id.
44 | void GetDeviceID(cl_world world, cl_uint id);
45 |
46 | // Creates the context.
47 | void CreateContext(cl_world world);
48 |
49 | // Creates a program with the specified name.
50 | void CreateProgram(cl_world world, const char *bitstream_name);
51 |
52 | // Creates a command queue.
53 | cl_command_queue CreateCommandQueue(cl_world world);
54 |
55 | // Blocks until all tasks in a command queue have been completed.
56 | void BlockCommandQueue(cl_command_queue command_queue);
57 |
58 | // Releases a command queue.
59 | void ReleaseCommandQueue(cl_command_queue command_queue);
60 |
61 | // Allocates a memory buffer.
62 | void *CreateBuffer(cl_world world, size_t size, cl_uint memory);
63 |
64 | // Enqueues a memory copy operation to device.
65 | void EnqueueMemcpyTo(cl_command_queue command_queue, void *dst_ptr, size_t offset, void *src_ptr, size_t size);
66 |
67 | // Enqueues a memory copy operation from device.
68 | void EnqueueMemcpyFrom(cl_command_queue command_queue, void *src_ptr, size_t offset, void *dst_ptr, size_t size);
69 |
70 | // Frees a memory buffer.
71 | void ReleaseBuffer(cl_world world, void *ptr);
72 |
73 | // Creates a kernel with the specified name.
74 | cl_kernel CreateKernel(cl_world world, const char *kernel_name);
75 |
76 | // Sets a pointer kernel argument.
77 | void SetKernelArgPointer(cl_kernel kernel, cl_uint arg_index, const void *arg_value);
78 |
79 | // Sets a scalar kernel argument.
80 | void SetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
81 |
82 | // Enqueues a kernel operation (Task mode).
83 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel);
84 |
85 | // Enqueues a kernel operation (NDRangeKernel mode).
86 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel, const size_t *global_work_size, const size_t *local_work_size);
87 |
88 | // Releases a kernel.
89 | void ReleaseKernel(cl_kernel kernel);
90 |
91 | // Creates an engine struct with the specified name.
92 | cl_engine CreateEngine(cl_world world, const char *kernel_name);
93 |
94 | // Blocks until all tasks in an engine struct have been completed.
95 | void BlockEngine(cl_engine engine);
96 |
97 | // Sets a pointer engine struct argument.
98 | void SetEngineArgPointer(cl_engine engine, cl_uint arg_index, const void *arg_value);
99 |
100 | // Sets a scalar engine struct argument.
101 | void SetEngineArg(cl_engine engine, cl_uint arg_index, size_t arg_size, const void *arg_value);
102 |
103 | // Enqueues an engine struct operation (Task mode).
104 | void EnqueueEngine(cl_engine engine);
105 |
106 | // Enqueues an engine struct operation (NDRangeKernel mode).
107 | void EnqueueEngine(cl_engine engine, const size_t *global_work_size, const size_t *local_work_size);
108 |
109 | // Releases an engine struct.
110 | void ReleaseEngine(cl_engine engine);
111 |
112 | // Releases a program.
113 | void ReleaseProgram(cl_world world);
114 |
115 | // Releases the context.
116 | void ReleaseContext(cl_world world);
117 |
118 | // Releases the world struct.
119 | void ReleaseWorld(cl_world world);
120 |
121 | #endif
122 |
--------------------------------------------------------------------------------
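runtime.cpp (listed at the end of this section) implements these functions on top of the INcl wrappers. As an illustration of how the engine primitives presumably compose the command-queue and kernel ones, here is a hedged sketch of CreateEngine based on the _cl_engine struct in common/INcl.h; the actual body is not shown in this section:

```cpp
#include <stdlib.h>

#include "runtime.h"

// Hedged sketch: per common/INcl.h, an engine bundles its world, a command
// queue and a kernel, so CreateEngine plausibly allocates and packs those.
cl_engine CreateEngineSketch(cl_world world, const char *kernel_name) {
  _cl_engine *_engine = (_cl_engine *)malloc(sizeof(_cl_engine));

  _engine->world = world;
  _engine->command_queue = CreateCommandQueue(world);
  _engine->kernel = CreateKernel(world, kernel_name);

  return PackEngine(_engine);
}
```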
/Makefile:
--------------------------------------------------------------------------------
1 | ifndef XILINX_SDX
2 | $(error XILINX_SDX is not set)
3 | endif
4 |
5 | ifndef AWS_PLATFORM
6 | $(error AWS_PLATFORM is not set)
7 | endif
8 |
9 | # Host compiler global settings
10 | CC = g++ -O3 -Wno-deprecated-declarations
11 |
12 | CLCC = xocc
13 |
14 | BITSTREAM_NAME = Gradients
15 | HOST_EXE = ${BITSTREAM_NAME}
16 |
17 | PLATFORM = ${AWS_PLATFORM}
18 |
19 | HOST_DIR = host_srcs
20 | KERNEL_DIR = kernel_srcs
21 | KERNEL_TYPE = cpp
22 |
23 | # Host and Kernel sources
24 | HOST_SRCS = $(wildcard $(HOST_DIR)/*/*.cpp) $(wildcard $(HOST_DIR)/*.cpp)
25 | KERNEL_SRCS_CPP = $(wildcard $(KERNEL_DIR)/*.cpp)
26 |
27 | HOST_OBJECTS := $(HOST_SRCS:.cpp=.o)
28 | KERNEL_OBJECTS := $(KERNEL_SRCS_CPP:.cpp=.xo)
29 | ESTIMATE_OBJECTS := $(KERNEL_SRCS_CPP:.cpp=.estimate)
30 |
31 | # Include Libraries
32 | HOST_CFLAGS = -O3 -Wall -I${XILINX_SDX}/runtime/include/1_2 -Ihost_srcs
33 | HOST_LFLAGS = -L${XILINX_XRT}/lib -lxilinxopencl
34 |
35 | # Connecting kernels to specific memory banks
36 | BANKS = --sp Gradients_0_1.m_axi_gmem0:bank0 --sp Gradients_0_1.m_axi_gmem1:bank0 --sp Gradients_0_1.m_axi_gmem2:bank0 --sp Gradients_0_1.m_axi_gmem3:bank0 --sp Gradients_1_1.m_axi_gmem0:bank1 --sp Gradients_1_1.m_axi_gmem1:bank1 --sp Gradients_1_1.m_axi_gmem2:bank1 --sp Gradients_1_1.m_axi_gmem3:bank1 --sp Gradients_2_1.m_axi_gmem0:bank2 --sp Gradients_2_1.m_axi_gmem1:bank2 --sp Gradients_2_1.m_axi_gmem2:bank2 --sp Gradients_2_1.m_axi_gmem3:bank2 --sp Gradients_3_1.m_axi_gmem0:bank3 --sp Gradients_3_1.m_axi_gmem1:bank3 --sp Gradients_3_1.m_axi_gmem2:bank3 --sp Gradients_3_1.m_axi_gmem3:bank3
37 |
38 | # Additional Vivado options
39 | VIVADO_OPTS = --xp misc:enableGlobalHoldIter="True" --xp vivado_prop:run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=NoTimingRelaxation
40 |
41 | SDA_FLOW = sw_emu
42 | ifeq (${SDA_FLOW},sw_emu)
43 | TARGET = -t sw_emu
44 | else ifeq (${SDA_FLOW},hw_emu)
45 | TARGET = -t hw_emu
46 | else ifeq (${SDA_FLOW},hw)
47 | TARGET = -t hw
48 | endif
49 |
50 | all:
51 | make _TEST_="-D _TEST_" host
52 |
53 | host: ${HOST_EXE}
54 |
55 | xbin_sw_em:
56 | @+make SDA_FLOW=sw_emu xbin
57 |
58 | xbin_hw_em:
59 | @+make SDA_FLOW=hw_emu xbin
60 |
61 | xbin_hw:
62 | @+make SDA_FLOW=hw xbin
63 |
64 | run_sw_em:
65 | @+make SDA_FLOW=sw_emu run_sem
66 |
67 | run_hw_em:
68 | @+make SDA_FLOW=hw_emu run_hem
69 |
70 | run_sem: xconfig host xbin
71 | XCL_EMULATION_MODE=sw_emu ./${HOST_EXE} 1
72 |
73 | run_hem: xconfig host xbin
74 | XCL_EMULATION_MODE=hw_emu ./${HOST_EXE} 1
75 |
76 | xconfig:
77 | emconfigutil --platform ${PLATFORM} --od . --nd 1
78 |
79 | # Building host
80 | ${HOST_EXE}: ${HOST_OBJECTS}
81 | ${CC} ${HOST_OBJECTS} ${HOST_LFLAGS} -o $@
82 | ${RM} -rf ${HOST_OBJECTS}
83 |
84 | xbin: ${KERNEL_OBJECTS}
85 | ${CLCC} ${TARGET} --link -s --platform ${PLATFORM} ${VIVADO_OPTS} ${BANKS} ${KERNEL_OBJECTS} -o ${BITSTREAM_NAME}.xclbin
86 | ${RM} -rf ${KERNEL_OBJECTS}
87 |
88 | estimate: ${ESTIMATE_OBJECTS}
89 | ${RM} -rf $(patsubst %.estimate,%.xo,$(ESTIMATE_OBJECTS))
90 |
91 | %.o: %.cpp
92 | ${CC} ${_TEST_} ${HOST_CFLAGS} -c $< -o $@
93 |
94 | # Building kernel
95 | %.xo: %.cpp
96 | ${CLCC} ${TARGET} --save-temps --platform ${PLATFORM} --kernel $(notdir $(basename $<)) -c $< -o $@
97 |
98 | %.estimate: %.${KERNEL_TYPE}
99 | ${CLCC} --target hw_emu --report_level estimate --save-temps --platform ${PLATFORM} --kernel $(notdir $(basename $<)) -c $< -o $(basename $<).xo
100 |
101 | clean:
102 | ${RM} -rf ${HOST_EXE} $(patsubst %.estimate,%.xo,$(ESTIMATE_OBJECTS)) ${KERNEL_OBJECTS} ${HOST_OBJECTS} emconfig.json *.log *.dir *.xml *.dcp *.dat _sds iprepo *.tcl xilinx_aws-vu9p-f1_dynamic_5_0.hpfm .Xil sdaccel_* system_estimate.xtxt _x top_sp.ltx
103 |
104 | cleanall: clean
105 | ${RM} -rf ${BITSTREAM_NAME}*
106 |
107 | help:
108 | @echo "Compile and run CPU emulation"
109 | @echo "make run_sw_em"
110 | @echo ""
111 | @echo "Compile and run hardware emulation"
112 | @echo "make run_hw_em"
113 | @echo ""
114 | @echo "Compile host executable only"
115 | @echo "make host"
116 | @echo ""
117 | @echo "Compile host executable only for SW version"
118 | @echo "make"
119 | @echo ""
120 | @echo "Compile .xclbin file for system run only"
121 | @echo "make xbin_hw"
122 | @echo ""
123 | @echo "Compile .xclbin file for sw emulation"
124 | @echo "make xbin_sw_em"
125 | @echo ""
126 | @echo "Compile .xclbin file for hw emulation"
127 | @echo "make xbin_hw_em"
128 | @echo ""
129 | @echo "Clean working diretory"
130 | @echo "make clean"
131 | @echo "Clean working diretory and bitstream files"
132 | @echo "make cleanall"
133 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
7 | # Logistic Regression IP core
8 |
9 |
10 | This is an FPGA-accelerated solution for the Logistic Regression BGD (batch gradient descent) algorithm. It can provide up to **70x** speedup compared to a single-threaded execution and up to **12x** compared to an 8-threaded Intel Xeon CPU execution.
11 |
12 | ## Specifications
13 |
14 | | Classes | Features |
15 | | :------: | :--------: |
16 | | up to 64 | up to 2047 |
17 |
18 | ## Supported Platforms
19 |
20 | | Board |
21 | | :-------------------------: |
22 | | [Xilinx Alveo U200](https://www.xilinx.com/products/boards-and-kits/alveo/u200.html) |
23 | | [Xilinx Alveo U250](https://www.xilinx.com/products/boards-and-kits/alveo/u250.html) |
24 | | [AWS VU9P (F1 instances)](https://aws.amazon.com/ec2/instance-types/f1/) |
25 | | Alibaba VU9P (F3 instances) |
26 | | Any other Xilinx platform with at least the same amount of VU9P resources |
27 |
28 | ## Design Files
29 |
30 | - The application code is located in the host_srcs directory. Accelerator kernel files are located under the kernel_srcs directory, while any accelerator binaries will be compiled to the current directory.
31 | - The Makefile will help you generate any host executable and accelerator _.xclbin_ files.
32 |
33 | A listing of all the files in this repository is shown below:
34 |
35 | - Makefile
36 | - host_srcs/
37 | - LogisticRegression.cpp
38 | - common/
39 | - INcl.cpp (OpenCL wrapper functions)
40 | - INcl.h
41 | - inaccel/
42 | - runtime-api.cpp (InAccel runtime abstraction layer)
43 | - runtime-api.h
44 | - runtime.cpp (InAccel runtime abstraction layer)
45 | - runtime.h
46 | - kernel_srcs/
47 | - Gradients_0.cpp (Accelerated kernel)
48 | - Gradients_1.cpp (Accelerated kernel)
49 | - Gradients_2.cpp (Accelerated kernel)
50 | - Gradients_3.cpp (Accelerated kernel)
51 | - data/
52 |
53 | ## Preparation
54 |
55 | **!** Before invoking any of the Makefile targets, make sure you have sourced the Xilinx **XRT** setup script.
56 | **!** Make sure you have set the **XILINX_SDX** environment variable to point to the SDx installation directory.
57 |
58 | As far as the **platform** (or board) is concerned, the Makefile uses the **AWS_PLATFORM** environment variable as the target platform for kernel compilation. If you are running this on AWS, make sure the AWS_PLATFORM environment variable is present and points to the platform DSA files (see note 1 below). Otherwise, you can set the Makefile `PLATFORM` variable to point to your platform DSA files.
59 |
60 | 1. To obtain the AWS platform DSA files, make sure you have cloned the [aws-fpga](https://github.com/aws/aws-fpga) GitHub repository.
61 |
62 | Download the NIST letters train and test datasets to the data directory. Navigate to the data directory and execute the following commands:
63 |
64 | ``` bash
65 | wget https://s3.amazonaws.com/inaccel-demo/data/nist/letters_csv_train.dat
66 | wget https://s3.amazonaws.com/inaccel-demo/data/nist/letters_csv_test.dat
67 | ```
68 |
69 | ## Compiling the kernels
70 |
71 | To compile the kernels for the hardware target you just need to execute `make xbin_hw`, while for software and hardware emulation you must execute `make xbin_sw_em` and `make xbin_hw_em` respectively.
72 | A full list of all the available Makefile targets can be found using the `make help` command.
73 |
74 | ## Single-thread - Single-application Execution
75 |
76 | To test the generated xclbin file, you can simply run the `make host` command to create the host application. The host application takes a single input argument: the number of iterations.
77 | Example execution: `./Gradients 100`
78 |
79 | ## Scaling Up and Out with InAccel Coral
80 |
87 | The above example application spawns a single thread and can train a model using a single FPGA device, which **is not viable for datacenter-scale needs**. Data scientists rely on frameworks like Scikit-Learn and Apache Spark to create and test their machine learning pipelines.
88 | The **InAccel Coral** FPGA resource manager is able to automatically **scale** and **schedule** any acceleration requests to a **cluster of FPGAs**, perform **load balancing**, **reconfigure** the FPGA devices, perform **memory management** and more, while providing a simple-to-use **high-level API** in Java, C++ and Python.
89 | We also have ready-to-use **integrations** with broadly used open-source frameworks like Apache Spark to seamlessly accelerate your pipelines.
90 | Finally, Coral is fully compatible with **Kubernetes**: using InAccel's device plugin you can set up a Kubernetes cluster that is aware of hardware-accelerated resources, or take advantage of a **serverless architecture** to provide accelerated serverless solutions to your own customers.
91 |
92 | * You can **create a free InAccel Coral license** [here](https://www.inaccel.com/license/).
93 | * You can **download** InAccel Coral docker from [dockerhub](https://hub.docker.com/r/inaccel/coral).
94 | * You can find **full documentation** as well as a **quick starting guide** in [InAccel Docs](https://docs.inaccel.com/).
95 |
--------------------------------------------------------------------------------
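The BGD computation behind the README's claims can be summarized as follows (our notation, matching the accumulation performed by the kernel_srcs files below): for every class $k$, each Gradients kernel computes the sigmoid of the dot product of a point with the class weights, subtracts 1 for the point's true class, and accumulates the weighted features into the gradient:

$$\nabla_k = \sum_i \left( \sigma(w_k \cdot x_i) - \mathbb{1}[y_i = k] \right) x_i, \qquad \sigma(z) = \frac{1}{1 + e^{-z}}$$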
/host_srcs/common/INcl.h:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #ifndef INCL_H
18 | #define INCL_H
19 |
20 | #include <CL/opencl.h>
21 | #include <stdint.h>
22 |
23 | // InAccelCL world struct (Type).
24 | typedef struct{
25 | cl_platform_id platform_id;
26 | cl_device_id device_id;
27 | cl_context context;
28 | cl_program program;
29 | } _cl_world;
30 |
31 | // InAccelCL world struct (API Type).
32 | typedef uintptr_t cl_world;
33 |
34 | // InAccelCL engine struct (Type).
35 | typedef struct{
36 | cl_world world;
37 |
38 | cl_command_queue command_queue;
39 | cl_kernel kernel;
40 | } _cl_engine;
41 |
42 | // InAccelCL engine struct (API Type).
43 | typedef uintptr_t cl_engine;
44 |
45 | // Builds a program executable from the program binary.
46 | void INclBuildProgram(cl_program program);
47 |
48 | // Creates a buffer object.
49 | cl_mem INclCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr);
50 |
51 | // Create a command-queue on a specific device.
52 | cl_command_queue INclCreateCommandQueue(cl_context context, cl_device_id device);
53 |
54 | // Creates an OpenCL context.
55 | cl_context INclCreateContext(const cl_device_id device);
56 |
57 | // Creates a kernel object.
58 | cl_kernel INclCreateKernel(cl_program program, const char *kernel_name);
59 |
60 | // Creates a program object for a context, and loads specified binary data into the program object.
61 | cl_program INclCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id *device_list, const char *binary_name);
62 |
63 | // Enqueues a command to map a region of the buffer object given by buffer into the host address space and returns a pointer to this mapped region.
64 | void *INclEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_map_flags map_flags, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
65 |
66 | // Enqueues a command to indicate which device a set of memory objects should be associated with.
67 | void INclEnqueueMigrateMemObjects(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem *mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
68 |
69 | // Enqueues a command to execute a kernel on a device.
70 | void INclEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
71 |
72 | // Enqueue commands to read from a buffer object to host memory.
73 | void INclEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
74 |
75 | // Enqueues a command to execute a kernel on a device.
76 | void INclEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
77 |
78 | // Enqueue commands to write to a buffer object from host memory.
79 | void INclEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event);
80 |
81 | // Blocks until all previously queued OpenCL commands in a command-queue are issued to the associated device and have completed.
82 | void INclFinish(cl_command_queue command_queue);
83 |
84 | // Issues all previously queued OpenCL commands in a command-queue to the device associated with the command-queue.
85 | void INclFlush(cl_command_queue command_queue);
86 |
87 | // Obtain specified device, if available.
88 | cl_device_id INclGetDeviceID(cl_platform_id platform, cl_uint id);
89 |
90 | // Obtain the list of devices available on a platform.
91 | void INclGetDeviceIDs(cl_platform_id platform, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices);
92 |
93 | // Get specific information about the OpenCL device.
94 | void INclGetDeviceInfo(cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret);
95 |
96 | // Obtain platform, if available.
97 | cl_platform_id INclGetPlatformID();
98 |
99 | // Obtain the list of platforms available.
100 | void INclGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms);
101 |
102 | // Get specific information about the OpenCL platform.
103 | void INclGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret);
104 |
105 | // Decrements the command_queue reference count.
106 | void INclReleaseCommandQueue(cl_command_queue command_queue);
107 |
108 | // Decrement the context reference count.
109 | void INclReleaseContext(cl_context context);
110 |
111 | // Decrements the event reference count.
112 | void INclReleaseEvent(cl_event event);
113 |
114 | // Decrements the kernel reference count.
115 | void INclReleaseKernel(cl_kernel kernel);
116 |
117 | // Decrements the memory object reference count.
118 | void INclReleaseMemObject(cl_mem memobj);
119 |
120 | // Decrements the program reference count.
121 | void INclReleaseProgram(cl_program program);
122 |
123 | // Used to set the argument value for a specific argument of a kernel.
124 | void INclSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);
125 |
126 | // Waits on the host thread for commands identified by event objects to complete.
127 | void INclWaitForEvents(cl_uint num_events, const cl_event *event_list);
128 |
129 | // Returns a message related to the error code.
130 | const char *INclCheckErrorCode(cl_int errcode);
131 |
132 | #endif
133 |
--------------------------------------------------------------------------------
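INcl.cpp is not listed in this section, but the header implies thin wrappers that invoke the corresponding OpenCL call and fail loudly on error, presumably via INclCheckErrorCode. A hedged sketch of that pattern for a single wrapper (not the actual implementation):

```cpp
#include <stdio.h>
#include <stdlib.h>

#include "INcl.h"

// Hedged sketch: obtain the first available platform, aborting with a
// readable message if the underlying OpenCL call fails.
cl_platform_id INclGetPlatformIDSketch() {
  cl_platform_id platform;
  cl_uint num_platforms;

  cl_int errcode = clGetPlatformIDs(1, &platform, &num_platforms);
  if (errcode != CL_SUCCESS) {
    fprintf(stderr, "clGetPlatformIDs: %s\n", INclCheckErrorCode(errcode));
    exit(EXIT_FAILURE);
  }

  return platform;
}
```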
/kernel_srcs/Gradients_0.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <ap_int.h>
18 | #include <math.h>
19 |
20 | #define chunk 8
21 | #define numClassesMax 64
22 | #define numFeaturesPlusOneMax 128
23 | #define vectorSize 16
24 |
25 | typedef ap_int<256> float8;
26 | typedef ap_int<512> float16;
27 |
28 | union {
29 | int asInt;
30 | float asFloat;
31 | } converter1, converter2;
32 |
33 | // This function represents a Logistic Regression HLS kernel.
34 | // The kernel is able to train a model of up to 64 classes and 2047 features.
35 | // Maximum bandwidth is used for the M_AXI interfaces where applicable.
36 |
37 | extern "C" {
38 | void Gradients_0(float8 *_labels, float16 *_data, float16 *_weights,
39 | float16 *_gradients, int numClasses, int numFeatures,
40 | int chunkSize) {
41 |
42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control
47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control
48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control
49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control
50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control
51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
53 | #pragma HLS INTERFACE s_axilite port = return bundle = control
54 |
55 | float16 features[chunk][numFeaturesPlusOneMax],
56 | weights[numClassesMax][numFeaturesPlusOneMax],
57 | gradients[numClassesMax][numFeaturesPlusOneMax];
58 | float lin[numClassesMax][chunk * vectorSize];
59 | float prd[chunk][numClassesMax];
60 |
61 | // Using URAMs for features, weights and gradients buffers
62 | #pragma HLS resource variable = features core = XPM_MEMORY uram
63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram
64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram
65 |
66 | // Partitioning the local arrays
67 | #pragma HLS array_partition variable = features complete dim = 1
68 | #pragma HLS array_partition variable = lin complete dim = 2
69 | #pragma HLS array_partition variable = prd complete dim = 1
70 |
71 | // Compute the number of float16 words per data point
72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 32 -> numFeaturesPlusOne =
73 | // 32 / 16 = 2)
74 | int numFeaturesPlusOne =
75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4;
76 | // Define a minimum of 13 for numClassesMin; it is used to avoid loop-carried
77 | // dependencies in some of the pipelined loops
78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses;
79 |
80 | int c, i, j, k, t;
81 |
82 | // Reading weights and filling gradients with zeros
83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
84 | kj++, j++) {
85 | #pragma HLS pipeline II = 1
86 | if (j == numFeaturesPlusOne) {
87 | j = 0;
88 | k++;
89 | }
90 | weights[k][j] = _weights[kj];
91 | gradients[k][j] = 0;
92 | }
93 |
94 | // Iterate over the points of the dataset each time reading a batch of 8
95 | // points
96 | for (i = 0; i < (chunkSize / chunk); i++) {
97 | int offset = (i * chunk) * numFeaturesPlusOne;
98 |
99 | // Reading the features of the dataset
100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) {
101 | #pragma HLS pipeline II = 1
102 | if (j == numFeaturesPlusOne) {
103 | j = 0;
104 | c++;
105 | }
106 | features[c][j] = _data[offset + cj];
107 | }
108 |
109 | // Computing the algorithm's dot product
110 | for (k = 0; k < numClasses; k++) {
111 | #pragma HLS pipeline II = 1
112 | for (c = 0; c < chunk; c++) {
113 | for (t = 0; t < vectorSize; t++) {
114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32);
115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32);
116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat;
117 | }
118 | }
119 | }
120 |
121 | for (j = 1; j < numFeaturesPlusOne; j++) {
122 | for (k = 0; k < numClassesMin; k++) {
123 | #pragma HLS pipeline II = 1
124 | for (c = 0; c < chunk; c++) {
125 | for (t = 0; t < vectorSize; t++) {
126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32);
128 | lin[k][c * vectorSize + t] +=
129 | converter1.asFloat * converter2.asFloat;
130 | }
131 | }
132 | }
133 | }
134 |
135 | for (k = 0; k < numClasses; k++) {
136 | #pragma HLS pipeline II = 1
137 | for (c = 0; c < chunk; c++) {
138 | prd[c][k] =
139 | 1.0 /
140 | (1.0 +
141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] +
142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] +
143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] +
144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] +
145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] +
146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] +
147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] +
148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15])));
149 | }
150 | }
151 |
152 | // Reading the dataset labels and updating the predictions
153 | float8 labels = _labels[i];
154 | for (c = 0; c < chunk; c++) {
155 | #pragma HLS unroll
156 | int label = labels.range((c + 1) * 32 - 1, c * 32);
157 | prd[c][label] -= 1.0;
158 | }
159 |
160 | // Compute the output gradients
161 | for (j = 0; j < numFeaturesPlusOne; j++) {
162 | for (k = 0; k < numClassesMin; k++) {
163 | #pragma HLS pipeline II = 1
164 | for (c = 0; c < chunk; c++) {
165 | for (t = 0; t < vectorSize; t++) {
166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32);
168 | converter2.asFloat += prd[c][k] * converter1.asFloat;
169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt;
170 | }
171 | }
172 | }
173 | }
174 | }
175 |
176 | // Write back gradients
177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
178 | kj++, j++) {
179 | #pragma HLS pipeline II = 1
180 | if (j == numFeaturesPlusOne) {
181 | j = 0;
182 | k++;
183 | }
184 | _gradients[kj] = gradients[k][j];
185 | }
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
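For readability, here is a plain C++ software model of the per-point update that Gradients_0 (and its three clones below) performs, with the 512-bit word packing and HLS pragmas stripped away. This is our own hedged restatement of the loop nest, not project source:

```cpp
#include <cmath>
#include <vector>

// Software model of one Gradients invocation: one-vs-all logistic regression
// gradient accumulation over a chunk of points.
//   data:      chunkSize x (numFeatures + 1) features (bias column included)
//   labels:    chunkSize true-class indices
//   weights:   numClasses x (numFeatures + 1)
//   gradients: numClasses x (numFeatures + 1), assumed zero-initialized
void GradientsModel(const std::vector<std::vector<float>> &data,
                    const std::vector<int> &labels,
                    const std::vector<std::vector<float>> &weights,
                    std::vector<std::vector<float>> &gradients) {
  for (size_t i = 0; i < data.size(); i++) {
    for (size_t k = 0; k < weights.size(); k++) {
      // Dot product of the point with the class weights ("lin").
      float lin = 0.0f;
      for (size_t j = 0; j < data[i].size(); j++)
        lin += weights[k][j] * data[i][j];

      // Sigmoid prediction ("prd"), minus 1 for the true class.
      float prd = 1.0f / (1.0f + std::exp(-lin));
      if ((int)k == labels[i])
        prd -= 1.0f;

      // Accumulate the weighted features into the gradients.
      for (size_t j = 0; j < data[i].size(); j++)
        gradients[k][j] += prd * data[i][j];
    }
  }
}
```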
/kernel_srcs/Gradients_1.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <ap_int.h>
18 | #include <math.h>
19 |
20 | #define chunk 8
21 | #define numClassesMax 64
22 | #define numFeaturesPlusOneMax 128
23 | #define vectorSize 16
24 |
25 | typedef ap_int<256> float8;
26 | typedef ap_int<512> float16;
27 |
28 | union {
29 | int asInt;
30 | float asFloat;
31 | } converter1, converter2;
32 |
33 | // This function represents a Logistic Regression HLS kernel.
34 | // The kernel is able to train a model of up to 64 classes and 2047 features.
35 | // Maximum bandwidth is used for the M_AXI interfaces where applicable.
36 |
37 | extern "C" {
38 | void Gradients_1(float8 *_labels, float16 *_data, float16 *_weights,
39 | float16 *_gradients, int numClasses, int numFeatures,
40 | int chunkSize) {
41 |
42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control
47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control
48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control
49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control
50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control
51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
53 | #pragma HLS INTERFACE s_axilite port = return bundle = control
54 |
55 | float16 features[chunk][numFeaturesPlusOneMax],
56 | weights[numClassesMax][numFeaturesPlusOneMax],
57 | gradients[numClassesMax][numFeaturesPlusOneMax];
58 | float lin[numClassesMax][chunk * vectorSize];
59 | float prd[chunk][numClassesMax];
60 |
61 | // Using URAMs for features, weights and gradients buffers
62 | #pragma HLS resource variable = features core = XPM_MEMORY uram
63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram
64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram
65 |
66 | // Partitioning the local arrays
67 | #pragma HLS array_partition variable = features complete dim = 1
68 | #pragma HLS array_partition variable = lin complete dim = 2
69 | #pragma HLS array_partition variable = prd complete dim = 1
70 |
71 | // Compute the number of float16 words per data point
72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 32 -> numFeaturesPlusOne =
73 | // 32 / 16 = 2)
74 | int numFeaturesPlusOne =
75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4;
76 | // Define a minimum of 13 for numClassesMin; it is used to avoid loop-carried
77 | // dependencies in some of the pipelined loops
78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses;
79 |
80 | int c, i, j, k, t;
81 |
82 | // Reading weights and filling gradients with zeros
83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
84 | kj++, j++) {
85 | #pragma HLS pipeline II = 1
86 | if (j == numFeaturesPlusOne) {
87 | j = 0;
88 | k++;
89 | }
90 | weights[k][j] = _weights[kj];
91 | gradients[k][j] = 0;
92 | }
93 |
94 | // Iterate over the points of the dataset each time reading a batch of 8
95 | // points
96 | for (i = 0; i < (chunkSize / chunk); i++) {
97 | int offset = (i * chunk) * numFeaturesPlusOne;
98 |
99 | // Reading the features of the dataset
100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) {
101 | #pragma HLS pipeline II = 1
102 | if (j == numFeaturesPlusOne) {
103 | j = 0;
104 | c++;
105 | }
106 | features[c][j] = _data[offset + cj];
107 | }
108 |
109 | // Computing the algorithm's dot product
110 | for (k = 0; k < numClasses; k++) {
111 | #pragma HLS pipeline II = 1
112 | for (c = 0; c < chunk; c++) {
113 | for (t = 0; t < vectorSize; t++) {
114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32);
115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32);
116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat;
117 | }
118 | }
119 | }
120 |
121 | for (j = 1; j < numFeaturesPlusOne; j++) {
122 | for (k = 0; k < numClassesMin; k++) {
123 | #pragma HLS pipeline II = 1
124 | for (c = 0; c < chunk; c++) {
125 | for (t = 0; t < vectorSize; t++) {
126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32);
128 | lin[k][c * vectorSize + t] +=
129 | converter1.asFloat * converter2.asFloat;
130 | }
131 | }
132 | }
133 | }
134 |
135 | for (k = 0; k < numClasses; k++) {
136 | #pragma HLS pipeline II = 1
137 | for (c = 0; c < chunk; c++) {
138 | prd[c][k] =
139 | 1.0 /
140 | (1.0 +
141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] +
142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] +
143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] +
144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] +
145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] +
146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] +
147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] +
148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15])));
149 | }
150 | }
151 |
152 | // Reading the dataset labels and updating the predictions
153 | float8 labels = _labels[i];
154 | for (c = 0; c < chunk; c++) {
155 | #pragma HLS unroll
156 | int label = labels.range((c + 1) * 32 - 1, c * 32);
157 | prd[c][label] -= 1.0;
158 | }
159 |
160 | // Compute the output gradients
161 | for (j = 0; j < numFeaturesPlusOne; j++) {
162 | for (k = 0; k < numClassesMin; k++) {
163 | #pragma HLS pipeline II = 1
164 | for (c = 0; c < chunk; c++) {
165 | for (t = 0; t < vectorSize; t++) {
166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32);
168 | converter2.asFloat += prd[c][k] * converter1.asFloat;
169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt;
170 | }
171 | }
172 | }
173 | }
174 | }
175 |
176 | // Write back gradients
177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
178 | kj++, j++) {
179 | #pragma HLS pipeline II = 1
180 | if (j == numFeaturesPlusOne) {
181 | j = 0;
182 | k++;
183 | }
184 | _gradients[kj] = gradients[k][j];
185 | }
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_2.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <ap_int.h>
18 | #include <math.h>
19 |
20 | #define chunk 8
21 | #define numClassesMax 64
22 | #define numFeaturesPlusOneMax 128
23 | #define vectorSize 16
24 |
25 | typedef ap_int<256> float8;
26 | typedef ap_int<512> float16;
27 |
28 | union {
29 | int asInt;
30 | float asFloat;
31 | } converter1, converter2;
32 |
33 | // This function represents a Logistic Regression HLS kernel.
34 | // The kernel is able to train a model of up to 64 classes and 2047 features.
35 | // Maximum bandwidth is used for the M_AXI interfaces where applicable.
36 |
37 | extern "C" {
38 | void Gradients_2(float8 *_labels, float16 *_data, float16 *_weights,
39 | float16 *_gradients, int numClasses, int numFeatures,
40 | int chunkSize) {
41 |
42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control
47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control
48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control
49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control
50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control
51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
53 | #pragma HLS INTERFACE s_axilite port = return bundle = control
54 |
55 | float16 features[chunk][numFeaturesPlusOneMax],
56 | weights[numClassesMax][numFeaturesPlusOneMax],
57 | gradients[numClassesMax][numFeaturesPlusOneMax];
58 | float lin[numClassesMax][chunk * vectorSize];
59 | float prd[chunk][numClassesMax];
60 |
61 | // Using URAMs for features, weights and gradients buffers
62 | #pragma HLS resource variable = features core = XPM_MEMORY uram
63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram
64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram
65 |
66 | // Partitioning the local arrays
67 | #pragma HLS array_partition variable = features complete dim = 1
68 | #pragma HLS array_partition variable = lin complete dim = 2
69 | #pragma HLS array_partition variable = prd complete dim = 1
70 |
71 | // Compute the number of float16 words per data point
72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 32 -> numFeaturesPlusOne =
73 | // 32 / 16 = 2)
74 | int numFeaturesPlusOne =
75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4;
76 | // Define a minimum of 13 for numClassesMin; it is used to avoid loop-carried
77 | // dependencies in some of the pipelined loops
78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses;
79 |
80 | int c, i, j, k, t;
81 |
82 | // Reading weights and filling gradients with zeros
83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
84 | kj++, j++) {
85 | #pragma HLS pipeline II = 1
86 | if (j == numFeaturesPlusOne) {
87 | j = 0;
88 | k++;
89 | }
90 | weights[k][j] = _weights[kj];
91 | gradients[k][j] = 0;
92 | }
93 |
94 | // Iterate over the points of the dataset each time reading a batch of 8
95 | // points
96 | for (i = 0; i < (chunkSize / chunk); i++) {
97 | int offset = (i * chunk) * numFeaturesPlusOne;
98 |
99 | // Reading the features of the dataset
100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) {
101 | #pragma HLS pipeline II = 1
102 | if (j == numFeaturesPlusOne) {
103 | j = 0;
104 | c++;
105 | }
106 | features[c][j] = _data[offset + cj];
107 | }
108 |
109 | // Computing the algorithm's dot product
110 | for (k = 0; k < numClasses; k++) {
111 | #pragma HLS pipeline II = 1
112 | for (c = 0; c < chunk; c++) {
113 | for (t = 0; t < vectorSize; t++) {
114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32);
115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32);
116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat;
117 | }
118 | }
119 | }
120 |
121 | for (j = 1; j < numFeaturesPlusOne; j++) {
122 | for (k = 0; k < numClassesMin; k++) {
123 | #pragma HLS pipeline II = 1
124 | for (c = 0; c < chunk; c++) {
125 | for (t = 0; t < vectorSize; t++) {
126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32);
128 | lin[k][c * vectorSize + t] +=
129 | converter1.asFloat * converter2.asFloat;
130 | }
131 | }
132 | }
133 | }
134 |
135 | for (k = 0; k < numClasses; k++) {
136 | #pragma HLS pipeline II = 1
137 | for (c = 0; c < chunk; c++) {
138 | prd[c][k] =
139 | 1.0 /
140 | (1.0 +
141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] +
142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] +
143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] +
144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] +
145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] +
146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] +
147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] +
148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15])));
149 | }
150 | }
151 |
152 | // Reading the dataset labels and updating the predictions
153 | float8 labels = _labels[i];
154 | for (c = 0; c < chunk; c++) {
155 | #pragma HLS unroll
156 | int label = labels.range((c + 1) * 32 - 1, c * 32);
157 | prd[c][label] -= 1.0;
158 | }
159 |
160 | // Compute the output gradients
161 | for (j = 0; j < numFeaturesPlusOne; j++) {
162 | for (k = 0; k < numClassesMin; k++) {
163 | #pragma HLS pipeline II = 1
164 | for (c = 0; c < chunk; c++) {
165 | for (t = 0; t < vectorSize; t++) {
166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32);
168 | converter2.asFloat += prd[c][k] * converter1.asFloat;
169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt;
170 | }
171 | }
172 | }
173 | }
174 | }
175 |
176 | // Write back gradients
177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
178 | kj++, j++) {
179 | #pragma HLS pipeline II = 1
180 | if (j == numFeaturesPlusOne) {
181 | j = 0;
182 | k++;
183 | }
184 | _gradients[kj] = gradients[k][j];
185 | }
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_3.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <ap_int.h>
18 | #include <math.h>
19 |
20 | #define chunk 8
21 | #define numClassesMax 64
22 | #define numFeaturesPlusOneMax 128
23 | #define vectorSize 16
24 |
25 | typedef ap_int<256> float8;
26 | typedef ap_int<512> float16;
27 |
28 | union {
29 | int asInt;
30 | float asFloat;
31 | } converter1, converter2;
32 |
33 | // This function represents a Logistic Regression HLS kernel.
34 | // The kernel is able to train a model of up to 64 classes and 2047 features.
35 | // Maximum bandwidth is used for the M_AXI interfaces where applicable.
36 |
37 | extern "C" {
38 | void Gradients_3(float8 *_labels, float16 *_data, float16 *_weights,
39 | float16 *_gradients, int numClasses, int numFeatures,
40 | int chunkSize) {
41 |
42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control
47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control
48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control
49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control
50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control
51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
53 | #pragma HLS INTERFACE s_axilite port = return bundle = control
54 |
55 | float16 features[chunk][numFeaturesPlusOneMax],
56 | weights[numClassesMax][numFeaturesPlusOneMax],
57 | gradients[numClassesMax][numFeaturesPlusOneMax];
58 | float lin[numClassesMax][chunk * vectorSize];
59 | float prd[chunk][numClassesMax];
60 |
61 | // Using URAMs for features, weights and gradients buffers
62 | #pragma HLS resource variable = features core = XPM_MEMORY uram
63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram
64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram
65 |
66 | // Partitioning the local arrays
67 | #pragma HLS array_partition variable = features complete dim = 1
68 | #pragma HLS array_partition variable = lin complete dim = 2
69 | #pragma HLS array_partition variable = prd complete dim = 1
70 |
71 | // Compute the number of float16 words per data point
72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 32 -> numFeaturesPlusOne =
73 | // 32 / 16 = 2)
74 | int numFeaturesPlusOne =
75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4;
76 | // Define a minimum of 13 for numClassesMin; it is used to avoid loop-carried
77 | // dependencies in some of the pipelined loops
78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses;
79 |
80 | int c, i, j, k, t;
81 |
82 | // Reading weights and filling gradients with zeros
83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
84 | kj++, j++) {
85 | #pragma HLS pipeline II = 1
86 | if (j == numFeaturesPlusOne) {
87 | j = 0;
88 | k++;
89 | }
90 | weights[k][j] = _weights[kj];
91 | gradients[k][j] = 0;
92 | }
93 |
94 | // Iterate over the points of the dataset each time reading a batch of 8
95 | // points
96 | for (i = 0; i < (chunkSize / chunk); i++) {
97 | int offset = (i * chunk) * numFeaturesPlusOne;
98 |
99 | // Reading the features of the dataset
100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) {
101 | #pragma HLS pipeline II = 1
102 | if (j == numFeaturesPlusOne) {
103 | j = 0;
104 | c++;
105 | }
106 | features[c][j] = _data[offset + cj];
107 | }
108 |
109 | // Computing the algorithm's dot product
110 | for (k = 0; k < numClasses; k++) {
111 | #pragma HLS pipeline II = 1
112 | for (c = 0; c < chunk; c++) {
113 | for (t = 0; t < vectorSize; t++) {
114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32);
115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32);
116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat;
117 | }
118 | }
119 | }
120 |
121 | for (j = 1; j < numFeaturesPlusOne; j++) {
122 | for (k = 0; k < numClassesMin; k++) {
123 | #pragma HLS pipeline II = 1
124 | for (c = 0; c < chunk; c++) {
125 | for (t = 0; t < vectorSize; t++) {
126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32);
128 | lin[k][c * vectorSize + t] +=
129 | converter1.asFloat * converter2.asFloat;
130 | }
131 | }
132 | }
133 | }
134 |
135 | for (k = 0; k < numClasses; k++) {
136 | #pragma HLS pipeline II = 1
137 | for (c = 0; c < chunk; c++) {
138 | prd[c][k] =
139 | 1.0 /
140 | (1.0 +
141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] +
142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] +
143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] +
144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] +
145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] +
146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] +
147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] +
148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15])));
149 | }
150 | }
151 |
152 | // Reading the dataset labels and updating the predictions
153 | float8 labels = _labels[i];
154 | for (c = 0; c < chunk; c++) {
155 | #pragma HLS unroll
156 | int label = labels.range((c + 1) * 32 - 1, c * 32);
157 | prd[c][label] -= 1.0;
158 | }
159 |
160 | // Compute the output gradients
161 | for (j = 0; j < numFeaturesPlusOne; j++) {
162 | for (k = 0; k < numClassesMin; k++) {
163 | #pragma HLS pipeline II = 1
164 | for (c = 0; c < chunk; c++) {
165 | for (t = 0; t < vectorSize; t++) {
166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32);
167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32);
168 | converter2.asFloat += prd[c][k] * converter1.asFloat;
169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt;
170 | }
171 | }
172 | }
173 | }
174 | }
175 |
176 | // Write back gradients
177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
178 | kj++, j++) {
179 | #pragma HLS pipeline II = 1
180 | if (j == numFeaturesPlusOne) {
181 | j = 0;
182 | k++;
183 | }
184 | _gradients[kj] = gradients[k][j];
185 | }
186 | }
187 | }
188 |
--------------------------------------------------------------------------------
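A note on the kernel above: its dot-product loops unpack each 512-bit float16 word into sixteen 32-bit lanes with ap_int's range() method and reinterpret every lane as an IEEE-754 float through an int/float converter union. A minimal host-side C++ sketch of the same pack/unpack trick (the union and array names here are illustrative, not taken from the kernel sources):

    #include <stdint.h>
    #include <stdio.h>

    // Same idea as the kernel's converter union: one 32-bit lane viewed
    // either as raw bits or as a float.
    union float_int {
      uint32_t asInt;
      float asFloat;
    };

    int main() {
      uint32_t word[16]; // a 512-bit "float16" word modelled as 16 raw lanes

      for (int t = 0; t < 16; t++) {
        float_int c;
        c.asFloat = 0.5f * t; // pack: write the float, keep the raw bits
        word[t] = c.asInt;
      }

      float sum = 0.0f;
      for (int t = 0; t < 16; t++) {
        float_int c;
        c.asInt = word[t]; // unpack: the analogue of .range((t+1)*32-1, t*32)
        sum += c.asFloat;  // accumulate, as the lin[][] updates do
      }

      printf("sum = %f\n", sum); // 60.0 = 0.5 * (0 + 1 + ... + 15)
      return 0;
    }
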
/host_srcs/inaccel/runtime.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include "runtime.h"
18 |
19 | // Packs a world struct.
20 | cl_world PackWorld(_cl_world *_world) { return (cl_world)_world; }
21 |
22 | // Unpacks a world struct.
23 | _cl_world *UnpackWorld(cl_world world) { return (_cl_world *)world; }
24 |
25 | // Packs an engine struct.
26 | cl_engine PackEngine(_cl_engine *_engine) { return (cl_engine)_engine; }
27 |
28 | // Unpacks an engine struct.
29 | _cl_engine *UnpackEngine(cl_engine engine) { return (_cl_engine *)engine; }
30 |
31 | // Returns the world associated with an engine.
32 | cl_world EngineToWorld(cl_engine engine) { return UnpackEngine(engine)->world; }
33 |
34 | // Creates the world struct.
35 | cl_world CreateWorld() {
36 | _cl_world *_world = (_cl_world *)malloc(sizeof(_cl_world));
37 |
38 | return PackWorld(_world);
39 | }
40 |
41 | // Obtains the platform id.
42 | void GetPlatformID(cl_world world) {
43 | _cl_world *_world = UnpackWorld(world);
44 |
45 | _world->platform_id = INclGetPlatformID();
46 | }
47 |
48 | // Obtains the specified device id.
49 | void GetDeviceID(cl_world world, cl_uint id) {
50 | _cl_world *_world = UnpackWorld(world);
51 |
52 | _world->device_id = INclGetDeviceID(_world->platform_id, id);
53 | }
54 |
55 | // Creates the context.
56 | void CreateContext(cl_world world) {
57 | _cl_world *_world = UnpackWorld(world);
58 |
59 | _world->context = INclCreateContext(_world->device_id);
60 | }
61 |
62 | // Creates a program with the specified name.
63 | void CreateProgram(cl_world world, const char *bitstream_name) {
64 | _cl_world *_world = UnpackWorld(world);
65 |
66 | _world->program = INclCreateProgramWithBinary(
67 | _world->context, 1, &_world->device_id, bitstream_name);
68 |
69 | INclBuildProgram(_world->program);
70 | }
71 |
72 | // Creates a command queue.
73 | cl_command_queue CreateCommandQueue(cl_world world) {
74 | _cl_world *_world = UnpackWorld(world);
75 |
76 | return INclCreateCommandQueue(_world->context, _world->device_id);
77 | }
78 |
79 | // Blocks until all tasks in a command queue have been completed.
80 | void BlockCommandQueue(cl_command_queue command_queue) {
81 | INclFlush(command_queue);
82 | INclFinish(command_queue);
83 | }
84 |
85 | // Releases a command queue.
86 | void ReleaseCommandQueue(cl_command_queue command_queue) {
87 | BlockCommandQueue(command_queue);
88 |
89 | INclReleaseCommandQueue(command_queue);
90 | }
91 |
92 | // Allocates a memory buffer.
93 | void *CreateBuffer(cl_world world, size_t size, cl_uint memory) {
94 | _cl_world *_world = UnpackWorld(world);
95 |
96 |   cl_uint CL_MEM_EXT_PTR = 1u << 31;
97 |
98 | typedef struct {
99 | unsigned flags;
100 | void *obj;
101 | void *param;
102 | } cl_mem_ext_ptr_t;
103 |
104 | cl_uint CL_MEMORY = 1 << memory;
105 |
106 | cl_mem_ext_ptr_t buffer;
107 | buffer.flags = CL_MEMORY;
108 | buffer.obj = NULL;
109 | buffer.param = 0;
110 |
111 | return (void *)INclCreateBuffer(
112 | _world->context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR, size, &buffer);
113 | }
114 |
115 | // Enqueues a memory copy operation to device.
116 | void EnqueueMemcpyTo(cl_command_queue command_queue, void *dst_ptr,
117 | size_t offset, void *src_ptr, size_t size) {
118 | INclEnqueueWriteBuffer(command_queue, (cl_mem)dst_ptr, offset, size, src_ptr,
119 | 0, NULL, NULL);
120 | }
121 |
122 | // Enqueues a memory copy operation from device.
123 | void EnqueueMemcpyFrom(cl_command_queue command_queue, void *src_ptr,
124 | size_t offset, void *dst_ptr, size_t size) {
125 | INclEnqueueReadBuffer(command_queue, (cl_mem)src_ptr, offset, size, dst_ptr,
126 | 0, NULL, NULL);
127 | }
128 |
129 | // Frees a memory buffer.
130 | void ReleaseBuffer(cl_world world, void *ptr) {
131 | INclReleaseMemObject((cl_mem)ptr);
132 | }
133 |
134 | // Creates a kernel with the specified name.
135 | cl_kernel CreateKernel(cl_world world, const char *kernel_name) {
136 | _cl_world *_world = UnpackWorld(world);
137 |
138 | return INclCreateKernel(_world->program, kernel_name);
139 | }
140 |
141 | // Sets a pointer kernel argument.
142 | void SetKernelArgPointer(cl_kernel kernel, cl_uint arg_index,
143 | const void *arg_value) {
144 | INclSetKernelArg(kernel, arg_index, sizeof(cl_mem), &arg_value);
145 | }
146 |
147 | // Sets a scalar kernel argument.
148 | void SetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size,
149 | const void *arg_value) {
150 | INclSetKernelArg(kernel, arg_index, arg_size, arg_value);
151 | }
152 |
153 | // Enqueues a kernel operation (Task mode).
154 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel) {
155 | INclEnqueueTask(command_queue, kernel, 0, NULL, NULL);
156 | }
157 |
158 | // Enqueues a kernel operation (NDRangeKernel mode).
159 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel,
160 | const size_t *global_work_size,
161 | const size_t *local_work_size) {
162 | INclEnqueueNDRangeKernel(command_queue, kernel, 3, global_work_size,
163 | local_work_size, 0, NULL, NULL);
164 | }
165 |
166 | // Releases a kernel.
167 | void ReleaseKernel(cl_kernel kernel) { INclReleaseKernel(kernel); }
168 |
169 | // Creates an engine struct with the specified name.
170 | cl_engine CreateEngine(cl_world world, const char *kernel_name) {
171 | _cl_engine *_engine = (_cl_engine *)malloc(sizeof(_cl_engine));
172 |
173 | _engine->world = world;
174 |
175 | _engine->command_queue = CreateCommandQueue(world);
176 | _engine->kernel = CreateKernel(world, kernel_name);
177 |
178 | return PackEngine(_engine);
179 | }
180 |
181 | // Blocks until all tasks in an engine struct have been completed.
182 | void BlockEngine(cl_engine engine) {
183 | _cl_engine *_engine = UnpackEngine(engine);
184 |
185 | BlockCommandQueue(_engine->command_queue);
186 | }
187 |
188 | // Sets a pointer engine struct argument.
189 | void SetEngineArgPointer(cl_engine engine, cl_uint arg_index,
190 | const void *arg_value) {
191 | _cl_engine *_engine = UnpackEngine(engine);
192 |
193 | SetKernelArgPointer(_engine->kernel, arg_index, arg_value);
194 | }
195 |
196 | // Sets a scalar engine struct argument.
197 | void SetEngineArg(cl_engine engine, cl_uint arg_index, size_t arg_size,
198 | const void *arg_value) {
199 | _cl_engine *_engine = UnpackEngine(engine);
200 |
201 | SetKernelArg(_engine->kernel, arg_index, arg_size, arg_value);
202 | }
203 |
204 | // Enqueues an engine struct operation (Task mode).
205 | void EnqueueEngine(cl_engine engine) {
206 | _cl_engine *_engine = UnpackEngine(engine);
207 |
208 | EnqueueKernel(_engine->command_queue, _engine->kernel);
209 | }
210 |
211 | // Enqueues an engine struct operation (NDRangeKernel mode).
212 | void EnqueueEngine(cl_engine engine, const size_t *global_work_size,
213 | const size_t *local_work_size) {
214 | _cl_engine *_engine = UnpackEngine(engine);
215 |
216 | EnqueueKernel(_engine->command_queue, _engine->kernel, global_work_size,
217 | local_work_size);
218 | }
219 |
220 | // Releases an engine struct.
221 | void ReleaseEngine(cl_engine engine) {
222 | _cl_engine *_engine = UnpackEngine(engine);
223 |
224 | ReleaseCommandQueue(_engine->command_queue);
225 | ReleaseKernel(_engine->kernel);
226 |
227 | free(_engine);
228 | }
229 |
230 | // Releases a program.
231 | void ReleaseProgram(cl_world world) {
232 | _cl_world *_world = UnpackWorld(world);
233 |
234 | INclReleaseProgram(_world->program);
235 | }
236 |
237 | // Releases the context.
238 | void ReleaseContext(cl_world world) {
239 | _cl_world *_world = UnpackWorld(world);
240 |
241 | INclReleaseContext(_world->context);
242 | }
243 |
244 | // Releases the world struct.
245 | void ReleaseWorld(cl_world world) {
246 | _cl_world *_world = UnpackWorld(world);
247 |
248 | free(_world);
249 | }
250 |
--------------------------------------------------------------------------------
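runtime.cpp layers two plain structs over the INcl wrappers: a world that owns the platform, device, context and program, and an engine that pairs one kernel with a private command queue. A minimal sketch of the intended call sequence through the InAccel facade, mirroring the usage in host_srcs/LogisticRegression.cpp (the buffer size and memory bank here are illustrative):

    #include "inaccel/runtime-api.h"

    int main() {
      // One world per device: platform, device, context, and later the program.
      cl_world world = InAccel::create_world(0);
      InAccel::create_program(world, "Gradients.xclbin");

      // One engine per kernel instance: a kernel plus its own command queue.
      cl_engine engine = InAccel::create_engine(world, "Gradients_0");

      // A device buffer in memory bank 0, filled from a host array.
      float host[16] = {0};
      void *dev = InAccel::malloc(world, sizeof(host), 0);
      InAccel::memcpy_to(world, dev, 0, host, sizeof(host));

      InAccel::set_engine_arg(engine, 0, dev);
      InAccel::run_engine(engine);   // enqueue in Task mode
      InAccel::await_engine(engine); // flush and finish the queue

      InAccel::memcpy_from(world, dev, 0, host, sizeof(host));

      // Tear down in reverse order of creation.
      InAccel::free(world, dev);
      InAccel::release_engine(engine);
      InAccel::release_program(world);
      InAccel::release_world(world);
      return 0;
    }
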
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/host_srcs/LogisticRegression.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #ifndef _TEST_
18 | #define _accel_ 1
19 | #else
20 | #define _accel_ 0
21 | #endif
22 |
23 | #include <fstream>
24 | #include <iostream>
25 | #include <malloc.h>
26 | #include <math.h>
27 | #include <sstream>
28 | #include <stdio.h>
29 | #include <stdlib.h>
30 | #include <sys/time.h>
31 | #include <vector>
32 |
33 | #include "inaccel/runtime-api.h"
34 |
35 | using namespace std;
36 |
37 | // Dataset specific options
38 | // Change below definitions according to your input dataset
39 | #define NUMCLASSES 26
40 | #define NUMFEATURES 784
41 | #define NUMEXAMPLES 124800
42 | #define NUM_KERNELS 4
43 |
44 | // Function to allocate an aligned memory buffer
45 | void *INalligned_malloc(size_t size) {
46 | void *ptr = memalign(4096, size);
47 | if (!ptr) {
48 | printf("Error: alligned_malloc\n");
49 | exit(EXIT_FAILURE);
50 | }
51 |
52 | return ptr;
53 | }
54 |
55 | // Function to split a string on a set of delimiter characters
56 | vector<string> split(const string &s) {
57 |   vector<string> elements;
58 | stringstream ss(s);
59 | string item;
60 |
61 | while (getline(ss, item)) {
62 | size_t prev = 0;
63 | size_t pos;
64 |
65 | while ((pos = item.find_first_of(" (,[])=", prev)) != std::string::npos) {
66 | if (pos > prev)
67 | elements.push_back(item.substr(prev, pos - prev));
68 | prev = pos + 1;
69 | }
70 |
71 | if (prev < item.length())
72 | elements.push_back(item.substr(prev, std::string::npos));
73 | }
74 |
75 | return elements;
76 | }
77 |
78 | // Reads the input dataset and sets features and labels buffers accordingly
79 | void read_input(string filename, float *features, int *labels, int numFeatures,
80 | int numExamples) {
81 | ifstream train;
82 | train.open(filename.c_str());
83 |
84 | string line;
85 | int i;
86 | int n = 0;
87 |
88 | while (getline(train, line) && (n < numExamples)) {
89 | if (line.length()) {
90 |       vector<string> tokens = split(line);
91 | features[n * (16 + numFeatures) + numFeatures] = 1.0;
92 | labels[n] = atoi(tokens[0].c_str());
93 | for (i = 0; i < numFeatures; i++) {
94 | features[n * (16 + numFeatures) + i] = atof(tokens[i + 1].c_str());
95 | }
96 | n++;
97 | }
98 | }
99 |
100 | train.close();
101 | }
102 |
103 | // Writes a trained model to the specified filename
104 | void write_output(string filename, float *weights, int numClasses,
105 | int numFeatures) {
106 |
107 | ofstream results;
108 | results.open(filename.c_str());
109 |
110 | for (int k = 0; k < numClasses; k++) {
111 | results << weights[k * (16 + numFeatures)];
112 | for (int j = 1; j < (16 + numFeatures); j++) {
113 | results << "," << weights[k * (16 + numFeatures) + j];
114 | }
115 | results << endl;
116 | }
117 |
118 | results.close();
119 | }
120 |
121 | // A simple classifier. Given a point, it returns the class with the greatest
122 | // probability
123 | int classify(float *features, float *weights, int numClasses, int numFeatures) {
124 | float prob = -1.0;
125 | int prediction = -1;
126 |
127 | for (int k = 0; k < numClasses; k++) {
128 | float dot = weights[k * (16 + numFeatures) + numFeatures];
129 |
130 | for (int j = 0; j < numFeatures; j++) {
131 | dot += features[j] * weights[k * (16 + numFeatures) + j];
132 | }
133 |
134 | if (1.0 / (1.0 + exp(-dot)) > prob) {
135 | prob = 1.0 / (1.0 + exp(-dot));
136 | prediction = k;
137 | }
138 | }
139 |
140 | return prediction;
141 | }
142 |
143 | // A simple prediction function to evaluate the accuracy of a trained model
144 | void predict(string filename, float *weights, int numClasses, int numFeatures) {
145 | cout << " * LogisticRegression Testing *" << endl;
146 |
147 | float tr = 0.0;
148 | float fls = 0.0;
149 | float example[numFeatures];
150 | string line;
151 | ifstream test;
152 |
153 | test.open(filename.c_str());
154 |
155 | while (getline(test, line)) {
156 | if (line.length()) {
157 | if (line[0] != '#' && line[0] != ' ') {
158 |         vector<string> tokens = split(line);
159 |
160 | int label = (int)atof(tokens[0].c_str());
161 | for (int j = 1; j < (1 + numFeatures); j++) {
162 | example[j - 1] = atof(tokens[j].c_str());
163 | }
164 |
165 | int prediction = classify(example, weights, numClasses, numFeatures);
166 |
167 | if (prediction == label)
168 | tr++;
169 | else
170 | fls++;
171 | }
172 | }
173 | }
174 |
175 | test.close();
176 |
177 | printf(" # accuracy: %1.3f (%i/%i)\n", (tr / (tr + fls)), (int)tr,
178 | (int)(tr + fls));
179 | printf(" # true: %i\n", (int)tr);
180 | printf(" # false: %i\n", (int)fls);
181 | }
182 |
183 | // CPU implementation of Logistic Regression gradients calculation
184 | void gradients_sw(int *labels, float *features, float *weights,
185 | float *gradients, int numClasses, int numFeatures,
186 | int numExamples) {
187 | for (int k = 0; k < numClasses; k++) {
188 | for (int j = 0; j < (16 + numFeatures); j++) {
189 | gradients[k * (16 + numFeatures) + j] = 0.0;
190 | }
191 | }
192 |
193 | for (int i = 0; i < numExamples; i++) {
194 | for (int k = 0; k < numClasses; k++) {
195 | float dot = weights[k * (16 + numFeatures) + numFeatures];
196 |
197 | for (int j = 0; j < numFeatures; j++) {
198 | dot += weights[k * (16 + numFeatures) + j] *
199 | features[i * (16 + numFeatures) + j];
200 | }
201 |
202 | float dif = 1.0 / (1.0 + exp(-dot));
203 | if (labels[i] == k)
204 | dif -= 1;
205 |
206 | for (int j = 0; j < (16 + numFeatures); j++) {
207 | gradients[k * (16 + numFeatures) + j] +=
208 | dif * features[i * (16 + numFeatures) + j];
209 | }
210 | }
211 | }
212 | }
213 |
214 | int main(int argc, char *argv[]) {
215 | if (argc != 2) {
216 | cout << "Usage: " << argv[0] << " " << endl;
217 | exit(-1);
218 | }
219 |
220 | struct timeval start, end;
221 |
222 | float alpha = 0.3f;
223 | float gamma = 0.95f;
224 | int iter = atoi(argv[1]);
225 |
226 | // Set up the specifications of the model to be trained
227 | int numClasses = NUMCLASSES;
228 | int numFeatures = NUMFEATURES;
229 | int numExamples = NUMEXAMPLES;
230 |
231 |   // Split the dataset among the available kernels
232 | int chunkSize = numExamples / NUM_KERNELS;
233 |
234 |   // Allocate host buffers for the labels and features of the dataset, as well
235 |   // as for the weights and gradients of the model to be trained, plus a
236 |   // velocity buffer for the momentum-based weight updates
237 | int *labels = (int *)INalligned_malloc(numExamples * sizeof(int));
238 | float *features = (float *)INalligned_malloc(
239 | numExamples * (16 + numFeatures) * sizeof(float));
240 | float *weights = (float *)INalligned_malloc(numClasses * (16 + numFeatures) *
241 | sizeof(float));
242 | float *gradients = (float *)INalligned_malloc(
243 | numClasses * (16 + numFeatures) * sizeof(float));
244 | float *velocity = (float *)INalligned_malloc(numClasses * (1 + numFeatures) *
245 | sizeof(float));
246 |
247 | // Specify train and test input files as well as output model file
248 | string trainFile = "data/letters_csv_train.dat";
249 | string testFile = "data/letters_csv_test.dat";
250 | string modelFile = "data/weights.out";
251 |
252 | // Read the input dataset
253 | cout << "! Reading train file..." << endl;
254 | read_input(trainFile, features, labels, numFeatures, numExamples);
255 |
256 |   // Initialize the model weights and the velocity buffer to zero
257 |   for (int i = 0; i < numClasses * (16 + numFeatures); i++) weights[i] = 0.0;
258 |   for (int i = 0; i < numClasses * (1 + numFeatures); i++) velocity[i] = 0.0;
259 |
260 | if (_accel_) {
261 | // Invoke the hardware accelerated implementation of the algorithm
262 |
263 | cl_engine engine[NUM_KERNELS];
264 | float *ffeatures[NUM_KERNELS], *fweights[NUM_KERNELS];
265 | float *fgradients[NUM_KERNELS], *grads[NUM_KERNELS];
266 | int *flabels[NUM_KERNELS];
267 |
268 | size_t labels_size = chunkSize * sizeof(int);
269 | size_t features_size = chunkSize * (numFeatures + 16) * sizeof(float);
270 | size_t weights_size = numClasses * (numFeatures + 16) * sizeof(float);
271 |
272 | // Initialize the FPGA world
273 | cl_world world = InAccel::create_world(0);
274 | // Program the FPGA device using the provided bitstream
275 | InAccel::create_program(world, "Gradients.xclbin");
276 |
277 |     // Instantiate the kernels of the bitstream. Each engine holds a kernel
278 | // along with its command queue
279 | engine[0] = InAccel::create_engine(world, "Gradients_0");
280 | engine[1] = InAccel::create_engine(world, "Gradients_1");
281 | engine[2] = InAccel::create_engine(world, "Gradients_2");
282 | engine[3] = InAccel::create_engine(world, "Gradients_3");
283 |
284 | // Memcpy to each memory bank the corresponding part of the input dataset
285 | for (int i = 0; i < NUM_KERNELS; i++) {
286 | flabels[i] = (int *)InAccel::malloc(world, labels_size, i);
287 | InAccel::memcpy_to(world, flabels[i], 0, labels + i * chunkSize,
288 | labels_size);
289 | ffeatures[i] = (float *)InAccel::malloc(world, features_size, i);
290 | InAccel::memcpy_to(world, ffeatures[i], 0,
291 | features + (i * chunkSize * (16 + numFeatures)),
292 | features_size);
293 |
294 | fweights[i] = (float *)InAccel::malloc(world, weights_size, i);
295 |
296 | fgradients[i] = (float *)InAccel::malloc(world, weights_size, i);
297 | grads[i] = (float *)INalligned_malloc(weights_size);
298 | }
299 |
300 | gettimeofday(&start, NULL);
301 | // Start the iterative part for the training of the algorithm
302 | for (int t = 0; t < iter; t++) {
303 | for (int i = 0; i < NUM_KERNELS; i++) {
304 | // Memcpy to DDR the weights of the model
305 | InAccel::memcpy_to(world, fweights[i], 0, weights, weights_size);
306 |
307 | // Set the kernel arguments
308 | InAccel::set_engine_arg(engine[i], 0, flabels[i]);
309 | InAccel::set_engine_arg(engine[i], 1, ffeatures[i]);
310 | InAccel::set_engine_arg(engine[i], 2, fweights[i]);
311 | InAccel::set_engine_arg(engine[i], 3, fgradients[i]);
312 | InAccel::set_engine_arg(engine[i], 4, numClasses);
313 | InAccel::set_engine_arg(engine[i], 5, numFeatures);
314 | InAccel::set_engine_arg(engine[i], 6, chunkSize);
315 |
316 | // Invoke the kernel execution
317 | InAccel::run_engine(engine[i]);
318 | }
319 |
320 | // Wait for the kernels to finish
321 | for (int i = 0; i < NUM_KERNELS; i++) {
322 | InAccel::await_engine(engine[i]);
323 | }
324 |
325 | // Get the gradients as computed by the kernels
326 | for (int i = 0; i < NUM_KERNELS; i++) {
327 | InAccel::memcpy_from(world, fgradients[i], 0, grads[i], weights_size);
328 | }
329 |
330 | // Aggregate the gradients from all kernels
331 | for (int j = 0; j < numClasses * (16 + numFeatures); j++) {
332 | gradients[j] = grads[0][j];
333 | for (int i = 1; i < NUM_KERNELS; i++) {
334 | gradients[j] += grads[i][j];
335 | }
336 | }
337 |
338 |       // Compute the new weights of the model, applying momentum (velocity)
339 |       // updates for better model accuracy
340 | for (int k = 0; k < numClasses; k++) {
341 | for (int j = 0; j < (1 + numFeatures); j++) {
342 | velocity[k * (1 + numFeatures) + j] =
343 | gamma * velocity[k * (1 + numFeatures) + j] +
344 | (alpha / numExamples) * gradients[k * (16 + numFeatures) + j];
345 | weights[k * (16 + numFeatures) + j] -=
346 | velocity[k * (1 + numFeatures) + j];
347 | }
348 | }
349 | }
350 |
351 | gettimeofday(&end, NULL);
352 |
353 | // Free any allocated buffers for the FPGA device and release the allocated
354 | // kernels and command queues
355 | for (int i = 0; i < NUM_KERNELS; i++) {
356 | free(grads[i]);
357 | InAccel::free(world, fgradients[i]);
358 | InAccel::free(world, fweights[i]);
359 | InAccel::free(world, ffeatures[i]);
360 | InAccel::free(world, flabels[i]);
361 | InAccel::release_engine(engine[i]);
362 | }
363 |
364 | // Release the FPGA program
365 | InAccel::release_program(world);
366 | // Release the FPGA world
367 | InAccel::release_world(world);
368 | } else {
369 | // Invoke the software implementation of the algorithm
370 | gettimeofday(&start, NULL);
371 | for (int t = 0; t < iter; t++) {
372 | gradients_sw(labels, features, weights, gradients, numClasses,
373 | numFeatures, numExamples);
374 | for (int k = 0; k < numClasses; k++) {
375 | for (int j = 0; j < (1 + numFeatures); j++) {
376 | velocity[k * (1 + numFeatures) + j] =
377 | gamma * velocity[k * (1 + numFeatures) + j] +
378 | (alpha / numExamples) * gradients[k * (16 + numFeatures) + j];
379 | weights[k * (16 + numFeatures) + j] -=
380 | velocity[k * (1 + numFeatures) + j];
381 | }
382 | }
383 | }
384 | gettimeofday(&end, NULL);
385 | }
386 |
387 | float time_us = ((end.tv_sec * 1000000) + end.tv_usec) -
388 | ((start.tv_sec * 1000000) + start.tv_usec);
389 | float time_s = (end.tv_sec - start.tv_sec);
390 |
391 | cout << "! Time running Gradients Kernel: " << time_us / 1000 << " msec, "
392 | << time_s << " sec " << endl;
393 |
394 | // Compute the accuracy of the trained model on a given test dataset.
395 | predict(testFile, weights, numClasses, numFeatures);
396 |
397 | // Save the model to the specified user file
398 | write_output(modelFile, weights, numClasses, numFeatures);
399 |
400 | // Free any host allocated buffers
401 | free(labels);
402 | free(features);
403 | free(weights);
404 | free(gradients);
405 | free(velocity);
406 |
407 | return 0;
408 | }
409 |
--------------------------------------------------------------------------------
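In math terms, the training loop of LogisticRegression.cpp above performs full-batch gradient descent with momentum for a one-vs-all logistic model. With gamma the momentum factor, alpha the learning rate, N = numExamples, and g the gradients aggregated across the kernels, each iteration applies (a LaTeX rendering of the updates in the code):

    v_{k,j} \leftarrow \gamma \, v_{k,j} + \frac{\alpha}{N} \, g_{k,j},
    \qquad
    w_{k,j} \leftarrow w_{k,j} - v_{k,j},
    \quad \text{where} \quad
    g_{k,j} = \sum_{i=1}^{N} \Big( \sigma\big(w_k^\top x_i\big) - \mathbf{1}[y_i = k] \Big)\, x_{i,j},
    \qquad
    \sigma(z) = \frac{1}{1 + e^{-z}}.

Here each x_i carries an appended constant 1 as the intercept feature (read_input writes it at index numFeatures), which is why the host buffers are strided by (16 + numFeatures) words padded to the float16 width.
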
/host_srcs/common/INcl.cpp:
--------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <stdio.h>
18 | #include <string.h>
19 |
20 | #include "INcl.h"
21 |
22 | // Builds a program executable from the program binary.
23 | void INclBuildProgram(cl_program program) {
24 | cl_int errcode_ret = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
25 | if (errcode_ret != CL_SUCCESS) {
26 | fprintf(stderr, "Error: clBuildProgram %s (%d)\n",
27 | INclCheckErrorCode(errcode_ret), errcode_ret);
28 | throw EXIT_FAILURE;
29 | }
30 | }
31 |
32 | // Creates a buffer object.
33 | cl_mem INclCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
34 | void *host_ptr) {
35 | cl_int errcode_ret;
36 | cl_mem mem = clCreateBuffer(context, flags, size, host_ptr, &errcode_ret);
37 | if (errcode_ret != CL_SUCCESS || !mem) {
38 | fprintf(stderr, "Error: clCreateBuffer %s (%d)\n",
39 | INclCheckErrorCode(errcode_ret), errcode_ret);
40 | throw EXIT_FAILURE;
41 | }
42 |
43 | return mem;
44 | }
45 |
46 | // Create a command-queue on a specific device.
47 | cl_command_queue INclCreateCommandQueue(cl_context context,
48 | cl_device_id device) {
49 | cl_int errcode_ret;
50 | cl_command_queue command_queue =
51 | clCreateCommandQueue(context, device, 0, &errcode_ret);
52 | if (errcode_ret != CL_SUCCESS || !command_queue) {
53 | fprintf(stderr, "Error: clCreateCommandQueue %s (%d)\n",
54 | INclCheckErrorCode(errcode_ret), errcode_ret);
55 | throw EXIT_FAILURE;
56 | }
57 |
58 | return command_queue;
59 | }
60 |
61 | // Creates an OpenCL context.
62 | cl_context INclCreateContext(cl_device_id device) {
63 | cl_int errcode_ret;
64 | cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &errcode_ret);
65 | if (errcode_ret != CL_SUCCESS || !context) {
66 | fprintf(stderr, "Error: clCreateContext %s (%d)\n",
67 | INclCheckErrorCode(errcode_ret), errcode_ret);
68 | throw EXIT_FAILURE;
69 | }
70 |
71 | return context;
72 | }
73 |
74 | // Creates a kernel object.
75 | cl_kernel INclCreateKernel(cl_program program, const char *kernel_name) {
76 | cl_int errcode_ret;
77 | cl_kernel kernel = clCreateKernel(program, kernel_name, &errcode_ret);
78 | if (errcode_ret != CL_SUCCESS || !kernel) {
79 | fprintf(stderr, "Error: clCreateKernel %s (%d)\n",
80 | INclCheckErrorCode(errcode_ret), errcode_ret);
81 | throw EXIT_FAILURE;
82 | }
83 |
84 | return kernel;
85 | }
86 |
87 | // Creates a program object for a context, and loads specified binary data into
88 | // the program object.
89 | cl_program INclCreateProgramWithBinary(cl_context context, cl_uint num_devices,
90 | const cl_device_id *device_list,
91 | const char *binary_name) {
92 | FILE *file = fopen(binary_name, "rb");
93 | if (!file) {
94 | fprintf(stderr, "Error: fopen\n");
95 | throw EXIT_FAILURE;
96 | }
97 |
98 | fseek(file, 0, SEEK_END);
99 | size_t size = ftell(file);
100 | fseek(file, 0, SEEK_SET);
101 |
102 | char *temp = (char *)malloc((size + 1) * sizeof(char));
103 | if (!temp) {
104 | fprintf(stderr, "Error: malloc\n");
105 | throw EXIT_FAILURE;
106 | }
107 |
108 | if (size != fread(temp, sizeof(char), size, file)) {
109 | free(temp);
110 |
111 | fprintf(stderr, "Error: fread\n");
112 | throw EXIT_FAILURE;
113 | }
114 |
115 | fclose(file);
116 | temp[size] = 0;
117 |
118 | char *binary = temp;
119 |
120 | cl_int errcode_ret;
121 | cl_program program = clCreateProgramWithBinary(
122 | context, num_devices, device_list, &size, (const unsigned char **)&binary,
123 | NULL, &errcode_ret);
124 | if (errcode_ret != CL_SUCCESS || !program) {
125 | fprintf(stderr, "Error: clCreateProgramWithBinary %s (%d)\n",
126 | INclCheckErrorCode(errcode_ret), errcode_ret);
127 | throw EXIT_FAILURE;
128 | }
129 |
130 | free(temp);
131 |
132 | return program;
133 | }
134 |
135 | // Enqueues a command to map a region of the buffer object given by buffer into
136 | // the host address space and returns a pointer to this mapped region.
137 | void *INclEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer,
138 | cl_map_flags map_flags, size_t cb,
139 | cl_uint num_events_in_wait_list,
140 | const cl_event *event_wait_list, cl_event *event) {
141 | cl_int errcode_ret;
142 | void *ptr = clEnqueueMapBuffer(command_queue, buffer, CL_FALSE, map_flags, 0,
143 | cb, num_events_in_wait_list, event_wait_list,
144 | event, &errcode_ret);
145 | if (errcode_ret != CL_SUCCESS || !ptr) {
146 | fprintf(stderr, "Error: clEnqueueMapBuffer %s (%d)\n",
147 | INclCheckErrorCode(errcode_ret), errcode_ret);
148 | throw EXIT_FAILURE;
149 | }
150 |
151 | return ptr;
152 | }
153 |
154 | // Enqueues a command to indicate which device a set of memory objects should be
155 | // associated with.
156 | void INclEnqueueMigrateMemObjects(cl_command_queue command_queue,
157 | cl_uint num_mem_objects,
158 | const cl_mem *mem_objects,
159 | cl_mem_migration_flags flags,
160 | cl_uint num_events_in_wait_list,
161 | const cl_event *event_wait_list,
162 | cl_event *event) {
163 | cl_int errcode_ret = clEnqueueMigrateMemObjects(
164 | command_queue, num_mem_objects, mem_objects, flags,
165 | num_events_in_wait_list, event_wait_list, event);
166 | if (errcode_ret != CL_SUCCESS) {
167 | fprintf(stderr, "Error: clEnqueueMigrateMemObjects %s (%d)\n",
168 | INclCheckErrorCode(errcode_ret), errcode_ret);
169 | throw EXIT_FAILURE;
170 | }
171 | }
172 |
173 | // Enqueues a command to execute a kernel on a device.
174 | void INclEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel,
175 | cl_uint work_dim, const size_t *global_work_size,
176 | const size_t *local_work_size,
177 | cl_uint num_events_in_wait_list,
178 | const cl_event *event_wait_list,
179 | cl_event *event) {
180 | cl_int errcode_ret = clEnqueueNDRangeKernel(
181 | command_queue, kernel, work_dim, NULL, global_work_size, local_work_size,
182 | num_events_in_wait_list, event_wait_list, event);
183 | if (errcode_ret != CL_SUCCESS) {
184 | fprintf(stderr, "Error: clEnqueueNDRangeKernel %s (%d)\n",
185 | INclCheckErrorCode(errcode_ret), errcode_ret);
186 | throw EXIT_FAILURE;
187 | }
188 | }
189 |
190 | // Enqueue commands to read from a buffer object to host memory.
191 | void INclEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer,
192 | size_t offset, size_t cb, void *ptr,
193 | cl_uint num_events_in_wait_list,
194 | const cl_event *event_wait_list, cl_event *event) {
195 | cl_int errcode_ret =
196 | clEnqueueReadBuffer(command_queue, buffer, CL_FALSE, offset, cb, ptr,
197 | num_events_in_wait_list, event_wait_list, event);
198 | if (errcode_ret != CL_SUCCESS) {
199 | fprintf(stderr, "Error: clEnqueueReadBuffer %s (%d)\n",
200 | INclCheckErrorCode(errcode_ret), errcode_ret);
201 | throw EXIT_FAILURE;
202 | }
203 | }
204 |
205 | // Enqueues a command to execute a kernel on a device.
206 | void INclEnqueueTask(cl_command_queue command_queue, cl_kernel kernel,
207 | cl_uint num_events_in_wait_list,
208 | const cl_event *event_wait_list, cl_event *event) {
209 | cl_int errcode_ret = clEnqueueTask(
210 | command_queue, kernel, num_events_in_wait_list, event_wait_list, event);
211 | if (errcode_ret != CL_SUCCESS) {
212 | fprintf(stderr, "Error: clEnqueueTask %s (%d)\n",
213 | INclCheckErrorCode(errcode_ret), errcode_ret);
214 | throw EXIT_FAILURE;
215 | }
216 | }
217 |
218 | // Enqueue commands to write to a buffer object from host memory.
219 | void INclEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer,
220 | size_t offset, size_t cb, const void *ptr,
221 | cl_uint num_events_in_wait_list,
222 | const cl_event *event_wait_list, cl_event *event) {
223 | cl_int errcode_ret =
224 | clEnqueueWriteBuffer(command_queue, buffer, CL_FALSE, offset, cb, ptr,
225 | num_events_in_wait_list, event_wait_list, event);
226 | if (errcode_ret != CL_SUCCESS) {
227 | fprintf(stderr, "Error: clEnqueueWriteBuffer %s (%d)\n",
228 | INclCheckErrorCode(errcode_ret), errcode_ret);
229 | throw EXIT_FAILURE;
230 | }
231 | }
232 |
233 | // Blocks until all previously queued OpenCL commands in a command-queue are
234 | // issued to the associated device and have completed.
235 | void INclFinish(cl_command_queue command_queue) {
236 | cl_int errcode_ret = clFinish(command_queue);
237 | if (errcode_ret != CL_SUCCESS) {
238 | fprintf(stderr, "Error: clFinish %s (%d)\n",
239 | INclCheckErrorCode(errcode_ret), errcode_ret);
240 | throw EXIT_FAILURE;
241 | }
242 | }
243 |
244 | // Issues all previously queued OpenCL commands in a command-queue to the device
245 | // associated with the command-queue.
246 | void INclFlush(cl_command_queue command_queue) {
247 | cl_int errcode_ret = clFlush(command_queue);
248 | if (errcode_ret != CL_SUCCESS) {
249 | fprintf(stderr, "Error: clFlush %s (%d)\n", INclCheckErrorCode(errcode_ret),
250 | errcode_ret);
251 | throw EXIT_FAILURE;
252 | }
253 | }
254 |
255 | // Obtain specified device, if available.
256 | cl_device_id INclGetDeviceID(cl_platform_id platform, cl_uint id) {
257 |   // cl_device_id is an opaque handle; no allocation is needed here. The
258 |   // handle is simply copied out of the device list that is queried below,
259 |   // after the available devices have been counted and fetched with
260 |   // INclGetDeviceIDs.
261 |   cl_device_id device_id = NULL;
262 |
263 | cl_uint num_devices;
264 | INclGetDeviceIDs(platform, 0, NULL, &num_devices);
265 |
266 | cl_device_id *devices =
267 | (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
268 | if (!devices) {
269 | fprintf(stderr, "Error: malloc\n");
270 | throw EXIT_FAILURE;
271 | }
272 |
273 | INclGetDeviceIDs(platform, num_devices, devices, NULL);
274 |
275 | cl_uint i;
276 | for (i = 0; i < num_devices; i++) {
277 | if (i == id) {
278 | device_id = devices[i];
279 | break;
280 | }
281 | }
282 |
283 | free(devices);
284 |
285 | if (i == num_devices) {
286 | fprintf(stderr, "Error: clGetDeviceID\n");
287 | throw EXIT_FAILURE;
288 | }
289 |
290 | return device_id;
291 | }
292 |
293 | // Obtain the list of devices available on a platform.
294 | void INclGetDeviceIDs(cl_platform_id platform, cl_uint num_entries,
295 | cl_device_id *devices, cl_uint *num_devices) {
296 | cl_int errcode_ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_entries,
297 | devices, num_devices);
298 | if (errcode_ret != CL_SUCCESS) {
299 | fprintf(stderr, "Error: clGetDeviceIDs %s (%d)\n",
300 | INclCheckErrorCode(errcode_ret), errcode_ret);
301 | throw EXIT_FAILURE;
302 | }
303 | }
304 |
305 | // Get specific information about the OpenCL device.
306 | void INclGetDeviceInfo(cl_device_id device, cl_device_info param_name,
307 | size_t param_value_size, void *param_value,
308 | size_t *param_value_size_ret) {
309 | cl_int errcode_ret = clGetDeviceInfo(device, param_name, param_value_size,
310 | param_value, param_value_size_ret);
311 | if (errcode_ret != CL_SUCCESS) {
312 | fprintf(stderr, "Error: clGetDeviceInfo %s (%d)\n",
313 | INclCheckErrorCode(errcode_ret), errcode_ret);
314 | throw EXIT_FAILURE;
315 | }
316 | }
317 |
318 | // Obtain platform, if available.
319 | cl_platform_id INclGetPlatformID() {
320 |   // cl_platform_id is an opaque handle; no allocation is needed here. The
321 |   // handle is simply copied out of the platform list that is queried
322 |   // below, once a platform whose name contains "Xilinx" has been found
323 |   // among the available platforms.
324 |   cl_platform_id platform_id = NULL;
325 |
326 | cl_uint num_platforms;
327 | INclGetPlatformIDs(0, NULL, &num_platforms);
328 |
329 | cl_platform_id *platforms =
330 | (cl_platform_id *)malloc(num_platforms * sizeof(cl_platform_id));
331 | if (!platforms) {
332 | fprintf(stderr, "Error: malloc\n");
333 | throw EXIT_FAILURE;
334 | }
335 |
336 | INclGetPlatformIDs(num_platforms, platforms, NULL);
337 |
338 | cl_uint i;
339 | for (i = 0; i < num_platforms; i++) {
340 | size_t platform_name_size;
341 | INclGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL,
342 | &platform_name_size);
343 |
344 | char *platform_name = (char *)malloc(platform_name_size * sizeof(char));
345 | if (!platform_name) {
346 | fprintf(stderr, "Error: malloc\n");
347 | throw EXIT_FAILURE;
348 | }
349 |
350 | INclGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_size,
351 | platform_name, NULL);
352 |
353 | if (strstr(platform_name, "Xilinx")) {
354 | free(platform_name);
355 |
356 | platform_id = platforms[i];
357 | break;
358 | }
359 |
360 | free(platform_name);
361 | }
362 |
363 | free(platforms);
364 |
365 | if (i == num_platforms) {
366 | fprintf(stderr, "Error: clGetPlatformID\n");
367 | throw EXIT_FAILURE;
368 | }
369 |
370 | return platform_id;
371 | }
372 |
373 | // Obtain the list of platforms available.
374 | void INclGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms,
375 | cl_uint *num_platforms) {
376 | cl_int errcode_ret = clGetPlatformIDs(num_entries, platforms, num_platforms);
377 | if (errcode_ret != CL_SUCCESS) {
378 | fprintf(stderr, "Error: clGetPlatformIDs %s (%d)\n",
379 | INclCheckErrorCode(errcode_ret), errcode_ret);
380 | throw EXIT_FAILURE;
381 | }
382 | }
383 |
384 | // Get specific information about the OpenCL platform.
385 | void INclGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name,
386 | size_t param_value_size, void *param_value,
387 | size_t *param_value_size_ret) {
388 | cl_int errcode_ret = clGetPlatformInfo(platform, param_name, param_value_size,
389 | param_value, param_value_size_ret);
390 | if (errcode_ret != CL_SUCCESS) {
391 | fprintf(stderr, "Error: clGetPlatformInfo %s (%d)\n",
392 | INclCheckErrorCode(errcode_ret), errcode_ret);
393 | throw EXIT_FAILURE;
394 | }
395 | }
396 |
397 | // Decrements the command_queue reference count.
398 | void INclReleaseCommandQueue(cl_command_queue command_queue) {
399 | cl_int errcode_ret = clReleaseCommandQueue(command_queue);
400 | if (errcode_ret != CL_SUCCESS) {
401 | fprintf(stderr, "Error: clReleaseCommandQueue %s (%d)\n",
402 | INclCheckErrorCode(errcode_ret), errcode_ret);
403 | throw EXIT_FAILURE;
404 | }
405 | }
406 |
407 | // Decrement the context reference count.
408 | void INclReleaseContext(cl_context context) {
409 | cl_int errcode_ret = clReleaseContext(context);
410 | if (errcode_ret != CL_SUCCESS) {
411 | fprintf(stderr, "Error: clReleaseContext %s (%d)\n",
412 | INclCheckErrorCode(errcode_ret), errcode_ret);
413 | throw EXIT_FAILURE;
414 | }
415 | }
416 |
417 | // Decrements the event reference count.
418 | void INclReleaseEvent(cl_event event) {
419 | cl_int errcode_ret = clReleaseEvent(event);
420 | if (errcode_ret != CL_SUCCESS) {
421 | fprintf(stderr, "Error: clReleaseEvent %s (%d)\n",
422 | INclCheckErrorCode(errcode_ret), errcode_ret);
423 | throw EXIT_FAILURE;
424 | }
425 | }
426 |
427 | // Decrements the kernel reference count.
428 | void INclReleaseKernel(cl_kernel kernel) {
429 | cl_int errcode_ret = clReleaseKernel(kernel);
430 | if (errcode_ret != CL_SUCCESS) {
431 | fprintf(stderr, "Error: clReleaseKernel %s (%d)\n",
432 | INclCheckErrorCode(errcode_ret), errcode_ret);
433 | throw EXIT_FAILURE;
434 | }
435 | }
436 |
437 | // Decrements the memory object reference count.
438 | void INclReleaseMemObject(cl_mem memobj) {
439 | cl_int errcode_ret = clReleaseMemObject(memobj);
440 | if (errcode_ret != CL_SUCCESS) {
441 | fprintf(stderr, "Error: clReleaseMemObject %s (%d)\n",
442 | INclCheckErrorCode(errcode_ret), errcode_ret);
443 | throw EXIT_FAILURE;
444 | }
445 | }
446 |
447 | // Decrements the program reference count.
448 | void INclReleaseProgram(cl_program program) {
449 | cl_int errcode_ret = clReleaseProgram(program);
450 | if (errcode_ret != CL_SUCCESS) {
451 | fprintf(stderr, "Error: clReleaseProgram %s (%d)\n",
452 | INclCheckErrorCode(errcode_ret), errcode_ret);
453 | throw EXIT_FAILURE;
454 | }
455 | }
456 |
457 | // Used to set the argument value for a specific argument of a kernel.
458 | void INclSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size,
459 | const void *arg_value) {
460 | cl_int errcode_ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value);
461 | if (errcode_ret != CL_SUCCESS) {
462 | fprintf(stderr, "Error: clSetKernelArg %s (%d)\n",
463 | INclCheckErrorCode(errcode_ret), errcode_ret);
464 | throw EXIT_FAILURE;
465 | }
466 | }
467 |
468 | // Waits on the host thread for commands identified by event objects to
469 | // complete.
470 | void INclWaitForEvents(cl_uint num_events, const cl_event *event_list) {
471 | cl_int errcode_ret = clWaitForEvents(num_events, event_list);
472 | if (errcode_ret != CL_SUCCESS) {
473 | fprintf(stderr, "Error: clWaitForEvents %s (%d)\n",
474 | INclCheckErrorCode(errcode_ret), errcode_ret);
475 | throw EXIT_FAILURE;
476 | }
477 | }
478 |
479 | // Returns a message related to the error code.
480 | const char *INclCheckErrorCode(cl_int errcode) {
481 | switch (errcode) {
482 | case -1:
483 | return "CL_DEVICE_NOT_FOUND";
484 | case -2:
485 | return "CL_DEVICE_NOT_AVAILABLE";
486 | case -3:
487 | return "CL_COMPILER_NOT_AVAILABLE";
488 | case -4:
489 | return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
490 | case -5:
491 | return "CL_OUT_OF_RESOURCES";
492 | case -6:
493 | return "CL_OUT_OF_HOST_MEMORY";
494 | case -7:
495 | return "CL_PROFILING_INFO_NOT_AVAILABLE";
496 | case -8:
497 | return "CL_MEM_COPY_OVERLAP";
498 | case -9:
499 | return "CL_IMAGE_FORMAT_MISMATCH";
500 | case -10:
501 | return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
502 | case -11:
503 | return "CL_BUILD_PROGRAM_FAILURE";
504 | case -12:
505 | return "CL_MAP_FAILURE";
506 | case -13:
507 | return "CL_MISALIGNED_SUB_BUFFER_OFFSET";
508 | case -14:
509 | return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST";
510 | case -15:
511 | return "CL_COMPILE_PROGRAM_FAILURE";
512 | case -16:
513 | return "CL_LINKER_NOT_AVAILABLE";
514 | case -17:
515 | return "CL_LINK_PROGRAM_FAILURE";
516 | case -18:
517 | return "CL_DEVICE_PARTITION_FAILED";
518 | case -19:
519 | return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE";
520 | case -30:
521 | return "CL_INVALID_VALUE";
522 | case -31:
523 | return "CL_INVALID_DEVICE_TYPE";
524 | case -32:
525 | return "CL_INVALID_PLATFORM";
526 | case -33:
527 | return "CL_INVALID_DEVICE";
528 | case -34:
529 | return "CL_INVALID_CONTEXT";
530 | case -35:
531 | return "CL_INVALID_QUEUE_PROPERTIES";
532 | case -36:
533 | return "CL_INVALID_COMMAND_QUEUE";
534 | case -37:
535 | return "CL_INVALID_HOST_PTR";
536 | case -38:
537 | return "CL_INVALID_MEM_OBJECT";
538 | case -39:
539 | return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
540 | case -40:
541 | return "CL_INVALID_IMAGE_SIZE";
542 | case -41:
543 | return "CL_INVALID_SAMPLER";
544 | case -42:
545 | return "CL_INVALID_BINARY";
546 | case -43:
547 | return "CL_INVALID_BUILD_OPTIONS";
548 | case -44:
549 | return "CL_INVALID_PROGRAM";
550 | case -45:
551 | return "CL_INVALID_PROGRAM_EXECUTABLE";
552 | case -46:
553 | return "CL_INVALID_KERNEL_NAME";
554 | case -47:
555 | return "CL_INVALID_KERNEL_DEFINITION";
556 | case -48:
557 | return "CL_INVALID_KERNEL";
558 | case -49:
559 | return "CL_INVALID_ARG_INDEX";
560 | case -50:
561 | return "CL_INVALID_ARG_VALUE";
562 | case -51:
563 | return "CL_INVALID_ARG_SIZE";
564 | case -52:
565 | return "CL_INVALID_KERNEL_ARGS";
566 | case -53:
567 | return "CL_INVALID_WORK_DIMENSION";
568 | case -54:
569 | return "CL_INVALID_WORK_GROUP_SIZE";
570 | case -55:
571 | return "CL_INVALID_WORK_ITEM_SIZE";
572 | case -56:
573 | return "CL_INVALID_GLOBAL_OFFSET";
574 | case -57:
575 | return "CL_INVALID_EVENT_WAIT_LIST";
576 | case -58:
577 | return "CL_INVALID_EVENT";
578 | case -59:
579 | return "CL_INVALID_OPERATION";
580 | case -60:
581 | return "CL_INVALID_GL_OBJECT";
582 | case -61:
583 | return "CL_INVALID_BUFFER_SIZE";
584 | case -62:
585 | return "CL_INVALID_MIP_LEVEL";
586 | case -63:
587 | return "CL_INVALID_GLOBAL_WORK_SIZE";
588 | case -64:
589 | return "CL_INVALID_PROPERTY";
590 | case -65:
591 | return "CL_INVALID_IMAGE_DESCRIPTOR";
592 | case -66:
593 | return "CL_INVALID_COMPILER_OPTIONS";
594 | case -67:
595 | return "CL_INVALID_LINKER_OPTIONS";
596 | case -68:
597 | return "CL_INVALID_DEVICE_PARTITION_COUNT";
598 | case -69:
599 | return "CL_INVALID_PIPE_SIZE";
600 | case -70:
601 | return "CL_INVALID_DEVICE_QUEUE";
602 | default:
603 | return "CL_INVALID_ERROR_CODE";
604 | }
605 | }
606 |
--------------------------------------------------------------------------------
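All of the INcl wrappers above share one failure convention: call the underlying cl* function and, on any status other than CL_SUCCESS, print the symbolic error name via INclCheckErrorCode to stderr and throw the plain int EXIT_FAILURE. A minimal sketch of a host program that catches that failure path (the try/catch placement is illustrative; the wrapper signatures are the ones declared in INcl.h):

    #include "INcl.h"

    int main() {
      try {
        // Each call below either succeeds or prints, e.g.,
        // "Error: clCreateContext CL_INVALID_DEVICE (-33)" and throws.
        cl_platform_id platform = INclGetPlatformID();
        cl_device_id device = INclGetDeviceID(platform, 0);
        cl_context context = INclCreateContext(device);
        INclReleaseContext(context);
      } catch (int status) {
        // The wrappers throw the int EXIT_FAILURE rather than an exception
        // object, so the handler catches by int.
        return status;
      }
      return 0;
    }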