├── data
│   └── .keep
├── host_srcs
│   ├── inaccel
│   │   ├── runtime-api.h
│   │   ├── runtime-api.cpp
│   │   ├── runtime.h
│   │   └── runtime.cpp
│   ├── common
│   │   ├── INcl.h
│   │   └── INcl.cpp
│   └── LogisticRegression.cpp
├── Makefile
├── README.md
├── kernel_srcs
│   ├── Gradients_0.cpp
│   ├── Gradients_1.cpp
│   ├── Gradients_2.cpp
│   └── Gradients_3.cpp
└── LICENSE
--------------------------------------------------------------------------------
/data/.keep:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/host_srcs/inaccel/runtime-api.h:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef RUNTIME_API_H
#define RUNTIME_API_H

#include "runtime.h"

// InAccel function calls.
class InAccel {

public:
  // Creates the world.
  static cl_world create_world(int device_id);

  // Allocates a new buffer.
  static void *malloc(cl_world world, size_t size, int memory_id);

  // Transfers data to a previously allocated buffer.
  static void memcpy_to(cl_world world, void *dst_ptr, size_t offset,
                        void *src_ptr, size_t size);

  // Creates a new program.
  static void create_program(cl_world world, const char *bitstream_name);

  // Creates a new engine.
  static cl_engine create_engine(cl_world world, const char *kernel_name);

  // Sets an engine argument using a buffer.
  static void set_engine_arg(cl_engine engine, int index, void *buffer);

  // Sets an engine argument using an int value.
  static void set_engine_arg(cl_engine engine, int index, int value);

  // Sets an engine argument using a long value.
  static void set_engine_arg(cl_engine engine, int index, long value);

  // Sets an engine argument using a float value.
  static void set_engine_arg(cl_engine engine, int index, float value);

  // Sets an engine argument using a double value.
  static void set_engine_arg(cl_engine engine, int index, double value);

  // Runs an engine.
  static void run_engine(cl_engine engine);

  // Awaits an engine.
  static void await_engine(cl_engine engine);

  // Releases an engine.
  static void release_engine(cl_engine engine);

  // Releases a program.
  static void release_program(cl_world world);

  // Transfers data from a previously allocated buffer.
  static void memcpy_from(cl_world world, void *src_ptr, size_t offset,
                          void *dst_ptr, size_t size);

  // Frees a buffer.
  static void free(cl_world world, void *ptr);

  // Releases the world.
  static void release_world(cl_world world);
};

#endif
--------------------------------------------------------------------------------
/host_srcs/inaccel/runtime-api.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include "runtime-api.h"

// Creates the world.
cl_world InAccel::create_world(int device_id) {
  cl_world world = CreateWorld();

  GetPlatformID(world);

  GetDeviceID(world, (cl_uint)device_id);

  CreateContext(world);

  return world;
}

// Allocates a new buffer.
void *InAccel::malloc(cl_world world, size_t size, int memory_id) {
  return CreateBuffer(world, size, (cl_uint)memory_id);
}

// Transfers data to a previously allocated buffer.
void InAccel::memcpy_to(cl_world world, void *dst_ptr, size_t offset,
                        void *src_ptr, size_t size) {
  cl_command_queue command_queue = CreateCommandQueue(world);

  EnqueueMemcpyTo(command_queue, dst_ptr, offset, src_ptr, size);

  ReleaseCommandQueue(command_queue);
}

// Creates a new program.
void InAccel::create_program(cl_world world, const char *bitstream_name) {
  CreateProgram(world, bitstream_name);
}

// Creates a new engine.
cl_engine InAccel::create_engine(cl_world world, const char *kernel_name) {
  return CreateEngine(world, kernel_name);
}

// Sets an engine argument using a buffer.
void InAccel::set_engine_arg(cl_engine engine, int index, void *buffer) {
  SetEngineArgPointer(engine, (cl_uint)index, buffer);
}

// Sets an engine argument using an int value.
void InAccel::set_engine_arg(cl_engine engine, int index, int value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(int), &value);
}

// Sets an engine argument using a long value.
void InAccel::set_engine_arg(cl_engine engine, int index, long value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(long), &value);
}

// Sets an engine argument using a float value.
void InAccel::set_engine_arg(cl_engine engine, int index, float value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(float), &value);
}

// Sets an engine argument using a double value.
void InAccel::set_engine_arg(cl_engine engine, int index, double value) {
  SetEngineArg(engine, (cl_uint)index, sizeof(double), &value);
}

// Runs an engine.
void InAccel::run_engine(cl_engine engine) { EnqueueEngine(engine); }

// Awaits an engine.
void InAccel::await_engine(cl_engine engine) { BlockEngine(engine); }

// Releases an engine.
void InAccel::release_engine(cl_engine engine) { ReleaseEngine(engine); }

// Releases a program.
92 | void InAccel::release_program(cl_world world) { ReleaseProgram(world); } 93 | 94 | // Transfers data from a previously allocated buffer. 95 | void InAccel::memcpy_from(cl_world world, void *src_ptr, size_t offset, 96 | void *dst_ptr, size_t size) { 97 | cl_command_queue command_queue = CreateCommandQueue(world); 98 | 99 | EnqueueMemcpyFrom(command_queue, src_ptr, offset, dst_ptr, size); 100 | 101 | ReleaseCommandQueue(command_queue); 102 | } 103 | 104 | // Frees a buffer. 105 | void InAccel::free(cl_world world, void *ptr) { ReleaseBuffer(world, ptr); } 106 | 107 | // Releases the world. 108 | void InAccel::release_world(cl_world world) { 109 | ReleaseContext(world); 110 | 111 | ReleaseWorld(world); 112 | } 113 | -------------------------------------------------------------------------------- /host_srcs/inaccel/runtime.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright © 2019 InAccel 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | */ 16 | 17 | #ifndef RUNTIME_H 18 | #define RUNTIME_H 19 | 20 | #include "common/INcl.h" 21 | 22 | // Packs a world struct. 23 | cl_world PackWorld(_cl_world *_world); 24 | 25 | // Unpacks a world struct. 26 | _cl_world *UnpackWorld(cl_world world); 27 | 28 | // Packs an engine struct. 29 | cl_engine PackEngine(_cl_engine *_engine); 30 | 31 | // Unpacks an engine struct. 32 | _cl_engine *UnpackEngine(cl_engine engine); 33 | 34 | // Transforms an engine to the world. 35 | cl_world EngineToWorld(cl_engine engine); 36 | 37 | // Creates the world struct. 38 | cl_world CreateWorld(); 39 | 40 | // Obtains the platform id. 41 | void GetPlatformID(cl_world world); 42 | 43 | // Obtains the specified device id. 44 | void GetDeviceID(cl_world world, cl_uint id); 45 | 46 | // Creates the context. 47 | void CreateContext(cl_world world); 48 | 49 | // Creates a program with the specified name. 50 | void CreateProgram(cl_world world, const char *bitstream_name); 51 | 52 | // Creates a command queue. 53 | cl_command_queue CreateCommandQueue(cl_world world); 54 | 55 | // Blocks until all tasks in a command queue have been completed. 56 | void BlockCommandQueue(cl_command_queue command_queue); 57 | 58 | // Releases a command queue. 59 | void ReleaseCommandQueue(cl_command_queue command_queue); 60 | 61 | // Allocates a memory buffer. 62 | void *CreateBuffer(cl_world world, size_t size, cl_uint memory); 63 | 64 | // Enqueues a memory copy operation to device. 65 | void EnqueueMemcpyTo(cl_command_queue command_queue, void *dst_ptr, size_t offset, void *src_ptr, size_t size); 66 | 67 | // Enqueues a memory copy operation from device. 68 | void EnqueueMemcpyFrom(cl_command_queue command_queue, void *src_ptr, size_t offset, void *dst_ptr, size_t size); 69 | 70 | // Frees a memory buffer. 71 | void ReleaseBuffer(cl_world world, void *ptr); 72 | 73 | // Creates a kernel with the specified name. 74 | cl_kernel CreateKernel(cl_world world, const char *kernel_name); 75 | 76 | // Sets a pointer kernel argument. 
77 | void SetKernelArgPointer(cl_kernel kernel, cl_uint arg_index, const void *arg_value); 78 | 79 | // Sets a scalar kernel argument. 80 | void SetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value); 81 | 82 | // Enqueues a kernel operation (Task mode). 83 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel); 84 | 85 | // Enqueues a kernel operation (NDRangeKernel mode). 86 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel, const size_t *global_work_size, const size_t *local_work_size); 87 | 88 | // Releases a kernel. 89 | void ReleaseKernel(cl_kernel kernel); 90 | 91 | // Creates an engine struct with the specified name. 92 | cl_engine CreateEngine(cl_world world, const char *kernel_name); 93 | 94 | // Blocks until all tasks in an engine struct have been completed. 95 | void BlockEngine(cl_engine engine); 96 | 97 | // Sets a pointer engine struct argument. 98 | void SetEngineArgPointer(cl_engine engine, cl_uint arg_index, const void *arg_value); 99 | 100 | // Sets a scalar engine struct argument. 101 | void SetEngineArg(cl_engine engine, cl_uint arg_index, size_t arg_size, const void *arg_value); 102 | 103 | // Enqueues an engine struct operation (Task mode). 104 | void EnqueueEngine(cl_engine engine); 105 | 106 | // Enqueues an engine struct operation (NDRangeKernel mode). 107 | void EnqueueEngine(cl_engine engine, const size_t *global_work_size, const size_t *local_work_size); 108 | 109 | // Releases an engine struct. 110 | void ReleaseEngine(cl_engine engine); 111 | 112 | // Releases a program. 113 | void ReleaseProgram(cl_world world); 114 | 115 | // Releases the context. 116 | void ReleaseContext(cl_world world); 117 | 118 | // Releases the world struct. 119 | void ReleaseWorld(cl_world world); 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ifndef XILINX_SDX 2 | $(error XILINX_SDX is not set) 3 | endif 4 | 5 | ifndef AWS_PLATFORM 6 | $(error AWS_PLATFORM is not set) 7 | endif 8 | 9 | # Host compiler global settings 10 | CC = g++ -O3 -Wno-deprecated-declarations 11 | 12 | CLCC = xocc 13 | 14 | BITSTREAM_NAME = Gradients 15 | HOST_EXE = ${BITSTREAM_NAME} 16 | 17 | PLATFORM = ${AWS_PLATFORM} 18 | 19 | HOST_DIR = host_srcs 20 | KERNEL_DIR = kernel_srcs 21 | KERNEL_TYPE = cpp 22 | 23 | # Host and Kernel sources 24 | HOST_SRCS = $(wildcard $(HOST_DIR)/*/*.cpp) $(wildcard $(HOST_DIR)/*.cpp) 25 | KERNEL_SRCS_CPP = $(wildcard $(KERNEL_DIR)/*.cpp) 26 | 27 | HOST_OBJECTS := $(HOST_SRCS:.cpp=.o) 28 | KERNEL_OBJECTS := $(KERNEL_SRCS_CPP:.cpp=.xo) 29 | ESTIMATE_OBJCTS := $(KERNEL_SRCS_CPP:.cpp=.estimate) 30 | 31 | # Include Libraries 32 | HOST_CFLAGS = -O3 -Wall -I${XILINX_SDX}/runtime/include/1_2 -Ihost_srcs 33 | HOST_LFLAGS = -L${XILINX_XRT}/lib -lxilinxopencl 34 | 35 | # Connecting kernels to specific memory banks 36 | BANKS = --sp Gradients_0_1.m_axi_gmem0:bank0 --sp Gradients_0_1.m_axi_gmem1:bank0 --sp Gradients_0_1.m_axi_gmem2:bank0 --sp Gradients_0_1.m_axi_gmem3:bank0 --sp Gradients_1_1.m_axi_gmem0:bank1 --sp Gradients_1_1.m_axi_gmem1:bank1 --sp Gradients_1_1.m_axi_gmem2:bank1 --sp Gradients_1_1.m_axi_gmem3:bank1 --sp Gradients_2_1.m_axi_gmem0:bank2 --sp Gradients_2_1.m_axi_gmem1:bank2 --sp Gradients_2_1.m_axi_gmem2:bank2 --sp Gradients_2_1.m_axi_gmem3:bank2 --sp Gradients_3_1.m_axi_gmem0:bank3 --sp Gradients_3_1.m_axi_gmem1:bank3 --sp 
Gradients_3_1.m_axi_gmem2:bank3 --sp Gradients_3_1.m_axi_gmem3:bank3

# Additional Vivado options
VIVADO_OPTS = --xp misc:enableGlobalHoldIter="True" --xp vivado_prop:run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=NoTimingRelaxation

SDA_FLOW = sw_emu
ifeq (${SDA_FLOW},sw_emu)
    TARGET = -t sw_emu
else ifeq (${SDA_FLOW},hw_emu)
    TARGET = -t hw_emu
else ifeq (${SDA_FLOW},hw)
    TARGET = -t hw
endif

all:
	make _TEST_="-D _TEST_" host

host: ${HOST_EXE}

xbin_sw_em:
	@+make SDA_FLOW=sw_emu xbin

xbin_hw_em:
	@+make SDA_FLOW=hw_emu xbin

xbin_hw:
	@+make SDA_FLOW=hw xbin

run_sw_em:
	@+make SDA_FLOW=sw_emu run_sem

run_hw_em:
	@+make SDA_FLOW=hw_emu run_hem

run_sem: xconfig host xbin
	XCL_EMULATION_MODE=sw_emu ./${HOST_EXE} 1

run_hem: xconfig host xbin
	XCL_EMULATION_MODE=hw_emu ./${HOST_EXE} 1

xconfig:
	emconfigutil --platform ${PLATFORM} --od . --nd 1

# Building host
${HOST_EXE}: ${HOST_OBJECTS}
	${CC} ${HOST_OBJECTS} ${HOST_LFLAGS} -o $@
	${RM} -rf ${HOST_OBJECTS}

xbin: ${KERNEL_OBJECTS}
	${CLCC} ${TARGET} --link -s --platform ${PLATFORM} ${VIVADO_OPTS} ${BANKS} ${KERNEL_OBJECTS} -o ${BITSTREAM_NAME}.xclbin
	${RM} -rf ${KERNEL_OBJECTS}

estimate: ${ESTIMATE_OBJCTS}
	${RM} -rf $(patsubst %.estimate,%.xo,$(ESTIMATE_OBJCTS))

%.o: %.cpp
	${CC} ${_TEST_} ${HOST_CFLAGS} -c $< -o $@

# Building kernel
%.xo: %.cpp
	${CLCC} ${TARGET} --save-temps --platform ${PLATFORM} --kernel $(notdir $(basename $<)) -c $< -o $@

%.estimate: %.${KERNEL_TYPE}
	${CLCC} --target hw_emu --report_level estimate --save-temps --platform ${PLATFORM} --kernel $(notdir $(basename $<)) -c $< -o $(basename $<).xo

clean:
	${RM} -rf ${HOST_EXE} $(patsubst %.estimate,%.xo,$(ESTIMATE_OBJCTS)) ${KERNEL_OBJECTS} ${HOST_OBJECTS} emconfig.json *.log *.dir *.xml *.dcp *.dat _sds iprepo *.tcl xilinx_aws-vu9p-f1_dynamic_5_0.hpfm .Xil sdaccel_* system_estimate.xtxt _x top_sp.ltx

cleanall: clean
	${RM} -rf ${BITSTREAM_NAME}*

help:
	@echo "Compile and run CPU emulation"
	@echo "make run_sw_em"
	@echo ""
	@echo "Compile and run hardware emulation"
	@echo "make run_hw_em"
	@echo ""
	@echo "Compile host executable only"
	@echo "make host"
	@echo ""
	@echo "Compile host executable only for SW version"
	@echo "make"
	@echo ""
	@echo "Compile .xclbin file for system run only"
	@echo "make xbin_hw"
	@echo ""
	@echo "Compile .xclbin file for sw emulation"
	@echo "make xbin_sw_em"
	@echo ""
	@echo "Compile .xclbin file for hw emulation"
	@echo "make xbin_hw_em"
	@echo ""
	@echo "Clean working directory"
	@echo "make clean"
	@echo ""
	@echo "Clean working directory and bitstream files"
	@echo "make cleanall"
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------


InAccel

# Logistic Regression IP core

This is an FPGA-accelerated solution for the Logistic Regression BGD algorithm. It can provide up to **70x** speedup compared to a single-threaded execution and up to **12x** compared to an 8-threaded Intel Xeon CPU execution.

## Specifications

| Classes  | Features   |
| :------: | :--------: |
| up to 64 | up to 2047 |

## Supported Platforms

| Board                       |
| :-------------------------: |
| [Xilinx Alveo U200](https://www.xilinx.com/products/boards-and-kits/alveo/u200.html) |
| [Xilinx Alveo U250](https://www.xilinx.com/products/boards-and-kits/alveo/u250.html) |
| [AWS VU9P (F1 instances)](https://aws.amazon.com/ec2/instance-types/f1/) |
| Alibaba VU9P (F3 instances) |
| Any other Xilinx platform with at least the same amount of VU9P resources |

## Design Files

- The application code is located in the host_srcs directory. Accelerator kernel files are located under the kernel_srcs directory, while any accelerator binaries will be compiled to the current directory.
- The Makefile will help you generate the host executable and the accelerator _.xclbin_ files.

A listing of all the files in this repository is shown below:

- Makefile
- host_srcs/
  - LogisticRegression.cpp
  - common/
    - INcl.cpp (OpenCL wrapper functions)
    - INcl.h
  - inaccel/
    - runtime-api.cpp (InAccel runtime abstraction layer)
    - runtime-api.h
    - runtime.cpp (InAccel runtime abstraction layer)
    - runtime.h
- kernel_srcs/
  - Gradients_0.cpp (Accelerated kernel)
  - Gradients_1.cpp (Accelerated kernel)
  - Gradients_2.cpp (Accelerated kernel)
  - Gradients_3.cpp (Accelerated kernel)
- data/

## Preparation

**!** Before invoking any of the Makefile targets make sure you have sourced the Xilinx **XRT** setup script.
**!** Make sure you have set the **XILINX_SDX** environment variable pointing to the SDx installation directory.

As far as the **platform** (or board) is concerned, the Makefile uses the **AWS_PLATFORM** environment variable as the target platform for the kernel compilation. If you are running this on AWS, make sure the AWS_PLATFORM environment variable is present and points to the platform DSA files¹. Otherwise you can set the Makefile `PLATFORM` variable to point to your platform DSA files.

1. To obtain the AWS platform DSA files make sure you have cloned the aws-fpga GitHub repository.

Download the NIST letters train and test datasets to the data directory. Navigate to the data directory and execute the following commands:

``` bash
wget https://s3.amazonaws.com/inaccel-demo/data/nist/letters_csv_train.dat
wget https://s3.amazonaws.com/inaccel-demo/data/nist/letters_csv_test.dat
```

## Compiling the kernels

To compile the kernels for the hardware target you just need to execute `make xbin_hw`, while for software and hardware emulation you must execute `make xbin_sw_em` and `make xbin_hw_em` respectively.
A full list of all the available Makefile targets can be printed with the `make help` command.

## Single-thread - Single-application Execution

To test the generated xclbin file you can simply run the `make host` command to create the host application. The host application takes only one input argument, the number of iterations.
Example execution: `./Gradients 100`
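
The host source itself (`LogisticRegression.cpp`) is not reproduced in this listing, but the call sequence it has to follow is fully determined by the `InAccel` wrapper in `host_srcs/inaccel/runtime-api.h` and the kernel signature `Gradients_*(_labels, _data, _weights, _gradients, numClasses, numFeatures, chunkSize)`. The sketch below shows that flow for a single `Gradients_0` engine; the dataset dimensions, the `Gradients.xclbin` file name and the omitted host-side arrays are illustrative assumptions, not code taken from this repository:

```cpp
#include "inaccel/runtime-api.h"

int main() {
  // Illustrative sizes: 26 classes (letters), 784 features, 1024 points
  // (chunkSize must be a multiple of 8, the kernel's batch size).
  const int numClasses = 26, numFeatures = 784, chunkSize = 1024;

  // One 512-bit "float16" word packs 16 floats (64 bytes per word).
  size_t wordsPerPoint = ((size_t)numFeatures + 1 + 15) / 16;
  size_t dataBytes     = (size_t)chunkSize * wordsPerPoint * 64;
  size_t labelsBytes   = (size_t)chunkSize * 4;  // one 32-bit label per point
  size_t modelBytes    = (size_t)numClasses * wordsPerPoint * 64;

  cl_world world = InAccel::create_world(0);           // device 0
  InAccel::create_program(world, "Gradients.xclbin");  // bitstream name assumed

  // Gradients_0's AXI ports are all wired to bank 0 (see the Makefile --sp switches).
  void *labels    = InAccel::malloc(world, labelsBytes, 0);
  void *data      = InAccel::malloc(world, dataBytes, 0);
  void *weights   = InAccel::malloc(world, modelBytes, 0);
  void *gradients = InAccel::malloc(world, modelBytes, 0);

  // InAccel::memcpy_to(world, labels, 0, host_labels, labelsBytes);  // etc.

  cl_engine engine = InAccel::create_engine(world, "Gradients_0");
  InAccel::set_engine_arg(engine, 0, labels);
  InAccel::set_engine_arg(engine, 1, data);
  InAccel::set_engine_arg(engine, 2, weights);
  InAccel::set_engine_arg(engine, 3, gradients);
  InAccel::set_engine_arg(engine, 4, numClasses);
  InAccel::set_engine_arg(engine, 5, numFeatures);
  InAccel::set_engine_arg(engine, 6, chunkSize);

  InAccel::run_engine(engine);
  InAccel::await_engine(engine);

  // InAccel::memcpy_from(world, gradients, 0, host_gradients, modelBytes);

  InAccel::release_engine(engine);
  InAccel::release_program(world);
  InAccel::free(world, labels);
  InAccel::free(world, data);
  InAccel::free(world, weights);
  InAccel::free(world, gradients);
  InAccel::release_world(world);
}
```

Because the Makefile binds each `Gradients_N` kernel to its own DDR bank, the same pattern extends naturally to four engines working on independent dataset chunks in parallel, which is the reason the design ships four identical kernels.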

## Scaling Up and Out with InAccel Coral

InAccel Coral

The above example application spawns a single thread and can train a model using a single FPGA device, which **is not viable for datacenter-scale needs**. Data scientists rely on frameworks like Scikit-learn and Apache Spark to create and test their machine learning pipelines.
**InAccel Coral** FPGA resource manager is able to automatically **scale** and **schedule** any acceleration requests to a **cluster of FPGAs**, perform **load balancing**, **reconfigure** the FPGA devices, perform **memory management** etc., while providing a simple-to-use **high-level API** in Java, C++ and Python.
We also offer ready-to-use **integrations** with broadly used open source frameworks like Apache Spark to seamlessly accelerate your pipelines.
Finally, Coral is fully compatible with **Kubernetes**: using InAccel's device plugin you can set up a Kubernetes cluster that is aware of hardware-accelerated resources, or take advantage of a **serverless architecture** and provide accelerated serverless solutions to your own customers.

* You can **create a free InAccel Coral license** [here](https://www.inaccel.com/license/).
* You can **download** the InAccel Coral docker image from [dockerhub](https://hub.docker.com/r/inaccel/coral).
* You can find **full documentation** as well as a **quick starting guide** in [InAccel Docs](https://docs.inaccel.com/).
--------------------------------------------------------------------------------
/host_srcs/common/INcl.h:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef INCL_H
#define INCL_H

#include <CL/opencl.h>
#include <stdint.h>

// InAccelCL world struct (Type).
typedef struct {
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_context context;
  cl_program program;
} _cl_world;

// InAccelCL world struct (API Type).
typedef uintptr_t cl_world;

// InAccelCL engine struct (Type).
typedef struct {
  cl_world world;

  cl_command_queue command_queue;
  cl_kernel kernel;
} _cl_engine;

// InAccelCL engine struct (API Type).
typedef uintptr_t cl_engine;

// Builds a program executable from the program binary.
void INclBuildProgram(cl_program program);

// Creates a buffer object.
cl_mem INclCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr);

// Creates a command queue on a specific device.
cl_command_queue INclCreateCommandQueue(cl_context context, cl_device_id device);

// Creates an OpenCL context.
cl_context INclCreateContext(const cl_device_id device);

// Creates a kernel object.
58 | cl_kernel INclCreateKernel(cl_program program, const char *kernel_name); 59 | 60 | // Creates a program object for a context, and loads specified binary data into the program object. 61 | cl_program INclCreateProgramWithBinary(cl_context context, cl_uint num_devices, const cl_device_id *device_list, const char *binary_name); 62 | 63 | // Enqueues a command to map a region of the buffer object given by buffer into the host address space and returns a pointer to this mapped region. 64 | void *INclEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, cl_map_flags map_flags, size_t cb, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 65 | 66 | // Enqueues a command to indicate which device a set of memory objects should be associated with. 67 | void INclEnqueueMigrateMemObjects(cl_command_queue command_queue, cl_uint num_mem_objects, const cl_mem *mem_objects, cl_mem_migration_flags flags, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 68 | 69 | // Enqueues a command to execute a kernel on a device. 70 | void INclEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 71 | 72 | // Enqueue commands to read from a buffer object to host memory. 73 | void INclEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, size_t offset, size_t cb, void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 74 | 75 | // Enqueues a command to execute a kernel on a device. 76 | void INclEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 77 | 78 | // Enqueue commands to write to a buffer object from host memory. 79 | void INclEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, size_t offset, size_t cb, const void *ptr, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event); 80 | 81 | // Blocks until all previously queued OpenCL commands in a command-queue are issued to the associated device and have completed. 82 | void INclFinish(cl_command_queue command_queue); 83 | 84 | // Issues all previously queued OpenCL commands in a command-queue to the device associated with the command-queue. 85 | void INclFlush(cl_command_queue command_queue); 86 | 87 | // Obtain specified device, if available. 88 | cl_device_id INclGetDeviceID(cl_platform_id platform, cl_uint id); 89 | 90 | // Obtain the list of devices available on a platform. 91 | void INclGetDeviceIDs(cl_platform_id platform, cl_uint num_entries, cl_device_id *devices, cl_uint *num_devices); 92 | 93 | // Get specific information about the OpenCL device. 94 | void INclGetDeviceInfo(cl_device_id device, cl_device_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); 95 | 96 | // Obtain platform, if available. 97 | cl_platform_id INclGetPlatformID(); 98 | 99 | // Obtain the list of platforms available. 100 | void INclGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, cl_uint *num_platforms); 101 | 102 | // Get specific information about the OpenCL platform. 103 | void INclGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret); 104 | 105 | // Decrements the command_queue reference count. 
void INclReleaseCommandQueue(cl_command_queue command_queue);

// Decrements the context reference count.
void INclReleaseContext(cl_context context);

// Decrements the event reference count.
void INclReleaseEvent(cl_event event);

// Decrements the kernel reference count.
void INclReleaseKernel(cl_kernel kernel);

// Decrements the memory object reference count.
void INclReleaseMemObject(cl_mem memobj);

// Decrements the program reference count.
void INclReleaseProgram(cl_program program);

// Sets the argument value for a specific argument of a kernel.
void INclSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value);

// Waits on the host thread for commands identified by event objects to complete.
void INclWaitForEvents(cl_uint num_events, const cl_event *event_list);

// Returns a message related to the error code.
const char *INclCheckErrorCode(cl_int errcode);

#endif
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_0.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.
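// Data layout note: a "float16" is an ap_int<512> word that packs 16 IEEE-754
// single-precision values (a "float8" packs 8), so every AXI beat moves a full
// 64-byte vector. The .range((t + 1) * 32 - 1, t * 32) slices below extract
// one 32-bit lane from such a word, and the int/float unions (converter1,
// converter2) reinterpret those raw bits as floats without a numeric
// conversion.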
36 | 37 | extern "C" { 38 | void Gradients_0(float8 *_labels, float16 *_data, float16 *_weights, 39 | float16 *_gradients, int numClasses, int numFeatures, 40 | int chunkSize) { 41 | 42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0 43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1 44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2 45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3 46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control 47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control 48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control 49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control 50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control 51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control 52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control 53 | #pragma HLS INTERFACE s_axilite port = return bundle = control 54 | 55 | float16 features[chunk][numFeaturesPlusOneMax], 56 | weights[numClassesMax][numFeaturesPlusOneMax], 57 | gradients[numClassesMax][numFeaturesPlusOneMax]; 58 | float lin[numClassesMax][chunk * vectorSize]; 59 | float prd[chunk][numClassesMax]; 60 | 61 | // Using URAMs for features, weights and gradients buffers 62 | #pragma HLS resource variable = features core = XPM_MEMORY uram 63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram 64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram 65 | 66 | // Partitioning the local arrays 67 | #pragma HLS array_partition variable = features complete dim = 1 68 | #pragma HLS array_partition variable = lin complete dim = 2 69 | #pragma HLS array_partition variable = prd complete dim = 1 70 | 71 | // Compute the number of features iterations for float16 input data 72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 
13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32); 168 | converter2.asFloat += prd[c][k] * converter1.asFloat; 169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt; 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | // Write back gradients 177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 178 | kj++, j++) { 179 | #pragma HLS pipeline II = 1 180 | if (j == numFeaturesPlusOne) { 181 | j = 0; 182 | k++; 183 | } 184 | _gradients[kj] = 
gradients[k][j];
  }
}
}
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_1.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.

extern "C" {
void Gradients_1(float8 *_labels, float16 *_data, float16 *_weights,
                 float16 *_gradients, int numClasses, int numFeatures,
                 int chunkSize) {

#pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
#pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
#pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
#pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
#pragma HLS INTERFACE s_axilite port = _labels bundle = control
#pragma HLS INTERFACE s_axilite port = _data bundle = control
#pragma HLS INTERFACE s_axilite port = _weights bundle = control
#pragma HLS INTERFACE s_axilite port = _gradients bundle = control
#pragma HLS INTERFACE s_axilite port = numClasses bundle = control
#pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
#pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control

  float16 features[chunk][numFeaturesPlusOneMax],
      weights[numClassesMax][numFeaturesPlusOneMax],
      gradients[numClassesMax][numFeaturesPlusOneMax];
  float lin[numClassesMax][chunk * vectorSize];
  float prd[chunk][numClassesMax];

// Using URAMs for features, weights and gradients buffers
#pragma HLS resource variable = features core = XPM_MEMORY uram
#pragma HLS resource variable = weights core = XPM_MEMORY uram
#pragma HLS resource variable = gradients core = XPM_MEMORY uram

// Partitioning the local arrays
#pragma HLS array_partition variable = features complete dim = 1
#pragma HLS array_partition variable = lin complete dim = 2
#pragma HLS array_partition variable = prd complete dim = 1

  // Compute the number of features iterations for float16 input data
  // (e.g.
numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32); 168 | converter2.asFloat += prd[c][k] * converter1.asFloat; 169 | gradients[k][j].range((t 
+ 1) * 32 - 1, t * 32) = converter2.asInt;
          }
        }
      }
    }
  }

  // Write back gradients
  for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
       kj++, j++) {
#pragma HLS pipeline II = 1
    if (j == numFeaturesPlusOne) {
      j = 0;
      k++;
    }
    _gradients[kj] = gradients[k][j];
  }
}
}
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_2.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.

extern "C" {
void Gradients_2(float8 *_labels, float16 *_data, float16 *_weights,
                 float16 *_gradients, int numClasses, int numFeatures,
                 int chunkSize) {

#pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0
#pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1
#pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2
#pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3
#pragma HLS INTERFACE s_axilite port = _labels bundle = control
#pragma HLS INTERFACE s_axilite port = _data bundle = control
#pragma HLS INTERFACE s_axilite port = _weights bundle = control
#pragma HLS INTERFACE s_axilite port = _gradients bundle = control
#pragma HLS INTERFACE s_axilite port = numClasses bundle = control
#pragma HLS INTERFACE s_axilite port = numFeatures bundle = control
#pragma HLS INTERFACE s_axilite port = chunkSize bundle = control
#pragma HLS INTERFACE s_axilite port = return bundle = control

  float16 features[chunk][numFeaturesPlusOneMax],
      weights[numClassesMax][numFeaturesPlusOneMax],
      gradients[numClassesMax][numFeaturesPlusOneMax];
  float lin[numClassesMax][chunk * vectorSize];
  float prd[chunk][numClassesMax];

// Using URAMs for features, weights and gradients buffers
#pragma HLS resource variable = features core = XPM_MEMORY uram
#pragma HLS resource variable = weights core = XPM_MEMORY uram
#pragma HLS resource variable = gradients core = XPM_MEMORY uram

// Partitioning the local arrays
#pragma HLS array_partition variable = features complete dim = 1
#pragma HLS array_partition variable = lin complete dim = 2
#pragma HLS array_partition variable = prd
complete dim = 1 70 | 71 | // Compute the number of features iterations for float16 input data 72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) 
* 32 - 1, t * 32);
            converter2.asFloat += prd[c][k] * converter1.asFloat;
            gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt;
          }
        }
      }
    }
  }

  // Write back gradients
  for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne;
       kj++, j++) {
#pragma HLS pipeline II = 1
    if (j == numFeaturesPlusOne) {
      j = 0;
      k++;
    }
    _gradients[kj] = gradients[k][j];
  }
}
}
--------------------------------------------------------------------------------
/kernel_srcs/Gradients_3.cpp:
--------------------------------------------------------------------------------
/*
Copyright © 2019 InAccel

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#include <ap_int.h>
#include <math.h>

#define chunk 8
#define numClassesMax 64
#define numFeaturesPlusOneMax 128
#define vectorSize 16

typedef ap_int<256> float8;
typedef ap_int<512> float16;

union {
  int asInt;
  float asFloat;
} converter1, converter2;

// This function represents a Logistic Regression HLS kernel.
// The kernel is able to train a model of up to 64 classes and 2047 features.
// Maximum bandwidth is used for the M_AXI interfaces where applicable.
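// Note: Gradients_0 through Gradients_3 are four identical replicas of this
// kernel. Per the --sp switches in the Makefile, each replica's AXI masters
// are wired to a different DDR bank (bank0 through bank3), so the host can
// run all four engines concurrently on independent chunks of the dataset.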
36 | 37 | extern "C" { 38 | void Gradients_3(float8 *_labels, float16 *_data, float16 *_weights, 39 | float16 *_gradients, int numClasses, int numFeatures, 40 | int chunkSize) { 41 | 42 | #pragma HLS INTERFACE m_axi port = _labels offset = slave bundle = gmem0 43 | #pragma HLS INTERFACE m_axi port = _data offset = slave bundle = gmem1 44 | #pragma HLS INTERFACE m_axi port = _weights offset = slave bundle = gmem2 45 | #pragma HLS INTERFACE m_axi port = _gradients offset = slave bundle = gmem3 46 | #pragma HLS INTERFACE s_axilite port = _labels bundle = control 47 | #pragma HLS INTERFACE s_axilite port = _data bundle = control 48 | #pragma HLS INTERFACE s_axilite port = _weights bundle = control 49 | #pragma HLS INTERFACE s_axilite port = _gradients bundle = control 50 | #pragma HLS INTERFACE s_axilite port = numClasses bundle = control 51 | #pragma HLS INTERFACE s_axilite port = numFeatures bundle = control 52 | #pragma HLS INTERFACE s_axilite port = chunkSize bundle = control 53 | #pragma HLS INTERFACE s_axilite port = return bundle = control 54 | 55 | float16 features[chunk][numFeaturesPlusOneMax], 56 | weights[numClassesMax][numFeaturesPlusOneMax], 57 | gradients[numClassesMax][numFeaturesPlusOneMax]; 58 | float lin[numClassesMax][chunk * vectorSize]; 59 | float prd[chunk][numClassesMax]; 60 | 61 | // Using URAMs for features, weights and gradients buffers 62 | #pragma HLS resource variable = features core = XPM_MEMORY uram 63 | #pragma HLS resource variable = weights core = XPM_MEMORY uram 64 | #pragma HLS resource variable = gradients core = XPM_MEMORY uram 65 | 66 | // Partitioning the local arrays 67 | #pragma HLS array_partition variable = features complete dim = 1 68 | #pragma HLS array_partition variable = lin complete dim = 2 69 | #pragma HLS array_partition variable = prd complete dim = 1 70 | 71 | // Compute the number of features iterations for float16 input data 72 | // (e.g. numFeatures = 31 -> (numFeatures + 1) = 16 -> numFeaturesPlusOne = 73 | // 2) 74 | int numFeaturesPlusOne = 75 | (((numFeatures + 1) + (vectorSize - 1)) & (~(vectorSize - 1))) >> 4; 76 | // Defining a minimum of 13 classes in numClassesMin. It will be used to avoid 77 | // dependencies in some loops 78 | int numClassesMin = (13 > numClasses) ? 
13 : numClasses; 79 | 80 | int c, i, j, k, t; 81 | 82 | // Reading weights and filling gradients with zeros 83 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 84 | kj++, j++) { 85 | #pragma HLS pipeline II = 1 86 | if (j == numFeaturesPlusOne) { 87 | j = 0; 88 | k++; 89 | } 90 | weights[k][j] = _weights[kj]; 91 | gradients[k][j] = 0; 92 | } 93 | 94 | // Iterate over the points of the dataset each time reading a batch of 8 95 | // points 96 | for (i = 0; i < (chunkSize / chunk); i++) { 97 | int offset = (i * chunk) * numFeaturesPlusOne; 98 | 99 | // Reading the features of the dataset 100 | for (int cj = 0, c = 0, j = 0; cj < chunk * numFeaturesPlusOne; cj++, j++) { 101 | #pragma HLS pipeline II = 1 102 | if (j == numFeaturesPlusOne) { 103 | j = 0; 104 | c++; 105 | } 106 | features[c][j] = _data[offset + cj]; 107 | } 108 | 109 | // Computing the algorithm's dot product 110 | for (k = 0; k < numClasses; k++) { 111 | #pragma HLS pipeline II = 1 112 | for (c = 0; c < chunk; c++) { 113 | for (t = 0; t < vectorSize; t++) { 114 | converter1.asInt = features[c][0].range((t + 1) * 32 - 1, t * 32); 115 | converter2.asInt = weights[k][0].range((t + 1) * 32 - 1, t * 32); 116 | lin[k][c * vectorSize + t] = converter1.asFloat * converter2.asFloat; 117 | } 118 | } 119 | } 120 | 121 | for (j = 1; j < numFeaturesPlusOne; j++) { 122 | for (k = 0; k < numClassesMin; k++) { 123 | #pragma HLS pipeline II = 1 124 | for (c = 0; c < chunk; c++) { 125 | for (t = 0; t < vectorSize; t++) { 126 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 127 | converter2.asInt = weights[k][j].range((t + 1) * 32 - 1, t * 32); 128 | lin[k][c * vectorSize + t] += 129 | converter1.asFloat * converter2.asFloat; 130 | } 131 | } 132 | } 133 | } 134 | 135 | for (k = 0; k < numClasses; k++) { 136 | #pragma HLS pipeline II = 1 137 | for (c = 0; c < chunk; c++) { 138 | prd[c][k] = 139 | 1.0 / 140 | (1.0 + 141 | exp(-(lin[k][c * vectorSize] + lin[k][c * vectorSize + 1] + 142 | lin[k][c * vectorSize + 2] + lin[k][c * vectorSize + 3] + 143 | lin[k][c * vectorSize + 4] + lin[k][c * vectorSize + 5] + 144 | lin[k][c * vectorSize + 6] + lin[k][c * vectorSize + 7] + 145 | lin[k][c * vectorSize + 8] + lin[k][c * vectorSize + 9] + 146 | lin[k][c * vectorSize + 10] + lin[k][c * vectorSize + 11] + 147 | lin[k][c * vectorSize + 12] + lin[k][c * vectorSize + 13] + 148 | lin[k][c * vectorSize + 14] + lin[k][c * vectorSize + 15]))); 149 | } 150 | } 151 | 152 | // Reading the dataset labels and update predictions 153 | float8 labels = _labels[i]; 154 | for (c = 0; c < chunk; c++) { 155 | #pragma HLS unroll 156 | int label = labels.range((c + 1) * 32 - 1, c * 32); 157 | prd[c][label] -= 1.0; 158 | } 159 | 160 | // Compute the output gradients 161 | for (j = 0; j < numFeaturesPlusOne; j++) { 162 | for (k = 0; k < numClassesMin; k++) { 163 | #pragma HLS pipeline II = 1 164 | for (c = 0; c < chunk; c++) { 165 | for (t = 0; t < vectorSize; t++) { 166 | converter1.asInt = features[c][j].range((t + 1) * 32 - 1, t * 32); 167 | converter2.asInt = gradients[k][j].range((t + 1) * 32 - 1, t * 32); 168 | converter2.asFloat += prd[c][k] * converter1.asFloat; 169 | gradients[k][j].range((t + 1) * 32 - 1, t * 32) = converter2.asInt; 170 | } 171 | } 172 | } 173 | } 174 | } 175 | 176 | // Write back gradients 177 | for (int kj = 0, k = 0, j = 0; kj < numClasses * numFeaturesPlusOne; 178 | kj++, j++) { 179 | #pragma HLS pipeline II = 1 180 | if (j == numFeaturesPlusOne) { 181 | j = 0; 182 | k++; 183 | } 184 | _gradients[kj] = 
gradients[k][j];
185 | }
186 | }
187 | }
188 |
-------------------------------------------------------------------------------- /host_srcs/inaccel/runtime.cpp: --------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include "runtime.h"
18 |
19 | // Packs a world struct.
20 | cl_world PackWorld(_cl_world *_world) { return (cl_world)_world; }
21 |
22 | // Unpacks a world struct.
23 | _cl_world *UnpackWorld(cl_world world) { return (_cl_world *)world; }
24 |
25 | // Packs an engine struct.
26 | cl_engine PackEngine(_cl_engine *_engine) { return (cl_engine)_engine; }
27 |
28 | // Unpacks an engine struct.
29 | _cl_engine *UnpackEngine(cl_engine engine) { return (_cl_engine *)engine; }
30 |
31 | // Returns the world associated with an engine.
32 | cl_world EngineToWorld(cl_engine engine) { return UnpackEngine(engine)->world; }
33 |
34 | // Creates the world struct.
35 | cl_world CreateWorld() {
36 | _cl_world *_world = (_cl_world *)malloc(sizeof(_cl_world));
37 |
38 | return PackWorld(_world);
39 | }
40 |
41 | // Obtains the platform id.
42 | void GetPlatformID(cl_world world) {
43 | _cl_world *_world = UnpackWorld(world);
44 |
45 | _world->platform_id = INclGetPlatformID();
46 | }
47 |
48 | // Obtains the specified device id.
49 | void GetDeviceID(cl_world world, cl_uint id) {
50 | _cl_world *_world = UnpackWorld(world);
51 |
52 | _world->device_id = INclGetDeviceID(_world->platform_id, id);
53 | }
54 |
55 | // Creates the context.
56 | void CreateContext(cl_world world) {
57 | _cl_world *_world = UnpackWorld(world);
58 |
59 | _world->context = INclCreateContext(_world->device_id);
60 | }
61 |
62 | // Creates a program with the specified name.
63 | void CreateProgram(cl_world world, const char *bitstream_name) {
64 | _cl_world *_world = UnpackWorld(world);
65 |
66 | _world->program = INclCreateProgramWithBinary(
67 | _world->context, 1, &_world->device_id, bitstream_name);
68 |
69 | INclBuildProgram(_world->program);
70 | }
71 |
72 | // Creates a command queue.
73 | cl_command_queue CreateCommandQueue(cl_world world) {
74 | _cl_world *_world = UnpackWorld(world);
75 |
76 | return INclCreateCommandQueue(_world->context, _world->device_id);
77 | }
78 |
79 | // Blocks until all tasks in a command queue have been completed.
80 | void BlockCommandQueue(cl_command_queue command_queue) {
81 | INclFlush(command_queue);
82 | INclFinish(command_queue);
83 | }
84 |
85 | // Releases a command queue.
86 | void ReleaseCommandQueue(cl_command_queue command_queue) {
87 | BlockCommandQueue(command_queue);
88 |
89 | INclReleaseCommandQueue(command_queue);
90 | }
91 |
92 | // Allocates a memory buffer.
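// The implementation below relies on the Xilinx OpenCL memory extensions: a
// locally re-declared cl_mem_ext_ptr_t struct is passed as the host_ptr
// argument, and CL_MEM_EXT_PTR (bit 31) is added to the buffer flags so the
// runtime treats that pointer as an extension descriptor rather than as host
// memory. The (1 << memory) value selects the DDR bank the buffer lives in,
// mirroring the XCL_MEM_DDR_BANKn flags of the vendor's cl_ext.h -- a sketch
// of the mapping, assuming the standard XRT definitions:
//
//   memory = 0  ->  flags = 1 << 0 = 0x1  (XCL_MEM_DDR_BANK0)
//   memory = 3  ->  flags = 1 << 3 = 0x8  (XCL_MEM_DDR_BANK3)
//
// This is how the host code pins each kernel's buffers to a separate bank
// (InAccel::malloc(world, size, i) with i = 0..3).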
93 | void *CreateBuffer(cl_world world, size_t size, cl_uint memory) { 94 | _cl_world *_world = UnpackWorld(world); 95 | 96 | cl_uint CL_MEM_EXT_PTR = 1 << 31; 97 | 98 | typedef struct { 99 | unsigned flags; 100 | void *obj; 101 | void *param; 102 | } cl_mem_ext_ptr_t; 103 | 104 | cl_uint CL_MEMORY = 1 << memory; 105 | 106 | cl_mem_ext_ptr_t buffer; 107 | buffer.flags = CL_MEMORY; 108 | buffer.obj = NULL; 109 | buffer.param = 0; 110 | 111 | return (void *)INclCreateBuffer( 112 | _world->context, CL_MEM_READ_WRITE | CL_MEM_EXT_PTR, size, &buffer); 113 | } 114 | 115 | // Enqueues a memory copy operation to device. 116 | void EnqueueMemcpyTo(cl_command_queue command_queue, void *dst_ptr, 117 | size_t offset, void *src_ptr, size_t size) { 118 | INclEnqueueWriteBuffer(command_queue, (cl_mem)dst_ptr, offset, size, src_ptr, 119 | 0, NULL, NULL); 120 | } 121 | 122 | // Enqueues a memory copy operation from device. 123 | void EnqueueMemcpyFrom(cl_command_queue command_queue, void *src_ptr, 124 | size_t offset, void *dst_ptr, size_t size) { 125 | INclEnqueueReadBuffer(command_queue, (cl_mem)src_ptr, offset, size, dst_ptr, 126 | 0, NULL, NULL); 127 | } 128 | 129 | // Frees a memory buffer. 130 | void ReleaseBuffer(cl_world world, void *ptr) { 131 | INclReleaseMemObject((cl_mem)ptr); 132 | } 133 | 134 | // Creates a kernel with the specified name. 135 | cl_kernel CreateKernel(cl_world world, const char *kernel_name) { 136 | _cl_world *_world = UnpackWorld(world); 137 | 138 | return INclCreateKernel(_world->program, kernel_name); 139 | } 140 | 141 | // Sets a pointer kernel argument. 142 | void SetKernelArgPointer(cl_kernel kernel, cl_uint arg_index, 143 | const void *arg_value) { 144 | INclSetKernelArg(kernel, arg_index, sizeof(cl_mem), &arg_value); 145 | } 146 | 147 | // Sets a scalar kernel argument. 148 | void SetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, 149 | const void *arg_value) { 150 | INclSetKernelArg(kernel, arg_index, arg_size, arg_value); 151 | } 152 | 153 | // Enqueues a kernel operation (Task mode). 154 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel) { 155 | INclEnqueueTask(command_queue, kernel, 0, NULL, NULL); 156 | } 157 | 158 | // Enqueues a kernel operation (NDRangeKernel mode). 159 | void EnqueueKernel(cl_command_queue command_queue, cl_kernel kernel, 160 | const size_t *global_work_size, 161 | const size_t *local_work_size) { 162 | INclEnqueueNDRangeKernel(command_queue, kernel, 3, global_work_size, 163 | local_work_size, 0, NULL, NULL); 164 | } 165 | 166 | // Releases a kernel. 167 | void ReleaseKernel(cl_kernel kernel) { INclReleaseKernel(kernel); } 168 | 169 | // Creates an engine struct with the specified name. 170 | cl_engine CreateEngine(cl_world world, const char *kernel_name) { 171 | _cl_engine *_engine = (_cl_engine *)malloc(sizeof(_cl_engine)); 172 | 173 | _engine->world = world; 174 | 175 | _engine->command_queue = CreateCommandQueue(world); 176 | _engine->kernel = CreateKernel(world, kernel_name); 177 | 178 | return PackEngine(_engine); 179 | } 180 | 181 | // Blocks until all tasks in an engine struct have been completed. 182 | void BlockEngine(cl_engine engine) { 183 | _cl_engine *_engine = UnpackEngine(engine); 184 | 185 | BlockCommandQueue(_engine->command_queue); 186 | } 187 | 188 | // Sets a pointer engine struct argument. 
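// Note that for buffer arguments OpenCL expects the *address of* the cl_mem
// handle, which is why SetKernelArgPointer above forwards sizeof(cl_mem) and
// &arg_value to INclSetKernelArg. A sketch of the equivalent direct call,
// assuming mem is a cl_mem returned by CreateBuffer:
//
//   INclSetKernelArg(kernel, 0, sizeof(cl_mem), &mem);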
189 | void SetEngineArgPointer(cl_engine engine, cl_uint arg_index, 190 | const void *arg_value) { 191 | _cl_engine *_engine = UnpackEngine(engine); 192 | 193 | SetKernelArgPointer(_engine->kernel, arg_index, arg_value); 194 | } 195 | 196 | // Sets a scalar engine struct argument. 197 | void SetEngineArg(cl_engine engine, cl_uint arg_index, size_t arg_size, 198 | const void *arg_value) { 199 | _cl_engine *_engine = UnpackEngine(engine); 200 | 201 | SetKernelArg(_engine->kernel, arg_index, arg_size, arg_value); 202 | } 203 | 204 | // Enqueues an engine struct operation (Task mode). 205 | void EnqueueEngine(cl_engine engine) { 206 | _cl_engine *_engine = UnpackEngine(engine); 207 | 208 | EnqueueKernel(_engine->command_queue, _engine->kernel); 209 | } 210 | 211 | // Enqueues an engine struct operation (NDRangeKernel mode). 212 | void EnqueueEngine(cl_engine engine, const size_t *global_work_size, 213 | const size_t *local_work_size) { 214 | _cl_engine *_engine = UnpackEngine(engine); 215 | 216 | EnqueueKernel(_engine->command_queue, _engine->kernel, global_work_size, 217 | local_work_size); 218 | } 219 | 220 | // Releases an engine struct. 221 | void ReleaseEngine(cl_engine engine) { 222 | _cl_engine *_engine = UnpackEngine(engine); 223 | 224 | ReleaseCommandQueue(_engine->command_queue); 225 | ReleaseKernel(_engine->kernel); 226 | 227 | free(_engine); 228 | } 229 | 230 | // Releases a program. 231 | void ReleaseProgram(cl_world world) { 232 | _cl_world *_world = UnpackWorld(world); 233 | 234 | INclReleaseProgram(_world->program); 235 | } 236 | 237 | // Releases the context. 238 | void ReleaseContext(cl_world world) { 239 | _cl_world *_world = UnpackWorld(world); 240 | 241 | INclReleaseContext(_world->context); 242 | } 243 | 244 | // Releases the world struct. 245 | void ReleaseWorld(cl_world world) { 246 | _cl_world *_world = UnpackWorld(world); 247 | 248 | free(_world); 249 | } 250 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 |
-------------------------------------------------------------------------------- /host_srcs/LogisticRegression.cpp: --------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #ifndef _TEST_
18 | #define _accel_ 1
19 | #else
20 | #define _accel_ 0
21 | #endif
22 |
23 | #include <fstream>
24 | #include <iostream>
25 | #include <malloc.h>
26 | #include <math.h>
27 | #include <sstream>
28 | #include <stdio.h>
29 | #include <stdlib.h>
30 | #include <string>
31 | #include <sys/time.h>
32 | #include <vector>
33 | #include "inaccel/runtime-api.h"
34 |
35 | using namespace std;
36 |
37 | // Dataset specific options
38 | // Change below definitions according to your input dataset
39 | #define NUMCLASSES 26
40 | #define NUMFEATURES 784
41 | #define NUMEXAMPLES 124800
42 | #define NUM_KERNELS 4
43 |
44 | // Function to allocate an aligned memory buffer
45 | void *INalligned_malloc(size_t size) {
46 | void *ptr = memalign(4096, size);
47 | if (!ptr) {
48 | printf("Error: alligned_malloc\n");
49 | exit(EXIT_FAILURE);
50 | }
51 |
52 | return ptr;
53 | }
54 |
55 | // Function to split a string on a set of delimiter characters
56 | vector<string> split(const string &s) {
57 | vector<string> elements;
58 | stringstream ss(s);
59 | string item;
60 |
61 | while (getline(ss, item)) {
62 | size_t prev = 0;
63 | size_t pos;
64 |
65 | while ((pos = item.find_first_of(" (,[])=", prev)) != std::string::npos) {
66 | if (pos > prev)
67 | elements.push_back(item.substr(prev, pos - prev));
68 | prev = pos + 1;
69 | }
70 |
71 | if (prev < item.length())
72 | elements.push_back(item.substr(prev, std::string::npos));
73 | }
74 |
75 | return elements;
76 | }
77 |
78 | // Reads the input dataset and sets features and labels buffers accordingly
79 | void read_input(string filename, float *features, int *labels, int numFeatures,
80 | int numExamples) {
81 | ifstream train;
82 | train.open(filename.c_str());
83 |
84 | string line;
85 | int i;
86 | int n = 0;
87 |
88 | while (getline(train, line) && (n < numExamples)) {
89 | if (line.length()) {
90 | vector<string> tokens = split(line);
91 | features[n * (16 + numFeatures) + numFeatures] = 1.0;
92 | labels[n] = atoi(tokens[0].c_str());
93 | for (i = 0; i < numFeatures; i++) {
94 | features[n * (16 + numFeatures) + i] = atof(tokens[i + 1].c_str());
95 | }
96 | n++;
97 | }
98 | }
99 |
100 | train.close();
101 | }
102 |
103 | // Writes a trained model to the specified filename
104 | void write_output(string filename, float *weights, int numClasses,
105 | int numFeatures) {
106 |
107 | ofstream results;
108 | results.open(filename.c_str());
109 |
110 | for (int k = 0; k < numClasses; k++) {
111 | results << weights[k * (16 + numFeatures)];
112 | for (int j = 1; j < (16 + numFeatures); j++) {
113 | results << "," << weights[k * (16 + numFeatures) + j];
114 | }
115 | results << endl;
116 | }
117 |
118 | results.close();
119 | }
120 |
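// Note on the memory layout used throughout this file: every example (and
// every class row of the weights/gradients buffers) is stored with a stride of
// (16 + numFeatures) floats. read_input() writes the bias term (a constant
// 1.0) at index numFeatures of each row, and the remaining slots pad the row
// so the FPGA kernels can stream it as whole float16 (16-float) words. A small
// worked example, assuming numFeatures = 784 as defined above:
//
//   row stride           = 16 + 784 = 800 floats (50 float16 words)
//   feature j of point n = features[n * 800 + j]
//   bias term of point n = features[n * 800 + 784]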
121 | // A simple classifier. Given a point, it predicts the class with the greatest
122 | // probability
123 | int classify(float *features, float *weights, int numClasses, int numFeatures) {
124 | float prob = -1.0;
125 | int prediction = -1;
126 |
127 | for (int k = 0; k < numClasses; k++) {
128 | float dot = weights[k * (16 + numFeatures) + numFeatures];
129 |
130 | for (int j = 0; j < numFeatures; j++) {
131 | dot += features[j] * weights[k * (16 + numFeatures) + j];
132 | }
133 |
134 | if (1.0 / (1.0 + exp(-dot)) > prob) {
135 | prob = 1.0 / (1.0 + exp(-dot));
136 | prediction = k;
137 | }
138 | }
139 |
140 | return prediction;
141 | }
142 |
143 | // A simple prediction function to evaluate the accuracy of a trained model
144 | void predict(string filename, float *weights, int numClasses, int numFeatures) {
145 | cout << " * LogisticRegression Testing *" << endl;
146 |
147 | float tr = 0.0;
148 | float fls = 0.0;
149 | float example[numFeatures];
150 | string line;
151 | ifstream test;
152 |
153 | test.open(filename.c_str());
154 |
155 | while (getline(test, line)) {
156 | if (line.length()) {
157 | if (line[0] != '#' && line[0] != ' ') {
158 | vector<string> tokens = split(line);
159 |
160 | int label = (int)atof(tokens[0].c_str());
161 | for (int j = 1; j < (1 + numFeatures); j++) {
162 | example[j - 1] = atof(tokens[j].c_str());
163 | }
164 |
165 | int prediction = classify(example, weights, numClasses, numFeatures);
166 |
167 | if (prediction == label)
168 | tr++;
169 | else
170 | fls++;
171 | }
172 | }
173 | }
174 |
175 | test.close();
176 |
177 | printf(" # accuracy: %1.3f (%i/%i)\n", (tr / (tr + fls)), (int)tr,
178 | (int)(tr + fls));
179 | printf(" # true: %i\n", (int)tr);
180 | printf(" # false: %i\n", (int)fls);
181 | }
182 |
183 | // CPU implementation of Logistic Regression gradients calculation
184 | void gradients_sw(int *labels, float *features, float *weights,
185 | float *gradients, int numClasses, int numFeatures,
186 | int numExamples) {
187 | for (int k = 0; k < numClasses; k++) {
188 | for (int j = 0; j < (16 + numFeatures); j++) {
189 | gradients[k * (16 + numFeatures) + j] = 0.0;
190 | }
191 | }
192 |
193 | for (int i = 0; i < numExamples; i++) {
194 | for (int k = 0; k < numClasses; k++) {
195 | float dot = weights[k * (16 + numFeatures) + numFeatures];
196 |
197 | for (int j = 0; j < numFeatures; j++) {
198 | dot += weights[k * (16 + numFeatures) + j] *
199 | features[i * (16 + numFeatures) + j];
200 | }
201 |
202 | float dif = 1.0 / (1.0 + exp(-dot));
203 | if (labels[i] == k)
204 | dif -= 1;
205 |
206 | for (int j = 0; j < (16 + numFeatures); j++) {
207 | gradients[k * (16 + numFeatures) + j] +=
208 | dif * features[i * (16 + numFeatures) + j];
209 | }
210 | }
211 | }
212 | }
213 |
214 | int main(int argc, char *argv[]) {
215 | if (argc != 2) {
216 | cout << "Usage: " << argv[0] << " <iterations>" << endl;
217 | exit(-1);
218 | }
219 |
220 | struct timeval start, end;
221 |
222 | float alpha = 0.3f;
223 | float gamma = 0.95f;
224 | int iter = atoi(argv[1]);
225 |
226 | // Set up the specifications of the model to be trained
227 | int numClasses = NUMCLASSES;
228 | int numFeatures = NUMFEATURES;
229 | int numExamples = NUMEXAMPLES;
230 |
231 | // Split the dataset among the available kernels
232 | int chunkSize = numExamples / NUM_KERNELS;
233 |
234 | // Allocate host buffers for the labels and features of the dataset, the
235 | // weights and gradients of the model to be trained and, lastly, a velocity
236 | // buffer for the momentum update that improves model accuracy
237 | int *labels = (int *)INalligned_malloc(numExamples *
sizeof(int));
238 | float *features = (float *)INalligned_malloc(
239 | numExamples * (16 + numFeatures) * sizeof(float));
240 | float *weights = (float *)INalligned_malloc(numClasses * (16 + numFeatures) *
241 | sizeof(float));
242 | float *gradients = (float *)INalligned_malloc(
243 | numClasses * (16 + numFeatures) * sizeof(float));
244 | float *velocity = (float *)INalligned_malloc(numClasses * (1 + numFeatures) *
245 | sizeof(float));
246 |
247 | // Specify train and test input files as well as output model file
248 | string trainFile = "data/letters_csv_train.dat";
249 | string testFile = "data/letters_csv_test.dat";
250 | string modelFile = "data/weights.out";
251 |
252 | // Read the input dataset
253 | cout << "! Reading train file..." << endl;
254 | read_input(trainFile, features, labels, numFeatures, numExamples);
255 |
256 | // Initialize model weights to zero
257 | for (int i = 0; i < numClasses * (16 + numFeatures); i++)
258 | weights[i] = 0.0;
259 | for (int i = 0; i < numClasses * (1 + numFeatures); i++) velocity[i] = 0.0; // the velocity buffer must also start at zero (memalign does not zero memory)
260 | if (_accel_) {
261 | // Invoke the hardware accelerated implementation of the algorithm
262 |
263 | cl_engine engine[NUM_KERNELS];
264 | float *ffeatures[NUM_KERNELS], *fweights[NUM_KERNELS];
265 | float *fgradients[NUM_KERNELS], *grads[NUM_KERNELS];
266 | int *flabels[NUM_KERNELS];
267 |
268 | size_t labels_size = chunkSize * sizeof(int);
269 | size_t features_size = chunkSize * (numFeatures + 16) * sizeof(float);
270 | size_t weights_size = numClasses * (numFeatures + 16) * sizeof(float);
271 |
272 | // Initialize the FPGA world
273 | cl_world world = InAccel::create_world(0);
274 | // Program the FPGA device using the provided bitstream
275 | InAccel::create_program(world, "Gradients.xclbin");
276 |
277 | // Instantiate the kernels of the bitstream. Each engine holds a kernel
278 | // along with its command queue
279 | engine[0] = InAccel::create_engine(world, "Gradients_0");
280 | engine[1] = InAccel::create_engine(world, "Gradients_1");
281 | engine[2] = InAccel::create_engine(world, "Gradients_2");
282 | engine[3] = InAccel::create_engine(world, "Gradients_3");
283 |
284 | // Memcpy to each memory bank the corresponding part of the input dataset
285 | for (int i = 0; i < NUM_KERNELS; i++) {
286 | flabels[i] = (int *)InAccel::malloc(world, labels_size, i);
287 | InAccel::memcpy_to(world, flabels[i], 0, labels + i * chunkSize,
288 | labels_size);
289 | ffeatures[i] = (float *)InAccel::malloc(world, features_size, i);
290 | InAccel::memcpy_to(world, ffeatures[i], 0,
291 | features + (i * chunkSize * (16 + numFeatures)),
292 | features_size);
293 |
294 | fweights[i] = (float *)InAccel::malloc(world, weights_size, i);
295 |
296 | fgradients[i] = (float *)InAccel::malloc(world, weights_size, i);
297 | grads[i] = (float *)INalligned_malloc(weights_size);
298 | }
299 |
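// Every training iteration below follows the same five-step pattern: copy the
// current weights to every memory bank, launch the four Gradients kernels,
// wait for all of them to finish, read back the partial gradients, and reduce
// them on the host before the momentum update. In outline:
//
//   for each iteration t:
//     InAccel::memcpy_to(fweights[i])     // refresh the weights, per bank
//     InAccel::run_engine(engine[i])      // all four kernels run in parallel
//     InAccel::await_engine(engine[i])    // barrier
//     InAccel::memcpy_from(fgradients[i]) // partial gradients, one per kernel
//     aggregate + momentum update         // on the host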
300 | gettimeofday(&start, NULL);
301 | // Start the iterative part for the training of the algorithm
302 | for (int t = 0; t < iter; t++) {
303 | for (int i = 0; i < NUM_KERNELS; i++) {
304 | // Memcpy to DDR the weights of the model
305 | InAccel::memcpy_to(world, fweights[i], 0, weights, weights_size);
306 |
307 | // Set the kernel arguments
308 | InAccel::set_engine_arg(engine[i], 0, flabels[i]);
309 | InAccel::set_engine_arg(engine[i], 1, ffeatures[i]);
310 | InAccel::set_engine_arg(engine[i], 2, fweights[i]);
311 | InAccel::set_engine_arg(engine[i], 3, fgradients[i]);
312 | InAccel::set_engine_arg(engine[i], 4, numClasses);
313 | InAccel::set_engine_arg(engine[i], 5, numFeatures);
314 | InAccel::set_engine_arg(engine[i], 6, chunkSize);
315 |
316 | // Invoke the kernel execution
317 | InAccel::run_engine(engine[i]);
318 | }
319 |
320 | // Wait for the kernels to finish
321 | for (int i = 0; i < NUM_KERNELS; i++) {
322 | InAccel::await_engine(engine[i]);
323 | }
324 |
325 | // Get the gradients as computed by the kernels
326 | for (int i = 0; i < NUM_KERNELS; i++) {
327 | InAccel::memcpy_from(world, fgradients[i], 0, grads[i], weights_size);
328 | }
329 |
330 | // Aggregate the gradients from all kernels
331 | for (int j = 0; j < numClasses * (16 + numFeatures); j++) {
332 | gradients[j] = grads[0][j];
333 | for (int i = 1; i < NUM_KERNELS; i++) {
334 | gradients[j] += grads[i][j];
335 | }
336 | }
337 |
338 | // Compute the new weights of the model, applying a momentum (velocity)
339 | // update for faster convergence and better model accuracy
340 | for (int k = 0; k < numClasses; k++) {
341 | for (int j = 0; j < (1 + numFeatures); j++) {
342 | velocity[k * (1 + numFeatures) + j] =
343 | gamma * velocity[k * (1 + numFeatures) + j] +
344 | (alpha / numExamples) * gradients[k * (16 + numFeatures) + j];
345 | weights[k * (16 + numFeatures) + j] -=
346 | velocity[k * (1 + numFeatures) + j];
347 | }
348 | }
349 | }
350 |
351 | gettimeofday(&end, NULL);
352 |
353 | // Free any allocated buffers for the FPGA device and release the allocated
354 | // kernels and command queues
355 | for (int i = 0; i < NUM_KERNELS; i++) {
356 | free(grads[i]);
357 | InAccel::free(world, fgradients[i]);
358 | InAccel::free(world, fweights[i]);
359 | InAccel::free(world, ffeatures[i]);
360 | InAccel::free(world, flabels[i]);
361 | InAccel::release_engine(engine[i]);
362 | }
363 |
364 | // Release the FPGA program
365 | InAccel::release_program(world);
366 | // Release the FPGA world
367 | InAccel::release_world(world);
368 | } else {
369 | // Invoke the software implementation of the algorithm
370 | gettimeofday(&start, NULL);
371 | for (int t = 0; t < iter; t++) {
372 | gradients_sw(labels, features, weights, gradients, numClasses,
373 | numFeatures, numExamples);
374 | for (int k = 0; k < numClasses; k++) {
375 | for (int j = 0; j < (1 + numFeatures); j++) {
376 | velocity[k * (1 + numFeatures) + j] =
377 | gamma * velocity[k * (1 + numFeatures) + j] +
378 | (alpha / numExamples) * gradients[k * (16 + numFeatures) + j];
379 | weights[k * (16 + numFeatures) + j] -=
380 | velocity[k * (1 + numFeatures) + j];
381 | }
382 | }
383 | }
384 | gettimeofday(&end, NULL);
385 | }
386 |
387 | float time_us = ((end.tv_sec * 1000000) + end.tv_usec) -
388 | ((start.tv_sec * 1000000) + start.tv_usec);
389 | float time_s = (end.tv_sec - start.tv_sec);
390 |
391 | cout << "! Time running Gradients Kernel: " << time_us / 1000 << " msec, "
392 | << time_s << " sec " << endl;
393 |
394 | // Compute the accuracy of the trained model on a given test dataset.
395 | predict(testFile, weights, numClasses, numFeatures);
396 |
397 | // Save the model to the specified user file
398 | write_output(modelFile, weights, numClasses, numFeatures);
399 |
400 | // Free any host allocated buffers
401 | free(labels);
402 | free(features);
403 | free(weights);
404 | free(gradients);
405 | free(velocity);
406 |
407 | return 0;
408 | }
409 |
--------------------------------------------------------------------------------
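// A note on the update step in LogisticRegression.cpp above: the velocity
// buffer is stored densely with stride (1 + numFeatures) -- one slot per real
// weight plus the bias -- while weights and gradients keep the FPGA-padded
// stride (16 + numFeatures), so the update only ever touches the first
// (1 + numFeatures) entries of each padded row. Per scalar weight w the rule
// is the classic momentum form, with gamma the momentum factor (0.95) and
// alpha the learning rate (0.3):
//
//   v <- gamma * v + (alpha / numExamples) * g
//   w <- w - v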
/host_srcs/common/INcl.cpp: --------------------------------------------------------------------------------
1 | /*
2 | Copyright © 2019 InAccel
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | */
16 |
17 | #include <stdio.h>
18 | #include <stdlib.h>
19 | #include <string.h>
20 | #include "INcl.h"
21 |
22 | // Builds a program executable from the program binary.
23 | void INclBuildProgram(cl_program program) {
24 | cl_int errcode_ret = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
25 | if (errcode_ret != CL_SUCCESS) {
26 | fprintf(stderr, "Error: clBuildProgram %s (%d)\n",
27 | INclCheckErrorCode(errcode_ret), errcode_ret);
28 | throw EXIT_FAILURE;
29 | }
30 | }
31 |
32 | // Creates a buffer object.
33 | cl_mem INclCreateBuffer(cl_context context, cl_mem_flags flags, size_t size,
34 | void *host_ptr) {
35 | cl_int errcode_ret;
36 | cl_mem mem = clCreateBuffer(context, flags, size, host_ptr, &errcode_ret);
37 | if (errcode_ret != CL_SUCCESS || !mem) {
38 | fprintf(stderr, "Error: clCreateBuffer %s (%d)\n",
39 | INclCheckErrorCode(errcode_ret), errcode_ret);
40 | throw EXIT_FAILURE;
41 | }
42 |
43 | return mem;
44 | }
45 |
46 | // Creates a command-queue on a specific device.
47 | cl_command_queue INclCreateCommandQueue(cl_context context,
48 | cl_device_id device) {
49 | cl_int errcode_ret;
50 | cl_command_queue command_queue =
51 | clCreateCommandQueue(context, device, 0, &errcode_ret);
52 | if (errcode_ret != CL_SUCCESS || !command_queue) {
53 | fprintf(stderr, "Error: clCreateCommandQueue %s (%d)\n",
54 | INclCheckErrorCode(errcode_ret), errcode_ret);
55 | throw EXIT_FAILURE;
56 | }
57 |
58 | return command_queue;
59 | }
60 |
61 | // Creates an OpenCL context.
62 | cl_context INclCreateContext(cl_device_id device) {
63 | cl_int errcode_ret;
64 | cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &errcode_ret);
65 | if (errcode_ret != CL_SUCCESS || !context) {
66 | fprintf(stderr, "Error: clCreateContext %s (%d)\n",
67 | INclCheckErrorCode(errcode_ret), errcode_ret);
68 | throw EXIT_FAILURE;
69 | }
70 |
71 | return context;
72 | }
73 |
74 | // Creates a kernel object.
75 | cl_kernel INclCreateKernel(cl_program program, const char *kernel_name) {
76 | cl_int errcode_ret;
77 | cl_kernel kernel = clCreateKernel(program, kernel_name, &errcode_ret);
78 | if (errcode_ret != CL_SUCCESS || !kernel) {
79 | fprintf(stderr, "Error: clCreateKernel %s (%d)\n",
80 | INclCheckErrorCode(errcode_ret), errcode_ret);
81 | throw EXIT_FAILURE;
82 | }
83 |
84 | return kernel;
85 | }
86 |
87 | // Creates a program object for a context, and loads specified binary data into
88 | // the program object.
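// The binary is read from disk in full (fopen/fseek/ftell/fread) and handed to
// clCreateProgramWithBinary as a single device binary. A sketch of a typical
// call sequence, assuming context and device were obtained through the
// wrappers above and that the bitstream sits in the working directory:
//
//   cl_program program =
//       INclCreateProgramWithBinary(context, 1, &device, "Gradients.xclbin");
//   INclBuildProgram(program);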
89 | cl_program INclCreateProgramWithBinary(cl_context context, cl_uint num_devices, 90 | const cl_device_id *device_list, 91 | const char *binary_name) { 92 | FILE *file = fopen(binary_name, "rb"); 93 | if (!file) { 94 | fprintf(stderr, "Error: fopen\n"); 95 | throw EXIT_FAILURE; 96 | } 97 | 98 | fseek(file, 0, SEEK_END); 99 | size_t size = ftell(file); 100 | fseek(file, 0, SEEK_SET); 101 | 102 | char *temp = (char *)malloc((size + 1) * sizeof(char)); 103 | if (!temp) { 104 | fprintf(stderr, "Error: malloc\n"); 105 | throw EXIT_FAILURE; 106 | } 107 | 108 | if (size != fread(temp, sizeof(char), size, file)) { 109 | free(temp); 110 | 111 | fprintf(stderr, "Error: fread\n"); 112 | throw EXIT_FAILURE; 113 | } 114 | 115 | fclose(file); 116 | temp[size] = 0; 117 | 118 | char *binary = temp; 119 | 120 | cl_int errcode_ret; 121 | cl_program program = clCreateProgramWithBinary( 122 | context, num_devices, device_list, &size, (const unsigned char **)&binary, 123 | NULL, &errcode_ret); 124 | if (errcode_ret != CL_SUCCESS || !program) { 125 | fprintf(stderr, "Error: clCreateProgramWithBinary %s (%d)\n", 126 | INclCheckErrorCode(errcode_ret), errcode_ret); 127 | throw EXIT_FAILURE; 128 | } 129 | 130 | free(temp); 131 | 132 | return program; 133 | } 134 | 135 | // Enqueues a command to map a region of the buffer object given by buffer into 136 | // the host address space and returns a pointer to this mapped region. 137 | void *INclEnqueueMapBuffer(cl_command_queue command_queue, cl_mem buffer, 138 | cl_map_flags map_flags, size_t cb, 139 | cl_uint num_events_in_wait_list, 140 | const cl_event *event_wait_list, cl_event *event) { 141 | cl_int errcode_ret; 142 | void *ptr = clEnqueueMapBuffer(command_queue, buffer, CL_FALSE, map_flags, 0, 143 | cb, num_events_in_wait_list, event_wait_list, 144 | event, &errcode_ret); 145 | if (errcode_ret != CL_SUCCESS || !ptr) { 146 | fprintf(stderr, "Error: clEnqueueMapBuffer %s (%d)\n", 147 | INclCheckErrorCode(errcode_ret), errcode_ret); 148 | throw EXIT_FAILURE; 149 | } 150 | 151 | return ptr; 152 | } 153 | 154 | // Enqueues a command to indicate which device a set of memory objects should be 155 | // associated with. 156 | void INclEnqueueMigrateMemObjects(cl_command_queue command_queue, 157 | cl_uint num_mem_objects, 158 | const cl_mem *mem_objects, 159 | cl_mem_migration_flags flags, 160 | cl_uint num_events_in_wait_list, 161 | const cl_event *event_wait_list, 162 | cl_event *event) { 163 | cl_int errcode_ret = clEnqueueMigrateMemObjects( 164 | command_queue, num_mem_objects, mem_objects, flags, 165 | num_events_in_wait_list, event_wait_list, event); 166 | if (errcode_ret != CL_SUCCESS) { 167 | fprintf(stderr, "Error: clEnqueueMigrateMemObjects %s (%d)\n", 168 | INclCheckErrorCode(errcode_ret), errcode_ret); 169 | throw EXIT_FAILURE; 170 | } 171 | } 172 | 173 | // Enqueues a command to execute a kernel on a device. 
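// Unlike the Task-mode path (INclEnqueueTask, a single work-item), the NDRange
// wrapper below forwards work_dim and hardcodes a NULL global offset; note
// that the higher-level EnqueueKernel in runtime.cpp always calls it with
// work_dim = 3, so three-element size arrays are expected there. The host code
// in this repository only exercises the Task-mode path.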
174 | void INclEnqueueNDRangeKernel(cl_command_queue command_queue, cl_kernel kernel, 175 | cl_uint work_dim, const size_t *global_work_size, 176 | const size_t *local_work_size, 177 | cl_uint num_events_in_wait_list, 178 | const cl_event *event_wait_list, 179 | cl_event *event) { 180 | cl_int errcode_ret = clEnqueueNDRangeKernel( 181 | command_queue, kernel, work_dim, NULL, global_work_size, local_work_size, 182 | num_events_in_wait_list, event_wait_list, event); 183 | if (errcode_ret != CL_SUCCESS) { 184 | fprintf(stderr, "Error: clEnqueueNDRangeKernel %s (%d)\n", 185 | INclCheckErrorCode(errcode_ret), errcode_ret); 186 | throw EXIT_FAILURE; 187 | } 188 | } 189 | 190 | // Enqueue commands to read from a buffer object to host memory. 191 | void INclEnqueueReadBuffer(cl_command_queue command_queue, cl_mem buffer, 192 | size_t offset, size_t cb, void *ptr, 193 | cl_uint num_events_in_wait_list, 194 | const cl_event *event_wait_list, cl_event *event) { 195 | cl_int errcode_ret = 196 | clEnqueueReadBuffer(command_queue, buffer, CL_FALSE, offset, cb, ptr, 197 | num_events_in_wait_list, event_wait_list, event); 198 | if (errcode_ret != CL_SUCCESS) { 199 | fprintf(stderr, "Error: clEnqueueReadBuffer %s (%d)\n", 200 | INclCheckErrorCode(errcode_ret), errcode_ret); 201 | throw EXIT_FAILURE; 202 | } 203 | } 204 | 205 | // Enqueues a command to execute a kernel on a device. 206 | void INclEnqueueTask(cl_command_queue command_queue, cl_kernel kernel, 207 | cl_uint num_events_in_wait_list, 208 | const cl_event *event_wait_list, cl_event *event) { 209 | cl_int errcode_ret = clEnqueueTask( 210 | command_queue, kernel, num_events_in_wait_list, event_wait_list, event); 211 | if (errcode_ret != CL_SUCCESS) { 212 | fprintf(stderr, "Error: clEnqueueTask %s (%d)\n", 213 | INclCheckErrorCode(errcode_ret), errcode_ret); 214 | throw EXIT_FAILURE; 215 | } 216 | } 217 | 218 | // Enqueue commands to write to a buffer object from host memory. 219 | void INclEnqueueWriteBuffer(cl_command_queue command_queue, cl_mem buffer, 220 | size_t offset, size_t cb, const void *ptr, 221 | cl_uint num_events_in_wait_list, 222 | const cl_event *event_wait_list, cl_event *event) { 223 | cl_int errcode_ret = 224 | clEnqueueWriteBuffer(command_queue, buffer, CL_FALSE, offset, cb, ptr, 225 | num_events_in_wait_list, event_wait_list, event); 226 | if (errcode_ret != CL_SUCCESS) { 227 | fprintf(stderr, "Error: clEnqueueWriteBuffer %s (%d)\n", 228 | INclCheckErrorCode(errcode_ret), errcode_ret); 229 | throw EXIT_FAILURE; 230 | } 231 | } 232 | 233 | // Blocks until all previously queued OpenCL commands in a command-queue are 234 | // issued to the associated device and have completed. 235 | void INclFinish(cl_command_queue command_queue) { 236 | cl_int errcode_ret = clFinish(command_queue); 237 | if (errcode_ret != CL_SUCCESS) { 238 | fprintf(stderr, "Error: clFinish %s (%d)\n", 239 | INclCheckErrorCode(errcode_ret), errcode_ret); 240 | throw EXIT_FAILURE; 241 | } 242 | } 243 | 244 | // Issues all previously queued OpenCL commands in a command-queue to the device 245 | // associated with the command-queue. 246 | void INclFlush(cl_command_queue command_queue) { 247 | cl_int errcode_ret = clFlush(command_queue); 248 | if (errcode_ret != CL_SUCCESS) { 249 | fprintf(stderr, "Error: clFlush %s (%d)\n", INclCheckErrorCode(errcode_ret), 250 | errcode_ret); 251 | throw EXIT_FAILURE; 252 | } 253 | } 254 | 255 | // Obtain specified device, if available. 
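// Device lookup follows the usual two-call OpenCL pattern: query the number of
// devices first, then fetch all IDs and keep the one at the requested index.
// For example, InAccel::create_world(0) ends up here asking for device 0 of
// the Xilinx platform.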
256 | cl_device_id INclGetDeviceID(cl_platform_id platform, cl_uint id) {
257 | cl_device_id device_id = (cl_device_id)malloc(sizeof(cl_device_id));
258 | if (!device_id) {
259 | fprintf(stderr, "Error: malloc\n");
260 | throw EXIT_FAILURE;
261 | }
262 |
263 | cl_uint num_devices;
264 | INclGetDeviceIDs(platform, 0, NULL, &num_devices);
265 |
266 | cl_device_id *devices =
267 | (cl_device_id *)malloc(num_devices * sizeof(cl_device_id));
268 | if (!devices) {
269 | fprintf(stderr, "Error: malloc\n");
270 | throw EXIT_FAILURE;
271 | }
272 |
273 | INclGetDeviceIDs(platform, num_devices, devices, NULL);
274 |
275 | cl_uint i;
276 | for (i = 0; i < num_devices; i++) {
277 | if (i == id) {
278 | free(device_id); device_id = devices[i]; // free the placeholder allocation before storing the real handle
279 | break;
280 | }
281 | }
282 |
283 | free(devices);
284 |
285 | if (i == num_devices) {
286 | fprintf(stderr, "Error: clGetDeviceID\n");
287 | throw EXIT_FAILURE;
288 | }
289 |
290 | return device_id;
291 | }
292 |
293 | // Obtain the list of devices available on a platform.
294 | void INclGetDeviceIDs(cl_platform_id platform, cl_uint num_entries,
295 | cl_device_id *devices, cl_uint *num_devices) {
296 | cl_int errcode_ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_entries,
297 | devices, num_devices);
298 | if (errcode_ret != CL_SUCCESS) {
299 | fprintf(stderr, "Error: clGetDeviceIDs %s (%d)\n",
300 | INclCheckErrorCode(errcode_ret), errcode_ret);
301 | throw EXIT_FAILURE;
302 | }
303 | }
304 |
305 | // Get specific information about the OpenCL device.
306 | void INclGetDeviceInfo(cl_device_id device, cl_device_info param_name,
307 | size_t param_value_size, void *param_value,
308 | size_t *param_value_size_ret) {
309 | cl_int errcode_ret = clGetDeviceInfo(device, param_name, param_value_size,
310 | param_value, param_value_size_ret);
311 | if (errcode_ret != CL_SUCCESS) {
312 | fprintf(stderr, "Error: clGetDeviceInfo %s (%d)\n",
313 | INclCheckErrorCode(errcode_ret), errcode_ret);
314 | throw EXIT_FAILURE;
315 | }
316 | }
317 |
318 | // Obtain platform, if available.
319 | cl_platform_id INclGetPlatformID() {
320 | cl_platform_id platform_id = (cl_platform_id)malloc(sizeof(cl_platform_id));
321 | if (!platform_id) {
322 | fprintf(stderr, "Error: malloc\n");
323 | throw EXIT_FAILURE;
324 | }
325 |
326 | cl_uint num_platforms;
327 | INclGetPlatformIDs(0, NULL, &num_platforms);
328 |
329 | cl_platform_id *platforms =
330 | (cl_platform_id *)malloc(num_platforms * sizeof(cl_platform_id));
331 | if (!platforms) {
332 | fprintf(stderr, "Error: malloc\n");
333 | throw EXIT_FAILURE;
334 | }
335 |
336 | INclGetPlatformIDs(num_platforms, platforms, NULL);
337 |
338 | cl_uint i;
339 | for (i = 0; i < num_platforms; i++) {
340 | size_t platform_name_size;
341 | INclGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 0, NULL,
342 | &platform_name_size);
343 |
344 | char *platform_name = (char *)malloc(platform_name_size * sizeof(char));
345 | if (!platform_name) {
346 | fprintf(stderr, "Error: malloc\n");
347 | throw EXIT_FAILURE;
348 | }
349 |
350 | INclGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, platform_name_size,
351 | platform_name, NULL);
352 |
353 | if (strstr(platform_name, "Xilinx")) {
354 | free(platform_name);
355 |
356 | free(platform_id); platform_id = platforms[i]; // free the placeholder allocation before storing the real handle
357 | break;
358 | }
359 |
360 | free(platform_name);
361 | }
362 |
363 | free(platforms);
364 |
365 | if (i == num_platforms) {
366 | fprintf(stderr, "Error: clGetPlatformID\n");
367 | throw EXIT_FAILURE;
368 | }
369 |
370 | return platform_id;
371 | }
372 |
373 | // Obtain the list of platforms available.
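// The same count-then-fetch pattern is used for platforms; INclGetPlatformID
// above additionally walks the list and picks the first platform whose
// CL_PLATFORM_NAME contains "Xilinx". A sketch of the raw two-call pattern
// against the OpenCL C API:
//
//   cl_uint n;
//   clGetPlatformIDs(0, NULL, &n);   // first call: count only
//   cl_platform_id *ids = (cl_platform_id *)malloc(n * sizeof(cl_platform_id));
//   clGetPlatformIDs(n, ids, NULL);  // second call: fill the array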
374 | void INclGetPlatformIDs(cl_uint num_entries, cl_platform_id *platforms, 375 | cl_uint *num_platforms) { 376 | cl_int errcode_ret = clGetPlatformIDs(num_entries, platforms, num_platforms); 377 | if (errcode_ret != CL_SUCCESS) { 378 | fprintf(stderr, "Error: clGetPlatformIDs %s (%d)\n", 379 | INclCheckErrorCode(errcode_ret), errcode_ret); 380 | throw EXIT_FAILURE; 381 | } 382 | } 383 | 384 | // Get specific information about the OpenCL platform. 385 | void INclGetPlatformInfo(cl_platform_id platform, cl_platform_info param_name, 386 | size_t param_value_size, void *param_value, 387 | size_t *param_value_size_ret) { 388 | cl_int errcode_ret = clGetPlatformInfo(platform, param_name, param_value_size, 389 | param_value, param_value_size_ret); 390 | if (errcode_ret != CL_SUCCESS) { 391 | fprintf(stderr, "Error: clGetPlatformInfo %s (%d)\n", 392 | INclCheckErrorCode(errcode_ret), errcode_ret); 393 | throw EXIT_FAILURE; 394 | } 395 | } 396 | 397 | // Decrements the command_queue reference count. 398 | void INclReleaseCommandQueue(cl_command_queue command_queue) { 399 | cl_int errcode_ret = clReleaseCommandQueue(command_queue); 400 | if (errcode_ret != CL_SUCCESS) { 401 | fprintf(stderr, "Error: clReleaseCommandQueue %s (%d)\n", 402 | INclCheckErrorCode(errcode_ret), errcode_ret); 403 | throw EXIT_FAILURE; 404 | } 405 | } 406 | 407 | // Decrement the context reference count. 408 | void INclReleaseContext(cl_context context) { 409 | cl_int errcode_ret = clReleaseContext(context); 410 | if (errcode_ret != CL_SUCCESS) { 411 | fprintf(stderr, "Error: clReleaseContext %s (%d)\n", 412 | INclCheckErrorCode(errcode_ret), errcode_ret); 413 | throw EXIT_FAILURE; 414 | } 415 | } 416 | 417 | // Decrements the event reference count. 418 | void INclReleaseEvent(cl_event event) { 419 | cl_int errcode_ret = clReleaseEvent(event); 420 | if (errcode_ret != CL_SUCCESS) { 421 | fprintf(stderr, "Error: clReleaseEvent %s (%d)\n", 422 | INclCheckErrorCode(errcode_ret), errcode_ret); 423 | throw EXIT_FAILURE; 424 | } 425 | } 426 | 427 | // Decrements the kernel reference count. 428 | void INclReleaseKernel(cl_kernel kernel) { 429 | cl_int errcode_ret = clReleaseKernel(kernel); 430 | if (errcode_ret != CL_SUCCESS) { 431 | fprintf(stderr, "Error: clReleaseKernel %s (%d)\n", 432 | INclCheckErrorCode(errcode_ret), errcode_ret); 433 | throw EXIT_FAILURE; 434 | } 435 | } 436 | 437 | // Decrements the memory object reference count. 438 | void INclReleaseMemObject(cl_mem memobj) { 439 | cl_int errcode_ret = clReleaseMemObject(memobj); 440 | if (errcode_ret != CL_SUCCESS) { 441 | fprintf(stderr, "Error: clReleaseMemObject %s (%d)\n", 442 | INclCheckErrorCode(errcode_ret), errcode_ret); 443 | throw EXIT_FAILURE; 444 | } 445 | } 446 | 447 | // Decrements the program reference count. 448 | void INclReleaseProgram(cl_program program) { 449 | cl_int errcode_ret = clReleaseProgram(program); 450 | if (errcode_ret != CL_SUCCESS) { 451 | fprintf(stderr, "Error: clReleaseProgram %s (%d)\n", 452 | INclCheckErrorCode(errcode_ret), errcode_ret); 453 | throw EXIT_FAILURE; 454 | } 455 | } 456 | 457 | // Used to set the argument value for a specific argument of a kernel. 
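// Scalar and buffer arguments both go through this entry point; only the
// (size, pointer) pair differs. For instance, the host-side call
// InAccel::set_engine_arg(engine, 4, numClasses) bottoms out here as:
//
//   INclSetKernelArg(kernel, 4, sizeof(int), &numClasses);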
458 | void INclSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, 459 | const void *arg_value) { 460 | cl_int errcode_ret = clSetKernelArg(kernel, arg_index, arg_size, arg_value); 461 | if (errcode_ret != CL_SUCCESS) { 462 | fprintf(stderr, "Error: clSetKernelArg %s (%d)\n", 463 | INclCheckErrorCode(errcode_ret), errcode_ret); 464 | throw EXIT_FAILURE; 465 | } 466 | } 467 | 468 | // Waits on the host thread for commands identified by event objects to 469 | // complete. 470 | void INclWaitForEvents(cl_uint num_events, const cl_event *event_list) { 471 | cl_int errcode_ret = clWaitForEvents(num_events, event_list); 472 | if (errcode_ret != CL_SUCCESS) { 473 | fprintf(stderr, "Error: clWaitForEvents %s (%d)\n", 474 | INclCheckErrorCode(errcode_ret), errcode_ret); 475 | throw EXIT_FAILURE; 476 | } 477 | } 478 | 479 | // Returns a message related to the error code. 480 | const char *INclCheckErrorCode(cl_int errcode) { 481 | switch (errcode) { 482 | case -1: 483 | return "CL_DEVICE_NOT_FOUND"; 484 | case -2: 485 | return "CL_DEVICE_NOT_AVAILABLE"; 486 | case -3: 487 | return "CL_COMPILER_NOT_AVAILABLE"; 488 | case -4: 489 | return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; 490 | case -5: 491 | return "CL_OUT_OF_RESOURCES"; 492 | case -6: 493 | return "CL_OUT_OF_HOST_MEMORY"; 494 | case -7: 495 | return "CL_PROFILING_INFO_NOT_AVAILABLE"; 496 | case -8: 497 | return "CL_MEM_COPY_OVERLAP"; 498 | case -9: 499 | return "CL_IMAGE_FORMAT_MISMATCH"; 500 | case -10: 501 | return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; 502 | case -11: 503 | return "CL_BUILD_PROGRAM_FAILURE"; 504 | case -12: 505 | return "CL_MAP_FAILURE"; 506 | case -13: 507 | return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; 508 | case -14: 509 | return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; 510 | case -15: 511 | return "CL_COMPILE_PROGRAM_FAILURE"; 512 | case -16: 513 | return "CL_LINKER_NOT_AVAILABLE"; 514 | case -17: 515 | return "CL_LINK_PROGRAM_FAILURE"; 516 | case -18: 517 | return "CL_DEVICE_PARTITION_FAILED"; 518 | case -19: 519 | return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; 520 | case -30: 521 | return "CL_INVALID_VALUE"; 522 | case -31: 523 | return "CL_INVALID_DEVICE_TYPE"; 524 | case -32: 525 | return "CL_INVALID_PLATFORM"; 526 | case -33: 527 | return "CL_INVALID_DEVICE"; 528 | case -34: 529 | return "CL_INVALID_CONTEXT"; 530 | case -35: 531 | return "CL_INVALID_QUEUE_PROPERTIES"; 532 | case -36: 533 | return "CL_INVALID_COMMAND_QUEUE"; 534 | case -37: 535 | return "CL_INVALID_HOST_PTR"; 536 | case -38: 537 | return "CL_INVALID_MEM_OBJECT"; 538 | case -39: 539 | return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 540 | case -40: 541 | return "CL_INVALID_IMAGE_SIZE"; 542 | case -41: 543 | return "CL_INVALID_SAMPLER"; 544 | case -42: 545 | return "CL_INVALID_BINARY"; 546 | case -43: 547 | return "CL_INVALID_BUILD_OPTIONS"; 548 | case -44: 549 | return "CL_INVALID_PROGRAM"; 550 | case -45: 551 | return "CL_INVALID_PROGRAM_EXECUTABLE"; 552 | case -46: 553 | return "CL_INVALID_KERNEL_NAME"; 554 | case -47: 555 | return "CL_INVALID_KERNEL_DEFINITION"; 556 | case -48: 557 | return "CL_INVALID_KERNEL"; 558 | case -49: 559 | return "CL_INVALID_ARG_INDEX"; 560 | case -50: 561 | return "CL_INVALID_ARG_VALUE"; 562 | case -51: 563 | return "CL_INVALID_ARG_SIZE"; 564 | case -52: 565 | return "CL_INVALID_KERNEL_ARGS"; 566 | case -53: 567 | return "CL_INVALID_WORK_DIMENSION"; 568 | case -54: 569 | return "CL_INVALID_WORK_GROUP_SIZE"; 570 | case -55: 571 | return "CL_INVALID_WORK_ITEM_SIZE"; 572 | case -56: 573 | return "CL_INVALID_GLOBAL_OFFSET"; 
574 | case -57: 575 | return "CL_INVALID_EVENT_WAIT_LIST"; 576 | case -58: 577 | return "CL_INVALID_EVENT"; 578 | case -59: 579 | return "CL_INVALID_OPERATION"; 580 | case -60: 581 | return "CL_INVALID_GL_OBJECT"; 582 | case -61: 583 | return "CL_INVALID_BUFFER_SIZE"; 584 | case -62: 585 | return "CL_INVALID_MIP_LEVEL"; 586 | case -63: 587 | return "CL_INVALID_GLOBAL_WORK_SIZE"; 588 | case -64: 589 | return "CL_INVALID_PROPERTY"; 590 | case -65: 591 | return "CL_INVALID_IMAGE_DESCRIPTOR"; 592 | case -66: 593 | return "CL_INVALID_COMPILER_OPTIONS"; 594 | case -67: 595 | return "CL_INVALID_LINKER_OPTIONS"; 596 | case -68: 597 | return "CL_INVALID_DEVICE_PARTITION_COUNT"; 598 | case -69: 599 | return "CL_INVALID_PIPE_SIZE"; 600 | case -70: 601 | return "CL_INVALID_DEVICE_QUEUE"; 602 | default: 603 | return "CL_INVALID_ERROR_CODE"; 604 | } 605 | } 606 | --------------------------------------------------------------------------------