├── .gitignore ├── .idea ├── compiler.xml ├── copyright │ └── profiles_settings.xml ├── gradle.xml ├── misc.xml ├── modules.xml ├── runConfigurations.xml └── vcs.xml ├── README.md ├── app ├── .gitignore ├── CMakeLists.txt ├── build.gradle ├── proguard-rules.pro └── src │ ├── androidTest │ └── java │ │ └── com │ │ └── lanytek │ │ └── deepsensev3 │ │ └── ExampleInstrumentedTest.java │ ├── main │ ├── AndroidManifest.xml │ ├── assets │ │ └── deepsense.cl │ ├── cpp │ │ ├── basic_functions.cpp │ │ ├── classifier.cpp │ │ ├── clio.cpp │ │ ├── deepsense.cpp │ │ ├── deepsense_lib.cpp │ │ ├── include │ │ │ ├── basic_functions.hpp │ │ │ ├── classifier.hpp │ │ │ ├── clio.hpp │ │ │ ├── deepsense_internal_lib.hpp │ │ │ ├── deepsense_lib.hpp │ │ │ ├── layers │ │ │ │ ├── conv_layer.hpp │ │ │ │ ├── fully_connected.hpp │ │ │ │ ├── lrn.hpp │ │ │ │ ├── maxpool.hpp │ │ │ │ └── softmax.hpp │ │ │ ├── predefine.hpp │ │ │ └── utilities.hpp │ │ ├── layers │ │ │ ├── conv_layer.cpp │ │ │ ├── fully_connected.cpp │ │ │ ├── lrn.cpp │ │ │ ├── maxpool.cpp │ │ │ └── softmax.cpp │ │ └── utilities.cpp │ ├── java │ │ └── com │ │ │ └── lanytek │ │ │ └── deepsensev3 │ │ │ ├── MainActivity.java │ │ │ └── Utilities.java │ └── res │ │ ├── layout │ │ └── activity_main.xml │ │ ├── mipmap-hdpi │ │ └── ic_launcher.png │ │ ├── mipmap-mdpi │ │ └── ic_launcher.png │ │ ├── mipmap-xhdpi │ │ └── ic_launcher.png │ │ ├── mipmap-xxhdpi │ │ └── ic_launcher.png │ │ ├── mipmap-xxxhdpi │ │ └── ic_launcher.png │ │ ├── values-w820dp │ │ └── dimens.xml │ │ └── values │ │ ├── colors.xml │ │ ├── dimens.xml │ │ ├── strings.xml │ │ └── styles.xml │ └── test │ └── java │ └── com │ └── lanytek │ └── deepsensev3 │ └── ExampleUnitTest.java ├── build.gradle ├── distribution └── opencl │ ├── include │ └── CL │ │ ├── cl.h │ │ ├── cl.hpp │ │ ├── cl_ext.h │ │ ├── cl_ext_qcom.h │ │ ├── cl_gl.h │ │ ├── cl_gl_ext.h │ │ ├── cl_perf_monitor_qcom.h │ │ ├── cl_platform.h │ │ └── opencl.h │ └── lib │ └── armeabi-v7a │ ├── Adreno-Android5 │ ├── 
libOpenCL.so │ └── libllvm-qcom.so │ ├── Adreno-Android6 │ ├── libOpenCL.so │ └── libllvm-qcom.so │ ├── libGLES_mali.so │ ├── libOpenCL.so │ └── libllvm-qcom.so ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat └── settings.gradle /.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .gradle 3 | /local.properties 4 | /.idea/workspace.xml 5 | /.idea/libraries 6 | .DS_Store 7 | /build 8 | /captures 9 | .externalNativeBuild 10 | -------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.idea/copyright/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/gradle.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 18 | 19 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 19 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 46 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.idea/runConfigurations.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 11 | 12 
| -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DeepSense 2 | 3 | Download the models below, extract and copy them onto mobile devices and set the link in DeepSense App to load them. 4 | 5 | VGG-F Link: https://drive.google.com/file/d/0B_GMfaURPvQDQk9sU3FHdU1sUzA/view?usp=sharing 6 | 7 | Yolo Tiny Link: https://drive.google.com/file/d/0B_GMfaURPvQDZVVFMnBXQUU3X2s/view?usp=sharing 8 | 9 | ## The app is configured to work on Samsung Galaxy S7 with Mali GPU, if you need to run it on Adreno-based devices 10 | - 1) copy the appropriate shared libraries (libllvm-qcom.so and libOpenCL.so) from distribution/opencl/lib/armeabi-v7a/Adreno-Android5 OR distribution/opencl/lib/armeabi-v7a/Adreno-Android6 into distribution/opencl/lib/armeabi-v7a 11 | - 2) comment out Mali-shared library in app/CMakeLists.txt and uncomment Adreno shared library 12 | 13 | ## To run the app 14 | - 1) Download and extract the model 15 | - 2) Put the whole model's directory onto device's storage 16 | - 3) Change the path in MainActivity.java 17 | - 4) Run :) 18 | 19 | Enjoy DeepSense 20 | 21 | 22 | -------------------------------------------------------------------------------- /app/.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | -------------------------------------------------------------------------------- /app/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Sets the minimum version of CMake required to build the native 2 | # library. You should either keep the default value or only pass a 3 | # value of 3.4.0 or lower. 
4 | 5 | cmake_minimum_required(VERSION 3.4.1) 6 | 7 | # configure import libs 8 | set(distribution_DIR ${CMAKE_SOURCE_DIR}/../distribution) 9 | set(source_DIR ${CMAKE_SOURCE_DIR}/src/main/cpp) 10 | 11 | add_library(lib_opencl SHARED IMPORTED) 12 | set_target_properties( lib_opencl PROPERTIES IMPORTED_LOCATION ${distribution_DIR}/opencl/lib/${ANDROID_ABI}/libGLES_mali.so ) 13 | #set_target_properties( lib_opencl PROPERTIES IMPORTED_LOCATION ${distribution_DIR}/opencl/lib/${ANDROID_ABI}/libllvm-qcom.so ) 14 | #set_target_properties( lib_opencl PROPERTIES IMPORTED_LOCATION ${distribution_DIR}/opencl/lib/${ANDROID_ABI}/libOpenCL.so ) 15 | 16 | # Creates and names a library, sets it as either STATIC 17 | # or SHARED, and provides the relative paths to its source code. 18 | # You can define multiple libraries, and CMake builds it for you. 19 | # Gradle automatically packages shared libraries with your APK. 20 | 21 | # build application's shared lib 22 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xlinker --no-warn-mismatch -O2 -mfpu=vfpv3-d16 -mhard-float -D_NDK_MATH_NO_SOFTFP=1 -march=armv7-a -mfloat-abi=hard") 23 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xlinker --no-warn-mismatch -O2 -mfpu=vfpv3-d16 -mhard-float -D_NDK_MATH_NO_SOFTFP=1 -march=armv7-a -mfloat-abi=hard -std=gnu++11") 24 | set(STLPORT_FORCE_REBUILD "true") 25 | 26 | add_library( # Sets the name of the library. 27 | deepsense 28 | 29 | # Sets the library as a shared library. 30 | SHARED 31 | 32 | # Provides a relative path to your source file(s). 33 | # Associated headers in the same location as their source 34 | # file are automatically included. 
35 | src/main/cpp/deepsense.cpp 36 | ${source_DIR}/deepsense_lib.cpp 37 | ${source_DIR}/clio.cpp 38 | ${source_DIR}/basic_functions.cpp 39 | ${source_DIR}/utilities.cpp 40 | ${source_DIR}/classifier.cpp 41 | ${source_DIR}/layers/conv_layer.cpp 42 | ${source_DIR}/layers/maxpool.cpp 43 | ${source_DIR}/layers/fully_connected.cpp 44 | ${source_DIR}/layers/softmax.cpp 45 | ${source_DIR}/layers/lrn.cpp) 46 | 47 | # Searches for a specified prebuilt library and stores the path as a 48 | # variable. Because system libraries are included in the search path by 49 | # default, you only need to specify the name of the public NDK library 50 | # you want to add. CMake verifies that the library exists before 51 | # completing its build. 52 | 53 | target_include_directories(deepsense PRIVATE ${distribution_DIR}/opencl/include ${distribution_DIR}/clblast/include) 54 | include_directories(${CMAKE_SOURCE_DIR}/src/main/cpp/include) 55 | 56 | find_library( # Sets the name of the path variable. 57 | log-lib 58 | 59 | # Specifies the name of the NDK library that 60 | # you want CMake to locate. 61 | log 62 | m_hard) 63 | 64 | # Specifies libraries CMake should link to your target library. You 65 | # can link multiple libraries, such as libraries you define in the 66 | # build script, prebuilt third-party libraries, or system libraries. 67 | 68 | target_link_libraries( # Specifies the target library. 69 | deepsense 70 | 71 | lib_opencl 72 | 73 | # Links the target library to the log library 74 | # included in the NDK. 
75 | ${log-lib} ) 76 | -------------------------------------------------------------------------------- /app/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'com.android.application' 2 | 3 | android { 4 | compileSdkVersion 22 5 | buildToolsVersion "22.0.0" 6 | defaultConfig { 7 | applicationId "com.lanytek.deepsensev3" 8 | minSdkVersion 19 9 | targetSdkVersion 21 10 | versionCode 1 11 | versionName "1.0" 12 | testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner" 13 | externalNativeBuild { 14 | cmake { 15 | cppFlags "-std=c++11" 16 | } 17 | } 18 | ndk { 19 | // Specifies the ABI configurations of your native 20 | // libraries Gradle should build and package with your APK. 21 | abiFilters 'armeabi-v7a' 22 | } 23 | } 24 | buildTypes { 25 | release { 26 | minifyEnabled false 27 | proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' 28 | } 29 | } 30 | sourceSets { 31 | main { 32 | // let gradle pack the shared library into apk 33 | jniLibs.srcDirs = ['../distribution/opencl/lib'] 34 | } 35 | } 36 | externalNativeBuild { 37 | cmake { 38 | path "CMakeLists.txt" 39 | } 40 | } 41 | } 42 | 43 | dependencies { 44 | compile fileTree(dir: 'libs', include: ['*.jar']) 45 | androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', { 46 | exclude group: 'com.android.support', module: 'support-annotations' 47 | }) 48 | compile 'com.android.support:appcompat-v7:22+' 49 | testCompile 'junit:junit:4.12' 50 | compile 'com.squareup.picasso:picasso:2.5.2' 51 | } 52 | -------------------------------------------------------------------------------- /app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 
2 | # By default, the flags in this file are appended to flags specified 3 | # in /home/JC1DA/Android/Sdk/tools/proguard/proguard-android.txt 4 | # You can edit the include path and order by changing the proguardFiles 5 | # directive in build.gradle. 6 | # 7 | # For more details, see 8 | # http://developer.android.com/guide/developing/tools/proguard.html 9 | 10 | # Add any project specific keep options here: 11 | 12 | # If your project uses WebView with JS, uncomment the following 13 | # and specify the fully qualified class name to the JavaScript interface 14 | # class: 15 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 16 | # public *; 17 | #} 18 | -------------------------------------------------------------------------------- /app/src/androidTest/java/com/lanytek/deepsensev3/ExampleInstrumentedTest.java: -------------------------------------------------------------------------------- 1 | package com.lanytek.deepsensev3; 2 | 3 | import android.content.Context; 4 | import android.support.test.InstrumentationRegistry; 5 | import android.support.test.runner.AndroidJUnit4; 6 | 7 | import org.junit.Test; 8 | import org.junit.runner.RunWith; 9 | 10 | import static org.junit.Assert.*; 11 | 12 | /** 13 | * Instrumentation test, which will execute on an Android device. 14 | * 15 | * @see Testing documentation 16 | */ 17 | @RunWith(AndroidJUnit4.class) 18 | public class ExampleInstrumentedTest { 19 | @Test 20 | public void useAppContext() throws Exception { 21 | // Context of the app under test. 
22 | Context appContext = InstrumentationRegistry.getTargetContext(); 23 | 24 | assertEquals("com.lanytek.deepsensev3", appContext.getPackageName()); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /app/src/main/assets/deepsense.cl: -------------------------------------------------------------------------------- 1 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable 2 | 3 | static inline int getIndexFrom3D(int d1, int d2, int d3, int i1, int i2, int i3) { 4 | return i1 * (d2 * d3) + i2 * d3 + i3; 5 | } 6 | 7 | static inline int getIndexFrom4D(int d1, int d2, int d3, int d4, int i1, int i2, int i3, int i4) { 8 | return i1 * (d2 * d3 * d4) + i2 * (d3 * d4) + i3 * d4 + i4; 9 | } 10 | 11 | kernel void convertFloatToHalf( 12 | global const float *input, 13 | global half *output) { 14 | int idx = get_global_id(0); 15 | vstore_half(input[idx], 0, &output[idx]); 16 | } 17 | 18 | kernel void convertHalfToFloat( 19 | global const half *input, 20 | global float *output) { 21 | int idx = get_global_id(0); 22 | //output[idx] = convert_float(input[idx]); 23 | output[idx] = (float)input[idx]; 24 | } 25 | 26 | __kernel void conv_kernel_half( 27 | __global const half *input, 28 | const int input_w, 29 | const int input_h, 30 | const int input_c, 31 | __global const half *conv_weight, 32 | __global const half *bias, 33 | const int conv_w, 34 | const int conv_h, 35 | const int conv_c, 36 | const int conv_n, 37 | const int stride_w, 38 | const int stride_h, 39 | const int pad_left, 40 | const int pad_right, 41 | const int pad_top, 42 | const int pad_bot, 43 | __global half *output, 44 | const int output_w, 45 | const int output_h, 46 | const int output_c) { 47 | int 
x,y,z,n,i,j; 48 | 49 | int threadId_x = get_global_id(0); 50 | int threadId_y = get_global_id(1); 51 | int threadId_z = get_global_id(2); 52 | 53 | int useBase3 = (input_c % 3 == 0) ? 1 : 0; 54 | 55 | for(n = threadId_z ; n < output_c ; n += get_global_size(2)) { 56 | for(j = threadId_y ; j < output_h ; j += get_global_size(1)) { 57 | for(i = threadId_x ; i < output_w ; i += get_global_size(0)) { 58 | half result = 0.0f; 59 | for(y = 0 ; y < conv_h ; y++) { 60 | int global_input_y = j * stride_h - pad_top + y; 61 | for(x = 0 ; x < conv_w ; x++) { 62 | int global_input_x = i * stride_w - pad_left + x; 63 | if(global_input_x >= 0 && global_input_y >= 0 && global_input_x < input_w && global_input_y < input_h) { 64 | if(useBase3 == 1) { 65 | for(z = 0 ; z < conv_c ; z += 3) { 66 | int global_filter_index = getIndexFrom4D(conv_n, conv_h, conv_w, conv_c, n, y, x, z); 67 | int global_input_index = getIndexFrom3D(input_h, input_w, input_c, global_input_y, global_input_x, z); 68 | 69 | half2 tmp_input = vload2(0, &input[global_input_index]); 70 | half2 tmp_weight = vload2(0, &conv_weight[global_filter_index]); 71 | result += dot(tmp_input, tmp_weight); 72 | 73 | result += input[global_input_index + 2] * conv_weight[global_filter_index + 2]; 74 | } 75 | } else { 76 | for(z = 0 ; z < conv_c ; z += 16) { 77 | int global_filter_index = getIndexFrom4D(conv_n, conv_h, conv_w, conv_c, n, y, x, z); 78 | int global_input_index = getIndexFrom3D(input_h, input_w, input_c, global_input_y, global_input_x, z); 79 | 80 | half16 tmp_input = vload16(0, &input[global_input_index]); 81 | half16 tmp_weight = vload16(0, &conv_weight[global_filter_index]); 82 | 83 | result += dot(tmp_input.s0123, tmp_weight.s0123); 84 | result += dot(tmp_input.s4567, tmp_weight.s4567); 85 | result += dot(tmp_input.s89ab, tmp_weight.s89ab); 86 | result += dot(tmp_input.scdef, tmp_weight.scdef); 87 | } 88 | } 89 | } 90 | } 91 | } 92 | 93 | result += bias[n]; 94 | 95 | output[getIndexFrom3D(output_h, output_w, 
// Direct 2-D convolution on float data.
//
// Addressing used by this kernel:
//   input       : getIndexFrom3D(input_h, input_w, input_c, y, x, c)
//   conv_weight : getIndexFrom4D(conv_n, conv_h, conv_w, conv_c, n, ky, kx, c)
//   output      : getIndexFrom3D(output_h, output_w, output_c, y, x, n)
//   bias        : one value per output channel n
//
// Work-items stride over output x / output y / output channel using global
// dimensions 0 / 1 / 2, so the global size does not have to match the output
// shape. Filter taps that land outside the input are skipped. Only pad_left
// and pad_top take part in the address computation; pad_right and pad_bot are
// accepted but not read.
//
// Channel handling: when input_c is divisible by 3 the channels are consumed
// three at a time (a 2-wide dot product plus one scalar multiply), otherwise
// sixteen at a time. Both vector loops are bounded so they never read past
// conv_c, and whatever is left over is accumulated one channel at a time.
// Previously a channel count that was not a multiple of 3 or 16 made the
// 16-wide loads run past the end of the pixel / filter.
__kernel void conv_kernel_float(
    __global const float *input,
    const int input_w,
    const int input_h,
    const int input_c,
    __global const float *conv_weight,
    __global const float *bias,
    const int conv_w,
    const int conv_h,
    const int conv_c,
    const int conv_n,
    const int stride_w,
    const int stride_h,
    const int pad_left,
    const int pad_right,
    const int pad_top,
    const int pad_bot,
    __global float *output,
    const int output_w,
    const int output_h,
    const int output_c) {
    const int first_x = get_global_id(0);
    const int first_y = get_global_id(1);
    const int first_n = get_global_id(2);

    const int useBase3 = (input_c % 3 == 0) ? 1 : 0;

    for (int n = first_n; n < output_c; n += get_global_size(2)) {
        for (int j = first_y; j < output_h; j += get_global_size(1)) {
            for (int i = first_x; i < output_w; i += get_global_size(0)) {
                float result = 0.0f;

                for (int y = 0; y < conv_h; y++) {
                    const int in_y = j * stride_h - pad_top + y;
                    for (int x = 0; x < conv_w; x++) {
                        const int in_x = i * stride_w - pad_left + x;

                        // Taps outside the input contribute nothing.
                        if (in_x < 0 || in_y < 0 || in_x >= input_w || in_y >= input_h) {
                            continue;
                        }

                        // Offsets of channel 0 for this tap; channels are contiguous from here.
                        const int w_base = getIndexFrom4D(conv_n, conv_h, conv_w, conv_c, n, y, x, 0);
                        const int in_base = getIndexFrom3D(input_h, input_w, input_c, in_y, in_x, 0);

                        int z = 0;
                        if (useBase3 == 1) {
                            for (; z + 3 <= conv_c; z += 3) {
                                float2 tmp_input = vload2(0, &input[in_base + z]);
                                float2 tmp_weight = vload2(0, &conv_weight[w_base + z]);
                                result += dot(tmp_input, tmp_weight);

                                result += input[in_base + z + 2] * conv_weight[w_base + z + 2];
                            }
                        } else {
                            for (; z + 16 <= conv_c; z += 16) {
                                float16 tmp_input = vload16(0, &input[in_base + z]);
                                float16 tmp_weight = vload16(0, &conv_weight[w_base + z]);

                                result += dot(tmp_input.s0123, tmp_weight.s0123);
                                result += dot(tmp_input.s4567, tmp_weight.s4567);
                                result += dot(tmp_input.s89ab, tmp_weight.s89ab);
                                result += dot(tmp_input.scdef, tmp_weight.scdef);
                            }
                        }

                        // Scalar tail: channels the vector loops could not cover.
                        for (; z < conv_c; z++) {
                            result += input[in_base + z] * conv_weight[w_base + z];
                        }
                    }
                }

                result += bias[n];

                output[getIndexFrom3D(output_h, output_w, output_c, j, i, n)] = result;
            }
        }
    }
}
// Convolution specialised for the case where each filter is applied exactly
// once: every output channel is the dot product of the first
// conv_w * conv_h * conv_c input values with that channel's filter, plus bias.
//
// The weights of output channel o start at
// getIndexFrom4D(conv_n, conv_h, conv_w, conv_c, o, 0, 0, 0). Work-items
// stride over output channels along global dimension 0. The input size,
// stride, padding and output_w / output_h arguments are part of the signature
// but are not read by this kernel.
__kernel void conv_fc_kernel_float(
    __global const float *input,
    const int input_w,
    const int input_h,
    const int input_c,
    __global const float *conv_weight,
    __global const float *bias,
    const int conv_w,
    const int conv_h,
    const int conv_c,
    const int conv_n,
    const int stride_w,
    const int stride_h,
    const int pad_left,
    const int pad_right,
    const int pad_top,
    const int pad_bot,
    __global float *output,
    const int output_w,
    const int output_h,
    const int output_c
) {
    for (int out_ch = get_global_id(0); out_ch < output_c; out_ch += get_global_size(0)) {
        const int filter_len = conv_w * conv_h * conv_c;
        __global const float *filter =
            conv_weight + getIndexFrom4D(conv_n, conv_h, conv_w, conv_c, out_ch, 0, 0, 0);

        float acc = 0.0f;
        int pos = 0;

        // 16 values per step while at least 16 remain.
        for (; filter_len - pos >= 16; pos += 16) {
            float16 in16 = vload16(0, &input[pos]);
            float16 w16 = vload16(0, &filter[pos]);

            acc += dot(in16.s0123, w16.s0123);
            acc += dot(in16.s4567, w16.s4567);
            acc += dot(in16.s89ab, w16.s89ab);
            acc += dot(in16.scdef, w16.scdef);
        }

        // 4 values per step while at least 4 remain.
        for (; filter_len - pos >= 4; pos += 4) {
            float4 in4 = vload4(0, &input[pos]);
            float4 w4 = vload4(0, &filter[pos]);

            acc += dot(in4, w4);
        }

        // Whatever is left, one value at a time.
        for (; pos < filter_len; pos++) {
            acc += input[pos] * filter[pos];
        }

        acc += bias[out_ch];

        output[out_ch] = acc;
    }
}
// Fully connected layer on float data.
//
// For every output neuron n:
//   output_frame[n] = layer_bias[n]
//                   + sum over i of input_frame[i] * layer_W[n * len + i]
// where len = input_h * input_w * input_d. Work-items stride over n along
// global dimension 0.
//
// The bulk of the sum is taken four values at a time; the remaining 0..3
// values are handled one by one. The scalar tail now advances through the
// input and the weights. Previously the indices were never incremented there,
// so for lengths not divisible by 4 the same product was added repeatedly and
// the last elements were ignored.
kernel void fully_connected_kernel_float(
    global const float *input_frame,
    const int input_w,
    const int input_h,
    const int input_d,
    global const float *layer_W,
    global const float *layer_bias,
    global float *output_frame,
    const int output_size
) {
    const int thrIdx = get_global_id(0);
    const int maxThreads = get_global_size(0);

    const int input_len = input_h * input_w * input_d;

    for (int n = thrIdx; n < output_size; n += maxThreads) {
        // First weight of neuron n.
        const int filter_base = n * input_h * input_w * input_d;

        float result = 0.0f;
        int idx = 0;

        for (; idx + 4 <= input_len; idx += 4) {
            float4 tmp1 = vload4(0, &input_frame[idx]);
            float4 tmp2 = vload4(0, &layer_W[filter_base + idx]);
            result += dot(tmp1, tmp2);
        }

        for (; idx < input_len; idx++) {
            result += input_frame[idx] * layer_W[filter_base + idx];
        }

        result += layer_bias[n];

        output_frame[n] = result;
    }
}
// Max pooling on float data.
//
// Input is addressed as getIndexFrom3D(input_h, input_w, input_d, y, x, k)
// and output as getIndexFrom3D(output_h, output_w, output_c, y, x, k).
// Work-items stride over output x / output y / channel using global
// dimensions 0 / 1 / 2.
//
// Each output is the largest value in a size x size window whose origin is
// (i * stride_1 - pad_1, j * stride_2 - pad_3). Window cells outside the
// input count as 0.0f, and the running maximum starts at -9999.9f.
// pad_2 and pad_4 are accepted but not read.
__kernel void maxpool_kernel_float(
    __global const float *input_frame,
    const int input_w,
    const int input_h,
    const int input_d,
    const int size,
    const int stride_1,
    const int stride_2,
    const int pad_1,
    const int pad_2,
    const int pad_3,
    const int pad_4,
    __global float *output_frame,
    const int output_w,
    const int output_h,
    const int output_c) {

    const int start_col = get_global_id(0);
    const int start_row = get_global_id(1);
    const int start_ch = get_global_id(2);

    const int step_col = get_global_size(0);
    const int step_row = get_global_size(1);
    const int step_ch = get_global_size(2);

    for (int col = start_col; col < output_w; col += step_col) {
        for (int row = start_row; row < output_h; row += step_row) {
            for (int ch = start_ch; ch < output_c; ch += step_ch) {
                float best = -9999.9f;

                for (int dx = 0; dx < size; dx++) {
                    const int src_x = col * stride_1 + dx - pad_1;
                    for (int dy = 0; dy < size; dy++) {
                        const int src_y = row * stride_2 + dy - pad_3;

                        // Cells outside the input are treated as zero.
                        float candidate = 0.0f;
                        if (src_x >= 0 && src_x < input_w && src_y >= 0 && src_y < input_h) {
                            candidate = input_frame[getIndexFrom3D(input_h, input_w, input_d, src_y, src_x, ch)];
                        }

                        if (candidate > best) {
                            best = candidate;
                        }
                    }
                }

                output_frame[getIndexFrom3D(output_h, output_w, output_c, row, col, ch)] = best;
            }
        }
    }
}
// Cross-channel local response normalisation on float data.
//
// Input and output are addressed as
// getIndexFrom3D(height, width, channels, h, w, c). Work-items stride over w
// (global dimension 0) and h (global dimension 1); each one walks all
// channels of its pixel sequentially.
//
// A running sum of squared channel values is kept. Each output is
//   out[c] = in[c] * pow(k + sum * alpha_over_size, -beta)
// where sum is the running total when out[c] is written. The total is built
// up in four phases: warm-up (add only), ramp-up (add and write), steady
// state (add newest, drop the value `size` channels back, write) and
// drain (drop only, write).
__kernel void cross_channels_lrn_kernel_float(
    __global const float *input, //[h x w x c]
    const int channels,
    const int height,
    const int width,
    const int k,
    const int size,
    const float alpha_over_size,
    const float beta,
    __global float *output) {

    // How far the write position trails the read position.
    const int lead = (size - 1) / 2;
    const int lag = size - lead - 1;

    for (int w = get_global_id(0); w < width; w += get_global_size(0)) {
        for (int h = get_global_id(1); h < height; h += get_global_size(1)) {
            const int pixel = getIndexFrom3D(height, width, channels, h, w, 0);
            const __global float *src = input + pixel;
            __global float *dst = output + pixel;

            float sq_sum = 0;
            int c = 0;

            // Warm-up: accumulate without producing output.
            for (; c < lag; c++) {
                float v = src[c];
                sq_sum += v * v;
            }

            // Ramp-up: accumulate and start writing.
            for (; c < size; c++) {
                float v = src[c];
                sq_sum += v * v;
                float scale = k + sq_sum * alpha_over_size;
                dst[c - lag] = src[c - lag] * pow(scale, -beta);
            }

            // Steady state: add the newest square, then remove the oldest.
            for (; c < channels; c++) {
                float v = src[c];
                sq_sum += v * v;
                v = src[c - size];
                sq_sum -= v * v;
                float scale = k + sq_sum * alpha_over_size;
                dst[c - lag] = src[c - lag] * pow(scale, -beta);
            }

            // Drain: nothing left to add, keep removing and writing.
            for (; c < channels + lag; c++) {
                float v = src[c - size];
                sq_sum -= v * v;
                float scale = k + sq_sum * alpha_over_size;
                dst[c - lag] = src[c - lag] * pow(scale, -beta);
            }
        }
    }
}
/*
 * Applies an element-wise activation in place to a float buffer.
 * Each work-item handles exactly the element at its global id in dimension 0,
 * so the global size must not exceed the number of elements in `data`.
 *
 * activation codes handled here:
 *   0 none, 1 RAMP, 2 LOGISTIC, 3 LEAKY, 4 LINEAR, 5 RELU
 * Any other code leaves the value unchanged.
 *
 * All constants are float literals: an unsuffixed literal such as 0.1 is a
 * double in OpenCL C, which either promotes the whole expression to double
 * or is demoted by the compiler on devices without fp64 support.
 */
__kernel void activation_kernel_float(
    __global float *data,
    const int activation) {
    const size_t idx = get_global_id(0);
    float result = data[idx];

    switch(activation) {
        case 0:
            //no activation
            break;
        case 1:
            //RAMP: x for x > 0 (the comparison yields 1 or 0), plus 0.1 * x
            result = result * (result > 0) + 0.1f * result;
            break;
        case 2:
            //LOGISTIC
            result = 1.0f / (1.0f + exp(-result));
            break;
        case 3:
            //LEAKY
            result = (result > 0) ? result : 0.1f * result;
            break;
        case 4:
            //LINEAR
            break;
        case 5:
            //RELU
            result = (result > 0) ? result : 0.0f;
            break;
    }

    data[idx] = result;
}
x : 0; 57 | } 58 | return frame; 59 | } 60 | 61 | cnn_frame *activate_LEAKY(cnn_frame *frame) { 62 | int i; 63 | for(i = 0 ; i < frame->c * frame->h * frame->w ; i++) { 64 | float x = frame->data[i]; 65 | frame->data[i] =(x > 0) ? x : 0.1 * x; 66 | } 67 | return frame; 68 | } 69 | 70 | cnn_frame *doFeedForward_Activation(cnn_frame *frame, int activation) { 71 | if(activation == NO_ACTIVATION) 72 | return frame; 73 | 74 | if(!frame->useGPU) { 75 | switch(activation) { 76 | case LOGISTIC: 77 | activate_LOGISTIC(frame); 78 | break; 79 | case RAMP: 80 | activate_RAMP(frame); 81 | break; 82 | case LEAKY: 83 | activate_LEAKY(frame); 84 | break; 85 | case RELU: 86 | activate_RELU(frame); 87 | break; 88 | } 89 | } else { 90 | OpenCLObjects *openCLObjects = getOpenClObject(); 91 | cl_int err = CL_SUCCESS; 92 | int i = 0; 93 | 94 | cl_kernel kernel = (frame->useHalf) ? openCLObjects->activation_kernel.kernel : openCLObjects->activation_kernel_float.kernel; 95 | err = clSetKernelArg(kernel, i++, sizeof(cl_mem), &frame->cl_data); 96 | err |= clSetKernelArg(kernel, i++, sizeof(int), &activation); 97 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 98 | 99 | size_t globalSize[1] = {(size_t)(frame->w * frame->h * frame->c)}; 100 | 101 | err = clEnqueueNDRangeKernel( 102 | openCLObjects->queue, 103 | kernel, 104 | 1, 105 | 0, 106 | globalSize, 107 | 0, 108 | 0, 0, 0 109 | ); 110 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 111 | 112 | err |= clFinish(openCLObjects->queue); 113 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 114 | } 115 | 116 | return frame; 117 | } 118 | 119 | cnn_frame *frame_init(int w, int h, int c) { 120 | cnn_frame *frame = (cnn_frame *)calloc(1, sizeof(cnn_frame)); 121 | frame->w = w; 122 | frame->h = h; 123 | frame->c = c; 124 | frame->data = (float *)calloc(w * h * c, sizeof(float)); 125 | frame->useGPU = 0; 126 | frame->useHalf = 0; 127 | return frame; 128 | } 129 | 130 | cnn_frame *frame_init_gpu(int w, int h, int c) { 131 | cnn_frame *frame = (cnn_frame 
*)calloc(1, sizeof(cnn_frame)); 132 | frame->w = w; 133 | frame->h = h; 134 | frame->c = c; 135 | frame->useGPU = 1; 136 | frame->useHalf = 0; 137 | 138 | cl_int err; 139 | OpenCLObjects *openCLObjects = getOpenClObject(); 140 | 141 | frame->cl_data = clCreateBuffer( 142 | openCLObjects->context, 143 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 144 | frame->w * frame->h * frame->c * sizeof(float), //size in bytes 145 | NULL,//buffer of data 146 | &err); 147 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 148 | 149 | if(err == CL_SUCCESS) 150 | return frame; 151 | else { 152 | free(frame); 153 | return NULL; 154 | } 155 | } 156 | 157 | cnn_frame *frame_init_gpu_half(int w, int h, int c) { 158 | cnn_frame *frame = (cnn_frame *)calloc(1, sizeof(cnn_frame)); 159 | frame->w = w; 160 | frame->h = h; 161 | frame->c = c; 162 | frame->useGPU = 1; 163 | frame->useHalf = 1; 164 | 165 | cl_int err; 166 | OpenCLObjects *openCLObjects = getOpenClObject(); 167 | 168 | frame->cl_data = clCreateBuffer( 169 | openCLObjects->context, 170 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 171 | frame->w * frame->h * frame->c * sizeof(cl_half), //size in bytes 172 | NULL,//buffer of data 173 | &err); 174 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 175 | 176 | if(err == CL_SUCCESS) 177 | return frame; 178 | else { 179 | free(frame); 180 | return NULL; 181 | } 182 | } 183 | 184 | cnn_frame * frame_clone(cnn_frame *src) { 185 | if(!src->useGPU) { 186 | cnn_frame *frame = frame_init(src->w, src->h, src->c); 187 | memcpy(frame->data, src->data, frame->w * frame->h * frame->c * sizeof(float)); 188 | return frame; 189 | } else { 190 | cl_int err = CL_SUCCESS; 191 | cnn_frame *frame = NULL; 192 | if(src->useHalf == 0) 193 | frame = frame_init_gpu(src->w, src->h, src->c); 194 | else 195 | frame = frame_init_gpu_half(src->w, src->h, src->c); 196 | 197 | if(frame == NULL) 198 | return NULL; 199 | 200 | int mapped_size = src->w * src->h * src->c * sizeof(float); 201 | if(src->useHalf == 1) 202 | 
mapped_size = src->w * src->h * src->c * sizeof(cl_half); 203 | 204 | OpenCLObjects *openCLObjects = getOpenClObject(); 205 | float *buf_src = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 206 | src->cl_data, \ 207 | CL_TRUE, CL_MAP_READ, \ 208 | 0, \ 209 | mapped_size, \ 210 | 0, NULL, NULL, &err); 211 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 212 | 213 | float *buf_dst = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 214 | frame->cl_data, \ 215 | CL_TRUE, CL_MAP_WRITE, \ 216 | 0, \ 217 | mapped_size, \ 218 | 0, NULL, NULL, &err); 219 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 220 | 221 | memcpy((void*)buf_dst, (void*)buf_src, mapped_size); 222 | 223 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 224 | src->cl_data, \ 225 | buf_src, \ 226 | 0, NULL, NULL); 227 | 228 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 229 | frame->cl_data, \ 230 | buf_dst, \ 231 | 0, NULL, NULL); 232 | return frame; 233 | } 234 | } 235 | 236 | cnn_frame* frame_convert_to_gpu_float(cnn_frame *frame) { 237 | if(frame->useGPU && !frame->useHalf) 238 | return frame; 239 | 240 | OpenCLObjects *openCLObjects = getOpenClObject(); 241 | cnn_frame *output = frame_init_gpu(frame->w, frame->h, frame->c); 242 | int err = CL_SUCCESS; 243 | 244 | if(!frame->useGPU) { 245 | //CPU-mode 246 | float *buf_dest = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 247 | output->cl_data, \ 248 | CL_TRUE, CL_MAP_WRITE, \ 249 | 0, \ 250 | output->w * output->h * output->c * sizeof(cl_float), \ 251 | 0, NULL, NULL, &err); 252 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 253 | 254 | memcpy((void *)buf_dest, frame->data, output->w * output->h * output->c * sizeof(cl_float)); 255 | 256 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 257 | output->cl_data, \ 258 | buf_dest, \ 259 | 0, NULL, NULL); 260 | } else { 261 | //GPU-half-mode 262 | cl_kernel kernel = openCLObjects->convert_half_to_float_kernel.kernel; 263 | err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &frame->cl_data); 264 | err 
|= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->cl_data); 265 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 266 | 267 | size_t convertSize[1] = {(size_t) output->w * output->h * output->c}; 268 | err = clEnqueueNDRangeKernel( 269 | openCLObjects->queue, 270 | kernel, 271 | 1, 272 | 0, 273 | convertSize, 274 | 0, 275 | 0, 0, 0 276 | ); 277 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 278 | 279 | err = clFinish(openCLObjects->queue); 280 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 281 | } 282 | 283 | frame_free(frame); 284 | 285 | //test 286 | { 287 | float *buf_dest = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 288 | output->cl_data, \ 289 | CL_TRUE, CL_MAP_READ, \ 290 | 0, \ 291 | output->w * output->h * output->c * sizeof(cl_float), \ 292 | 0, NULL, NULL, &err); 293 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 294 | 295 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 296 | output->cl_data, \ 297 | buf_dest, \ 298 | 0, NULL, NULL); 299 | } 300 | 301 | return output; 302 | } 303 | 304 | cnn_frame* frame_convert_to_gpu_half(cnn_frame *frame) { 305 | if(frame->useGPU && frame->useHalf) 306 | return frame; 307 | 308 | cnn_frame *output = frame_init_gpu_half(frame->w, frame->h, frame->c); 309 | OpenCLObjects *openCLObjects = getOpenClObject(); 310 | int err = CL_SUCCESS; 311 | 312 | cl_mem cl_data = NULL; 313 | 314 | if(!frame->useGPU) { 315 | //cpu-mode 316 | cl_data = clCreateBuffer( 317 | openCLObjects->context, 318 | CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, 319 | frame->w * frame->h * frame->c * sizeof(float), //size in bytes 320 | frame->data,//buffer of data 321 | &err); 322 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 323 | } else { 324 | //gpu-float-mode 325 | cl_data = frame->cl_data; 326 | } 327 | 328 | cl_kernel kernel = openCLObjects->convert_float_to_half_kernel.kernel; 329 | err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_data); 330 | err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output->cl_data); 331 | 
SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 332 | 333 | size_t convertSize[1] = {(size_t) output->w * output->h * output->c}; 334 | err = clEnqueueNDRangeKernel( 335 | openCLObjects->queue, 336 | kernel, 337 | 1, 338 | 0, 339 | convertSize, 340 | 0, 341 | 0, 0, 0 342 | ); 343 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 344 | 345 | err = clFinish(openCLObjects->queue); 346 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 347 | 348 | if(!frame->useGPU) 349 | clReleaseMemObject(cl_data); 350 | 351 | frame_free(frame); 352 | 353 | //test 354 | { 355 | float *buf_dest = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 356 | output->cl_data, \ 357 | CL_TRUE, CL_MAP_READ, \ 358 | 0, \ 359 | output->w * output->h * output->c * sizeof(cl_half), \ 360 | 0, NULL, NULL, &err); 361 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 362 | 363 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 364 | output->cl_data, \ 365 | buf_dest, \ 366 | 0, NULL, NULL); 367 | } 368 | 369 | return output; 370 | } 371 | 372 | cnn_frame * frame_convert_to_cpu(cnn_frame *frame) { 373 | if(!frame->useGPU) 374 | return frame; 375 | 376 | cnn_frame *output = frame_init(frame->w, frame->h, frame->c); 377 | OpenCLObjects *openCLObjects = getOpenClObject(); 378 | int err = CL_SUCCESS; 379 | 380 | //convert half to float first 381 | if(frame->useHalf) { 382 | 383 | cnn_frame *tmp = frame_init_gpu(frame->w, frame->h, frame->c); 384 | 385 | cl_kernel kernel = openCLObjects->convert_half_to_float_kernel.kernel; 386 | err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &frame->cl_data); 387 | err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &tmp->cl_data); 388 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 389 | 390 | size_t convertSize[1] = {(size_t) tmp->w * tmp->h * tmp->c}; 391 | err = clEnqueueNDRangeKernel( 392 | openCLObjects->queue, 393 | kernel, 394 | 1, 395 | 0, 396 | convertSize, 397 | 0, 398 | 0, 0, 0 399 | ); 400 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 401 | 402 | err = clFinish(openCLObjects->queue); 
403 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 404 | 405 | frame_free(frame); 406 | frame = tmp; 407 | } 408 | 409 | //map gpu-mem to cpu-mem and copy data 410 | float *buf_src = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 411 | frame->cl_data, \ 412 | CL_TRUE, CL_MAP_READ, \ 413 | 0, \ 414 | frame->w * frame->h * frame->c * sizeof(cl_float), \ 415 | 0, NULL, NULL, &err); 416 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 417 | 418 | memcpy((void*)output->data, (void*)buf_src, frame->w * frame->h * frame->c * sizeof(cl_float)); 419 | 420 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 421 | frame->cl_data, \ 422 | buf_src, \ 423 | 0, NULL, NULL); 424 | 425 | frame_free(frame); 426 | 427 | return output; 428 | } 429 | 430 | void frame_free(cnn_frame *frame) { 431 | if(frame->useGPU == 0) 432 | free(frame->data); 433 | else { 434 | clReleaseMemObject(frame->cl_data); 435 | } 436 | free(frame); 437 | } -------------------------------------------------------------------------------- /app/src/main/cpp/classifier.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | float * cnn_doClassification(cnn_frame *frame, cnn *model) { 10 | cnn_frame *result = frame; 11 | 12 | double totalTime = 0; 13 | double t0,t1; 14 | double global_t0 = get_timestamp(); 15 | 16 | OpenCLObjects *openCLObjects = getOpenClObject(); 17 | cl_int err; 18 | cl_event event; 19 | 20 | for(int i = 0 ; i < model->nLayers ; i++) { 21 | cnn_layer *layer = &model->layers[i]; 22 | 23 | t0 = get_timestamp(); 24 | 25 | result = layer->doFeedForward(result, layer); 26 | 27 | if(result->useGPU) { 28 | int size = result->w * result->h * result->c * (layer->useHalf ? 
sizeof(cl_half) : sizeof(cl_float)); 29 | float *buf_dest = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 30 | result->cl_data, \ 31 | CL_TRUE, CL_MAP_READ, \ 32 | 0, \ 33 | size, \ 34 | 0, NULL, NULL, &err); 35 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 36 | 37 | if(!result->useHalf) 38 | LOGD("1st data: %f", buf_dest[0]); 39 | 40 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 41 | result->cl_data, \ 42 | buf_dest, \ 43 | 0, NULL, NULL); 44 | } 45 | 46 | t1 = get_timestamp(); 47 | double milsecs = (t1 - t0) / 1000.0L; 48 | 49 | LOGD("Processed layer %d in %f ms\n", (i + 1), milsecs); 50 | } 51 | 52 | if(result != NULL && result->useGPU) { 53 | 54 | result = frame_convert_to_gpu_float(result); 55 | 56 | result->data = (float *)malloc(result->w * result->h * result->c * sizeof(float)); 57 | 58 | err = clEnqueueReadBuffer (openCLObjects->queue, 59 | result->cl_data, 60 | true, 61 | 0, 62 | result->w * result->h * result->c * sizeof(float), 63 | result->data, 64 | 0, 65 | 0, 66 | 0); 67 | 68 | err |= clFinish(openCLObjects->queue); 69 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 70 | 71 | err = clReleaseMemObject(result->cl_data); 72 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 73 | 74 | result->useGPU = 0; 75 | } 76 | 77 | timestamp_t global_t1 = get_timestamp(); 78 | 79 | totalTime = (global_t1 - global_t0) / 1000.0L; 80 | 81 | LOGD("CNN finished in %f ms\n", totalTime); 82 | 83 | float *output = (result == NULL) ? NULL : result->data; 84 | 85 | if(result != NULL) 86 | frame_free(result); 87 | 88 | return output; 89 | } 90 | -------------------------------------------------------------------------------- /app/src/main/cpp/clio.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | const char* opencl_error_to_str (cl_int error) { 4 | #define CASE_CL_CONSTANT(NAME) case NAME: return #NAME; 5 | 6 | // Suppose that no combinations are possible. 
7 | switch(error) { 8 | CASE_CL_CONSTANT(CL_SUCCESS) 9 | CASE_CL_CONSTANT(CL_DEVICE_NOT_FOUND) 10 | CASE_CL_CONSTANT(CL_DEVICE_NOT_AVAILABLE) 11 | CASE_CL_CONSTANT(CL_COMPILER_NOT_AVAILABLE) 12 | CASE_CL_CONSTANT(CL_MEM_OBJECT_ALLOCATION_FAILURE) 13 | CASE_CL_CONSTANT(CL_OUT_OF_RESOURCES) 14 | CASE_CL_CONSTANT(CL_OUT_OF_HOST_MEMORY) 15 | CASE_CL_CONSTANT(CL_PROFILING_INFO_NOT_AVAILABLE) 16 | CASE_CL_CONSTANT(CL_MEM_COPY_OVERLAP) 17 | CASE_CL_CONSTANT(CL_IMAGE_FORMAT_MISMATCH) 18 | CASE_CL_CONSTANT(CL_IMAGE_FORMAT_NOT_SUPPORTED) 19 | CASE_CL_CONSTANT(CL_BUILD_PROGRAM_FAILURE) 20 | CASE_CL_CONSTANT(CL_MAP_FAILURE) 21 | CASE_CL_CONSTANT(CL_MISALIGNED_SUB_BUFFER_OFFSET) 22 | CASE_CL_CONSTANT(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST) 23 | CASE_CL_CONSTANT(CL_INVALID_VALUE) 24 | CASE_CL_CONSTANT(CL_INVALID_DEVICE_TYPE) 25 | CASE_CL_CONSTANT(CL_INVALID_PLATFORM) 26 | CASE_CL_CONSTANT(CL_INVALID_DEVICE) 27 | CASE_CL_CONSTANT(CL_INVALID_CONTEXT) 28 | CASE_CL_CONSTANT(CL_INVALID_QUEUE_PROPERTIES) 29 | CASE_CL_CONSTANT(CL_INVALID_COMMAND_QUEUE) 30 | CASE_CL_CONSTANT(CL_INVALID_HOST_PTR) 31 | CASE_CL_CONSTANT(CL_INVALID_MEM_OBJECT) 32 | CASE_CL_CONSTANT(CL_INVALID_IMAGE_FORMAT_DESCRIPTOR) 33 | CASE_CL_CONSTANT(CL_INVALID_IMAGE_SIZE) 34 | CASE_CL_CONSTANT(CL_INVALID_SAMPLER) 35 | CASE_CL_CONSTANT(CL_INVALID_BINARY) 36 | CASE_CL_CONSTANT(CL_INVALID_BUILD_OPTIONS) 37 | CASE_CL_CONSTANT(CL_INVALID_PROGRAM) 38 | CASE_CL_CONSTANT(CL_INVALID_PROGRAM_EXECUTABLE) 39 | CASE_CL_CONSTANT(CL_INVALID_KERNEL_NAME) 40 | CASE_CL_CONSTANT(CL_INVALID_KERNEL_DEFINITION) 41 | CASE_CL_CONSTANT(CL_INVALID_KERNEL) 42 | CASE_CL_CONSTANT(CL_INVALID_ARG_INDEX) 43 | CASE_CL_CONSTANT(CL_INVALID_ARG_VALUE) 44 | CASE_CL_CONSTANT(CL_INVALID_ARG_SIZE) 45 | CASE_CL_CONSTANT(CL_INVALID_KERNEL_ARGS) 46 | CASE_CL_CONSTANT(CL_INVALID_WORK_DIMENSION) 47 | CASE_CL_CONSTANT(CL_INVALID_WORK_GROUP_SIZE) 48 | CASE_CL_CONSTANT(CL_INVALID_WORK_ITEM_SIZE) 49 | CASE_CL_CONSTANT(CL_INVALID_GLOBAL_OFFSET) 50 | 
CASE_CL_CONSTANT(CL_INVALID_EVENT_WAIT_LIST) 51 | CASE_CL_CONSTANT(CL_INVALID_EVENT) 52 | CASE_CL_CONSTANT(CL_INVALID_OPERATION) 53 | CASE_CL_CONSTANT(CL_INVALID_GL_OBJECT) 54 | CASE_CL_CONSTANT(CL_INVALID_BUFFER_SIZE) 55 | CASE_CL_CONSTANT(CL_INVALID_MIP_LEVEL) 56 | CASE_CL_CONSTANT(CL_INVALID_GLOBAL_WORK_SIZE) 57 | CASE_CL_CONSTANT(CL_INVALID_PROPERTY) 58 | 59 | default: 60 | return "UNKNOWN ERROR CODE"; 61 | } 62 | 63 | #undef CASE_CL_CONSTANT 64 | } -------------------------------------------------------------------------------- /app/src/main/cpp/deepsense.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | cnn *model = NULL; 11 | OpenCLObjects openCLObjects; 12 | 13 | OpenCLObjects *getOpenClObject() { 14 | return &openCLObjects; 15 | } 16 | 17 | cnn *getModel() { 18 | return model; 19 | } 20 | 21 | extern "C" void Java_com_lanytek_deepsensev3_MainActivity_InitGPU( 22 | JNIEnv* env, 23 | jobject thiz, 24 | jstring model_dir_path, 25 | jstring packageName 26 | ) { 27 | //init GPU first 28 | const char *packageNameStr = env->GetStringUTFChars(packageName, 0); 29 | init_OpenCL(CL_DEVICE_TYPE_GPU, openCLObjects, packageNameStr); 30 | env->ReleaseStringUTFChars(packageName, packageNameStr); 31 | 32 | //init model 33 | const char *modelPath = env->GetStringUTFChars(model_dir_path, 0); 34 | if(model != NULL) { 35 | cnn_free(model); 36 | } 37 | 38 | model = cnn_loadModel(modelPath, 1); 39 | 40 | env->ReleaseStringUTFChars(model_dir_path, modelPath); 41 | } 42 | 43 | extern "C" jfloatArray Java_com_lanytek_deepsensev3_MainActivity_GetInferrence( 44 | JNIEnv* env, 45 | jobject thisObject, 46 | jfloatArray input 47 | ) { 48 | if(model == NULL) 49 | return NULL; 50 | 51 | cnn_frame *frame = frame_init(model->input_w, model->input_h, model->input_c); 52 | jfloat* data = env->GetFloatArrayElements(input, 0); 53 | 
memcpy(frame->data, data, model->input_w * model->input_h * model->input_c * sizeof(float)); 54 | env->ReleaseFloatArrayElements(input, data, 0); 55 | 56 | float *result = cnn_doClassification(frame, model); 57 | 58 | if(result != NULL) { 59 | int outputSize = model->layers[model->nLayers - 1].output_w * model->layers[model->nLayers - 1].output_h * model->layers[model->nLayers - 1].output_c; 60 | jfloatArray resultArr = env->NewFloatArray(outputSize); 61 | env->SetFloatArrayRegion(resultArr, 0, outputSize, result); 62 | //may lead to memory leak 63 | return resultArr; 64 | } else 65 | return NULL; 66 | } 67 | 68 | -------------------------------------------------------------------------------- /app/src/main/cpp/deepsense_lib.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by JC1DA on 6/3/16. 3 | // 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | static inline int CMP_OPTION(char *str, const char *option) { 20 | int ret = strncmp(str, option, strlen(option)) == 0 ? 
1 : 0; 21 | return ret; 22 | } 23 | 24 | static inline int PARSE_ACTIVATION(char *line) { 25 | char buf[32]; 26 | sscanf(line,"ACTIVATION: %s\n",buf); 27 | if(CMP_OPTION(buf, "RAMP")) 28 | return RAMP; 29 | else if(CMP_OPTION(buf, "LOGISTIC")) 30 | return LOGISTIC; 31 | else if(CMP_OPTION(buf, "LEAKY")) 32 | return LEAKY; 33 | else if(CMP_OPTION(buf, "RELU")) 34 | return RELU; 35 | return NO_ACTIVATION; 36 | } 37 | 38 | cnn *cnn_loadModel(const char *modelDirPath, int useGPU) { 39 | cnn *model = (cnn *)calloc(1, sizeof(cnn)); 40 | model->useGPU = useGPU; 41 | 42 | { 43 | /* read number of layers */ 44 | char fileNameBuf[256]; 45 | char line[256]; 46 | sprintf(fileNameBuf,"%s/model",modelDirPath); 47 | 48 | FILE *fp = fopen(fileNameBuf,"r"); 49 | //PLEASE FILL IN NEW FORMAT 50 | while(fgets(line, sizeof(line), fp)) { 51 | if(CMP_OPTION(line, "NUMLAYERS")) 52 | sscanf(line, "NUMLAYERS: %d\n", &model->nLayers); 53 | else if(CMP_OPTION(line, "W")) 54 | sscanf(line, "W: %d\n", &model->input_w); 55 | else if(CMP_OPTION(line, "H")) 56 | sscanf(line, "H: %d\n", &model->input_h); 57 | else if(CMP_OPTION(line, "C")) 58 | sscanf(line, "C: %d\n", &model->input_c); 59 | } 60 | fclose(fp); 61 | } 62 | 63 | model->layers = (cnn_layer *)calloc(model->nLayers, sizeof(cnn_layer)); 64 | cnn_layer *layers = model->layers; 65 | 66 | for(int i = 1 ; i <= model->nLayers ; i++) { 67 | char fileNameBuf[256]; 68 | char line[256]; 69 | 70 | sprintf(fileNameBuf,"%s/l_%d",modelDirPath,i); 71 | 72 | cnn_layer *layer = &layers[i - 1]; 73 | layer->index = i - 1; 74 | layer->useGPU = model->useGPU; 75 | layer->type = LAYER_TYPE_UNKNOWN; 76 | layer->activation = NO_ACTIVATION; 77 | 78 | LOGD("Loading layer %d\n", i); 79 | 80 | FILE *layerfp = fopen(fileNameBuf,"r"); 81 | while (fgets(line, sizeof(line), layerfp)) { 82 | if(layer->type == LAYER_TYPE_UNKNOWN) { 83 | if(CMP_OPTION(line, "CONV")) { 84 | layer->type = LAYER_TYPE_CONV; 85 | layer->conv_layer = (cnn_layer_conv *) calloc(1, 
sizeof(cnn_layer_conv)); 86 | if (!layer->useGPU) 87 | layer->doFeedForward = doFeedForward_CONV; 88 | else { 89 | layer->doFeedForward = doFeedForward_CONV_GPU; 90 | } 91 | layer->conv_layer->group = 1; 92 | } else if(CMP_OPTION(line, "FULLY_CONNECTED")) { 93 | layer->type = LAYER_TYPE_FULLY_CONNECTED; 94 | layer->connected_layer = (cnn_layer_fully_connected *) calloc(1, 95 | sizeof(cnn_layer_fully_connected)); 96 | layer->connected_layer->need_reshape = 0; 97 | if (!layer->useGPU) 98 | layer->doFeedForward = doFeedForward_FULLY_CONNECTED; 99 | else 100 | layer->doFeedForward = doFeedForward_FULLY_CONNECTED_GPU; 101 | } else if(CMP_OPTION(line, "MAXPOOL")) { 102 | layer->type = LAYER_TYPE_MAXPOOL; 103 | layer->maxpool_layer = (cnn_layer_maxpool *) calloc(1, 104 | sizeof(cnn_layer_maxpool)); 105 | if (!layer->useGPU) 106 | layer->doFeedForward = doFeedForward_MAXPOOL; 107 | else { 108 | layer->doFeedForward = doFeedForward_MAXPOOL_GPU; 109 | } 110 | } else if(CMP_OPTION(line, "SOFTMAX")) { 111 | layer->type = LAYER_TYPE_SOFTMAX; 112 | layer->doFeedForward = doFeedForward_SOFTMAX; 113 | } else if(CMP_OPTION(line, "LRN_NORM")) { 114 | layer->type = LAYER_TYPE_LRN_NORMALIZE; 115 | layer->lrn_layer = (cnn_layer_lrn *) malloc(sizeof(cnn_layer_lrn)); 116 | layer->lrn_layer->k = 1; 117 | if (!layer->useGPU) 118 | layer->doFeedForward = doFeedForward_LRN; 119 | else 120 | layer->doFeedForward = doFeedForward_LRN_GPU; 121 | } 122 | } else { 123 | 124 | if(CMP_OPTION(line, "USE_HALF")) { 125 | sscanf(line,"USE_HALF: %d", &layer->useHalf); 126 | if(layer->useHalf != 0) 127 | layer->useHalf = 1; 128 | } 129 | 130 | 131 | switch(layer->type) { 132 | case LAYER_TYPE_CONV: 133 | if(CMP_OPTION(line, "STRIDE")) { 134 | sscanf(line, "STRIDE: %d %d\n", \ 135 | &layer->conv_layer->stride[0], \ 136 | &layer->conv_layer->stride[1]); 137 | } else if(CMP_OPTION(line, "PAD")) { 138 | sscanf(line,"PAD: %d %d %d %d\n", \ 139 | &layer->conv_layer->pad[0], \ 140 | &layer->conv_layer->pad[1], \ 
141 | &layer->conv_layer->pad[2], \ 142 | &layer->conv_layer->pad[3]); 143 | } else if(CMP_OPTION(line, "WIDTH")) { 144 | sscanf(line,"WIDTH: %d\n",&layer->conv_layer->w); 145 | } else if(CMP_OPTION(line, "HEIGHT")) { 146 | sscanf(line,"HEIGHT: %d\n",&layer->conv_layer->h); 147 | } else if(CMP_OPTION(line, "IN_CHANNELS")) { 148 | sscanf(line,"IN_CHANNELS: %d\n",&layer->conv_layer->c); 149 | } else if(CMP_OPTION(line, "OUT_CHANNELS")) { 150 | sscanf(line,"OUT_CHANNELS: %d\n",&layer->conv_layer->n); 151 | } else if(CMP_OPTION(line, "ACTIVATION")) { 152 | layer->activation = PARSE_ACTIVATION(line); 153 | } else if(CMP_OPTION(line, "GROUP")) { 154 | sscanf(line,"GROUP: %d\n",&layer->conv_layer->group); 155 | } 156 | break; 157 | case LAYER_TYPE_FULLY_CONNECTED: 158 | if(CMP_OPTION(line, "INPUTSIZE")) { 159 | sscanf(line, "INPUTSIZE: %d\n", &layer->connected_layer->inputSize); 160 | } else if(CMP_OPTION(line, "OUTPUTSIZE")) { 161 | sscanf(line,"OUTPUTSIZE: %d\n", &layer->connected_layer->outputSize); 162 | } else if(CMP_OPTION(line, "ACTIVATION")) { 163 | layer->activation = PARSE_ACTIVATION(line); 164 | } else if(CMP_OPTION(line, "RESHAPE")) { 165 | sscanf(line,"RESHAPE: %d\n",&layer->connected_layer->need_reshape); 166 | } 167 | break; 168 | case LAYER_TYPE_MAXPOOL: 169 | if(CMP_OPTION(line, "SIZE")) { 170 | sscanf(line,"SIZE: %d\n", &layer->maxpool_layer->size); 171 | } else if(CMP_OPTION(line, "STRIDE")) { 172 | sscanf(line,"STRIDE: %d %d\n", &layer->maxpool_layer->stride[0], &layer->maxpool_layer->stride[1]); 173 | } else if(CMP_OPTION(line, "PAD")) { 174 | sscanf(line,"PAD: %d %d %d %d\n", &layer->maxpool_layer->pad[0], &layer->maxpool_layer->pad[1], \ 175 | &layer->maxpool_layer->pad[2], &layer->maxpool_layer->pad[3]); 176 | } 177 | break; 178 | case LAYER_TYPE_LRN_NORMALIZE: 179 | if(CMP_OPTION(line, "SIZE")) { 180 | sscanf(line,"SIZE: %d\n", &layer->lrn_layer->size); 181 | } else if(CMP_OPTION(line, "ALPHA")) { 182 | sscanf(line,"ALPHA: %f\n", 
&layer->lrn_layer->alpha); 183 | } else if(CMP_OPTION(line, "BETA")) { 184 | sscanf(line,"BETA: %f\n", &layer->lrn_layer->beta); 185 | } 186 | break; 187 | case LAYER_TYPE_SOFTMAX: 188 | break; 189 | case LAYER_TYPE_UNKNOWN: 190 | break; 191 | } 192 | } 193 | } 194 | fclose(layerfp); 195 | 196 | if(layer->type == LAYER_TYPE_CONV) { 197 | //determine output size 198 | if(layer->index == 0) { 199 | layer->output_w = (model->input_w + \ 200 | layer->conv_layer->pad[0] + \ 201 | layer->conv_layer->pad[1] - \ 202 | layer->conv_layer->w) / \ 203 | layer->conv_layer->stride[0] + 1; 204 | layer->output_h = (model->input_h + \ 205 | layer->conv_layer->pad[2] + \ 206 | layer->conv_layer->pad[3] - \ 207 | layer->conv_layer->h) / \ 208 | layer->conv_layer->stride[1] + 1; 209 | layer->output_c = layer->conv_layer->n; 210 | } else { 211 | layer->output_w = (layers[layer->index - 1].output_w + \ 212 | layer->conv_layer->pad[0] + \ 213 | layer->conv_layer->pad[1] - \ 214 | layer->conv_layer->w) / \ 215 | layer->conv_layer->stride[0] + 1; 216 | layer->output_h = (layers[layer->index - 1].output_h + \ 217 | layer->conv_layer->pad[2] + \ 218 | layer->conv_layer->pad[3] - \ 219 | layer->conv_layer->h) / \ 220 | layer->conv_layer->stride[1] + 1; 221 | layer->output_c = layer->conv_layer->n; 222 | } 223 | 224 | //switch to another kernel if this conv layer is equivalent to fully connected layer 225 | if(layer->output_h == 1 && layer->output_w == 1 && model->useGPU) { 226 | layer->doFeedForward = doFeedForward_CONV_FC_GPU; 227 | } 228 | 229 | //LOAD BIAS & WEIGHTS DATA 230 | char biasFilePath[256]; 231 | strcpy(biasFilePath, fileNameBuf); 232 | strcat(biasFilePath, "_bias"); 233 | FILE *biasfp = fopen(biasFilePath, "r"); 234 | if(!layer->useGPU) { 235 | layer->conv_layer->bias = (float *)calloc(layer->conv_layer->n, sizeof(float)); 236 | fread(layer->conv_layer->bias, sizeof(float), layer->conv_layer->n, biasfp); 237 | } else { 238 | cl_int err; 239 | OpenCLObjects *openCLObjects = 
getOpenClObject(); 240 | 241 | layer->conv_layer->cl_bias = clCreateBuffer( 242 | openCLObjects->context, 243 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 244 | layer->conv_layer->n * sizeof(float), //size in bytes 245 | NULL,//buffer of data 246 | &err); 247 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 248 | 249 | float *mappedBuffer = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 250 | layer->conv_layer->cl_bias, \ 251 | CL_TRUE, CL_MAP_WRITE, \ 252 | 0, \ 253 | layer->conv_layer->n * sizeof(float), \ 254 | 0, NULL, NULL, NULL); 255 | 256 | fread(mappedBuffer, sizeof(float), layer->conv_layer->n, biasfp); 257 | 258 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 259 | layer->conv_layer->cl_bias, \ 260 | mappedBuffer, \ 261 | 0, NULL, NULL); 262 | 263 | if(layer->useHalf) { 264 | cl_mem cl_bias_half = clCreateBuffer( 265 | openCLObjects->context, 266 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 267 | layer->conv_layer->n * sizeof(cl_half), //size in bytes 268 | NULL,//buffer of data 269 | &err); 270 | 271 | err = clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 0, sizeof(cl_mem), &layer->conv_layer->cl_bias); 272 | err |= clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 1, sizeof(cl_mem), &cl_bias_half); 273 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 274 | 275 | size_t convertSize[1] = {(size_t) layer->conv_layer->n}; 276 | err = clEnqueueNDRangeKernel( 277 | openCLObjects->queue, 278 | openCLObjects->convert_float_to_half_kernel.kernel, 279 | 1, 280 | 0, 281 | convertSize, 282 | 0, 283 | 0, 0, 0 284 | ); 285 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 286 | err = clFinish(openCLObjects->queue); 287 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 288 | 289 | clReleaseMemObject(layer->conv_layer->cl_bias); 290 | 291 | layer->conv_layer->cl_bias = cl_bias_half; 292 | } 293 | } 294 | fclose(biasfp); 295 | 296 | char wFilePath[256]; 297 | strcpy(wFilePath, fileNameBuf); 298 | strcat(wFilePath, "_weight"); 299 | FILE *wfp = 
fopen(wFilePath, "r"); 300 | if(!layer->useGPU) { 301 | layer->conv_layer->W = (float *) calloc(\ 302 | layer->conv_layer->w * \ 303 | layer->conv_layer->h * \ 304 | layer->conv_layer->c * \ 305 | layer->conv_layer->n / layer->conv_layer->group, sizeof(float)); 306 | 307 | /* 308 | * Our old model format is [n x c x h x w] 309 | * We change memory layout from [n x c x h x w] into [n x h x w x c] 310 | */ 311 | 312 | float *buffer = (float *)malloc(layer->conv_layer->h * layer->conv_layer->w * sizeof(float)); 313 | for(int k = 0 ; k < layer->conv_layer->n / layer->conv_layer->group ; k++) { 314 | for(int c = 0 ; c < layer->conv_layer->c ; c++) { 315 | fread(buffer, sizeof(float), layer->conv_layer->h * layer->conv_layer->w, wfp); 316 | for(int h = 0 ; h < layer->conv_layer->h ; h++) { 317 | for(int w = 0 ; w < layer->conv_layer->w ; w++) { 318 | int buf_idx = h * layer->conv_layer->w + w; 319 | int new_idx = getIndexFrom4D( 320 | layer->conv_layer->n, 321 | layer->conv_layer->h, 322 | layer->conv_layer->w, 323 | layer->conv_layer->c, 324 | k, h, w, c 325 | ); 326 | layer->conv_layer->W[new_idx] = buffer[buf_idx]; 327 | } 328 | } 329 | } 330 | } 331 | free(buffer); 332 | } else { 333 | cl_int err; 334 | OpenCLObjects *openCLObjects = getOpenClObject(); 335 | 336 | layer->conv_layer->cl_W = clCreateBuffer( 337 | openCLObjects->context, 338 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 339 | layer->conv_layer->w * layer->conv_layer->h * layer->conv_layer->c * layer->conv_layer->n / layer->conv_layer->group * sizeof(float), //size in bytes 340 | NULL,//buffer of data 341 | &err); 342 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 343 | 344 | float *mappedBuffer = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 345 | layer->conv_layer->cl_W, \ 346 | CL_TRUE, CL_MAP_WRITE, \ 347 | 0, \ 348 | layer->conv_layer->w * layer->conv_layer->h * layer->conv_layer->c * layer->conv_layer->n / layer->conv_layer->group * sizeof(float), \ 349 | 0, NULL, NULL, NULL); 350 | 351 | float 
*buffer = (float *)malloc(layer->conv_layer->h * layer->conv_layer->w * sizeof(float)); 352 | for(int k = 0 ; k < layer->conv_layer->n / layer->conv_layer->group ; k++) { 353 | for(int c = 0 ; c < layer->conv_layer->c ; c++) { 354 | fread(buffer, sizeof(float), layer->conv_layer->h * layer->conv_layer->w, wfp); 355 | for(int h = 0 ; h < layer->conv_layer->h ; h++) { 356 | for(int w = 0 ; w < layer->conv_layer->w ; w++) { 357 | int buf_idx = h * layer->conv_layer->w + w; 358 | int new_idx = getIndexFrom4D( 359 | layer->conv_layer->n, 360 | layer->conv_layer->h, 361 | layer->conv_layer->w, 362 | layer->conv_layer->c, 363 | k, h, w, c 364 | ); 365 | mappedBuffer[new_idx] = buffer[buf_idx]; 366 | } 367 | } 368 | } 369 | } 370 | 371 | free(buffer); 372 | 373 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 374 | layer->conv_layer->cl_W, \ 375 | mappedBuffer, \ 376 | 0, NULL, NULL); 377 | 378 | if(layer->useHalf == 1) { 379 | cl_mem cl_W_half = clCreateBuffer( 380 | openCLObjects->context, 381 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 382 | layer->conv_layer->w * layer->conv_layer->h * layer->conv_layer->c * layer->conv_layer->n / layer->conv_layer->group * sizeof(cl_half), //size in bytes 383 | NULL,//buffer of data 384 | &err); 385 | 386 | err = clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 0, sizeof(cl_mem), &layer->conv_layer->cl_W); 387 | err |= clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 1, sizeof(cl_mem), &cl_W_half); 388 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 389 | 390 | size_t convertSize[1] = {(size_t)layer->conv_layer->w * layer->conv_layer->h * layer->conv_layer->c * layer->conv_layer->n / layer->conv_layer->group}; 391 | err = clEnqueueNDRangeKernel( 392 | openCLObjects->queue, 393 | openCLObjects->convert_float_to_half_kernel.kernel, 394 | 1, 395 | 0, 396 | convertSize, 397 | 0, 398 | 0, 0, 0 399 | ); 400 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 401 | 402 | err = clFinish(openCLObjects->queue); 403 | 
SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 404 | 405 | clReleaseMemObject(layer->conv_layer->cl_W); 406 | 407 | layer->conv_layer->cl_W = cl_W_half; 408 | } 409 | } 410 | fclose(wfp); 411 | } 412 | 413 | if(layer->type == LAYER_TYPE_FULLY_CONNECTED) { 414 | layer->output_w = 1; 415 | layer->output_h = 1; 416 | layer->output_c = layer->connected_layer->outputSize; 417 | 418 | layer->connected_layer->weightSize = layer->connected_layer->inputSize * layer->connected_layer->outputSize; 419 | 420 | //LOAD BIAS AND WEIGHTS DATA 421 | char biasFilePath[256]; 422 | strcpy(biasFilePath, fileNameBuf); 423 | strcat(biasFilePath, "_bias"); 424 | FILE *biasfp = fopen(biasFilePath, "r"); 425 | if(!layer->useGPU) { 426 | layer->connected_layer->bias = (float *)calloc(layer->connected_layer->outputSize, sizeof(float)); 427 | fread(layer->connected_layer->bias, sizeof(float), layer->connected_layer->outputSize, biasfp); 428 | } else { 429 | cl_int err; 430 | OpenCLObjects *openCLObjects = getOpenClObject(); 431 | 432 | layer->connected_layer->cl_bias = clCreateBuffer( 433 | openCLObjects->context, 434 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 435 | layer->connected_layer->outputSize * sizeof(float), //size in bytes 436 | NULL,//buffer of data 437 | &err); 438 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 439 | 440 | float *mappedBuffer = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 441 | layer->connected_layer->cl_bias, \ 442 | CL_TRUE, CL_MAP_WRITE, \ 443 | 0, \ 444 | layer->connected_layer->outputSize * sizeof(float), \ 445 | 0, NULL, NULL, NULL); 446 | 447 | fread(mappedBuffer, sizeof(float), layer->connected_layer->outputSize, biasfp); 448 | 449 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 450 | layer->connected_layer->cl_bias, \ 451 | mappedBuffer, \ 452 | 0, NULL, NULL); 453 | 454 | if(layer->useHalf) { 455 | cl_mem cl_bias_half = clCreateBuffer( 456 | openCLObjects->context, 457 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 458 | 
layer->connected_layer->outputSize * sizeof(cl_half), //size in bytes 459 | NULL,//buffer of data 460 | &err); 461 | 462 | err = clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 0, sizeof(cl_mem), &layer->connected_layer->cl_bias); 463 | err |= clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 1, sizeof(cl_mem), &cl_bias_half); 464 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 465 | 466 | size_t convertSize[1] = {(size_t) layer->connected_layer->outputSize}; 467 | err = clEnqueueNDRangeKernel( 468 | openCLObjects->queue, 469 | openCLObjects->convert_float_to_half_kernel.kernel, 470 | 1, 471 | 0, 472 | convertSize, 473 | 0, 474 | 0, 0, 0 475 | ); 476 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 477 | 478 | err = clFinish(openCLObjects->queue); 479 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 480 | 481 | clReleaseMemObject(layer->connected_layer->cl_bias); 482 | 483 | layer->connected_layer->cl_bias = cl_bias_half; 484 | } 485 | } 486 | fclose(biasfp); 487 | 488 | char wFilePath[256]; 489 | strcpy(wFilePath, fileNameBuf); 490 | strcat(wFilePath, "_weight"); 491 | FILE *wfp = fopen(wFilePath, "r"); 492 | if(!layer->useGPU) { 493 | layer->connected_layer->W = (float *) calloc(\ 494 | layer->connected_layer->weightSize, sizeof(float)); 495 | fread(layer->connected_layer->W, sizeof(float), 496 | layer->connected_layer->weightSize, 497 | wfp); 498 | } else { 499 | cl_int err; 500 | OpenCLObjects *openCLObjects = getOpenClObject(); 501 | 502 | layer->connected_layer->cl_W = clCreateBuffer( 503 | openCLObjects->context, 504 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 505 | layer->connected_layer->weightSize * sizeof(float), //size in bytes 506 | NULL,//buffer of data 507 | &err); 508 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 509 | 510 | float *mappedBuffer = (float *)clEnqueueMapBuffer(openCLObjects->queue, \ 511 | layer->connected_layer->cl_W, \ 512 | CL_TRUE, CL_MAP_WRITE, \ 513 | 0, \ 514 | layer->connected_layer->weightSize * 
sizeof(float), \ 515 | 0, NULL, NULL, NULL); 516 | 517 | if(!layer->connected_layer->need_reshape) { 518 | //file is formatted [(c x h x w) x outputsize] 519 | //this is for LRCN 520 | float * buffer = (float *)malloc(layer->connected_layer->outputSize * sizeof(float)); 521 | int input_h = (layer->index == 0) ? model->input_h : layers[layer->index - 1].output_h; 522 | int input_w = (layer->index == 0) ? model->input_w : layers[layer->index - 1].output_w; 523 | int input_c = (layer->index == 0) ? model->input_c : layers[layer->index - 1].output_c; 524 | 525 | for(int c = 0 ; c < input_c ; c++) { 526 | for(int h = 0 ; h < input_h ; h++) { 527 | for(int w = 0 ; w < input_w ; w++) { 528 | fread(buffer, sizeof(float), layer->connected_layer->outputSize, wfp); 529 | for(int n = 0 ; n < layer->connected_layer->outputSize ; n++) { 530 | int idx = getIndexFrom4D(layer->connected_layer->outputSize, input_h, input_w, input_c, n, h, w, c); 531 | mappedBuffer[idx] = buffer[n]; 532 | } 533 | } 534 | } 535 | } 536 | free(buffer); 537 | } else { 538 | //file is formatted [outputsize x (c x h x w)] 539 | int input_h = (layer->index == 0) ? model->input_h : layers[layer->index - 1].output_h; 540 | int input_w = (layer->index == 0) ? model->input_w : layers[layer->index - 1].output_w; 541 | int input_c = (layer->index == 0) ? 
model->input_c : layers[layer->index - 1].output_c; 542 | 543 | int size = input_h * input_w * input_c; 544 | float *buffer = (float *)malloc(size * sizeof(float)); 545 | for(int n = 0 ; n < layer->connected_layer->outputSize ; n++) { 546 | fread(buffer, sizeof(float), size, wfp); //[c x h x w] 547 | //need to convert to h x w x c 548 | int f_idx = 0; 549 | for(int c = 0 ; c < input_c ; c++) { 550 | for(int h = 0 ; h < input_h ; h++) { 551 | for(int w = 0 ; w < input_w ; w++) { 552 | int idx = getIndexFrom4D(layer->connected_layer->outputSize, input_h, input_w, input_c, n, h, w, c); 553 | mappedBuffer[idx] = buffer[f_idx]; 554 | f_idx++; 555 | } 556 | } 557 | } 558 | } 559 | free(buffer); 560 | } 561 | 562 | clEnqueueUnmapMemObject(openCLObjects->queue, \ 563 | layer->connected_layer->cl_W, \ 564 | mappedBuffer, \ 565 | 0, NULL, NULL); 566 | 567 | if(layer->useHalf) { 568 | cl_mem cl_W_half = clCreateBuffer( 569 | openCLObjects->context, 570 | CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, 571 | layer->connected_layer->weightSize * sizeof(cl_half), //size in bytes 572 | NULL,//buffer of data 573 | &err); 574 | 575 | err = clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 0, sizeof(cl_mem), &layer->connected_layer->cl_W); 576 | err |= clSetKernelArg(openCLObjects->convert_float_to_half_kernel.kernel, 1, sizeof(cl_mem), &cl_W_half); 577 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 578 | 579 | size_t convertSize[1] = {(size_t) layer->connected_layer->weightSize}; 580 | err = clEnqueueNDRangeKernel( 581 | openCLObjects->queue, 582 | openCLObjects->convert_float_to_half_kernel.kernel, 583 | 1, 584 | 0, 585 | convertSize, 586 | 0, 587 | 0, 0, 0 588 | ); 589 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 590 | 591 | err = clFinish(openCLObjects->queue); 592 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 593 | 594 | clReleaseMemObject(layer->connected_layer->cl_W); 595 | 596 | layer->connected_layer->cl_W = cl_W_half; 597 | } 598 | } 599 | fclose(wfp); 600 | } 601 
| 602 | if(layer->type == LAYER_TYPE_MAXPOOL) { 603 | layer->output_w = 1 + (layers[layer->index - 1].output_w + layer->maxpool_layer->pad[0] + layer->maxpool_layer->pad[1] - layer->maxpool_layer->size) / layer->maxpool_layer->stride[0]; 604 | layer->output_h = 1 + (layers[layer->index - 1].output_h + layer->maxpool_layer->pad[2] + layer->maxpool_layer->pad[3] - layer->maxpool_layer->size) / layer->maxpool_layer->stride[1]; 605 | layer->output_c = layers[layer->index - 1].output_c; 606 | } 607 | 608 | if(layer->type == LAYER_TYPE_SOFTMAX) { 609 | layer->output_w = 1; 610 | layer->output_h = 1; 611 | layer->output_c = layers[layer->index - 1].output_c; 612 | } 613 | 614 | if(layer->type == LAYER_TYPE_LRN_NORMALIZE) { 615 | layer->output_w = layers[layer->index - 1].output_w; 616 | layer->output_h = layers[layer->index - 1].output_h; 617 | layer->output_c = layers[layer->index - 1].output_c; 618 | } 619 | 620 | int input_w = (i == 1) ? model->input_w : layers[i - 2].output_w; 621 | int input_h = (i == 1) ? model->input_h : layers[i - 2].output_h; 622 | int input_c = (i == 1) ? 
model->input_c : layers[i - 2].output_c; 623 | 624 | LOGD("Layer %d has input[%d %d %d] and output [%d %d %d]",(i), \ 625 | input_c, input_h, input_w, 626 | layer->output_c, layer->output_h, layer->output_w); 627 | } 628 | 629 | return model; 630 | } 631 | 632 | void cnn_free(cnn *model) { 633 | int i; 634 | for(i = 0 ; i < model->nLayers ; i++) { 635 | cnn_layer *layer = &model->layers[i]; 636 | if(layer->type == LAYER_TYPE_CONV) { 637 | if(!model->useGPU) { 638 | free(layer->conv_layer->bias); 639 | free(layer->conv_layer->W); 640 | } else { 641 | clReleaseMemObject(layer->conv_layer->cl_W); 642 | clReleaseMemObject(layer->conv_layer->cl_bias); 643 | } 644 | free(layer->conv_layer); 645 | } else if(layer->type == LAYER_TYPE_FULLY_CONNECTED) { 646 | if(!model->useGPU) { 647 | free(layer->connected_layer->bias); 648 | free(layer->connected_layer->W); 649 | } else { 650 | clReleaseMemObject(layer->conv_layer->cl_W); 651 | clReleaseMemObject(layer->conv_layer->cl_bias); 652 | } 653 | free(layer->connected_layer); 654 | } else if(layer->type == LAYER_TYPE_MAXPOOL) { 655 | free(layer->maxpool_layer); 656 | } else if(layer->type == LAYER_TYPE_LRN_NORMALIZE) { 657 | free(layer->lrn_layer); 658 | } 659 | } 660 | 661 | if(model->averageImage != NULL) 662 | free(model->averageImage); 663 | 664 | free(model->layers); 665 | free(model); 666 | } -------------------------------------------------------------------------------- /app/src/main/cpp/include/basic_functions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __BASIC_FUNCTIONS_HPP__ 2 | #define __BASIC_FUNCTIONS_HPP__ 3 | 4 | #include "deepsense_lib.hpp" 5 | 6 | int getIndexFrom4D(int d1, int d2, int d3, int d4, int i1, int i2, int i3, int i4); 7 | float getDataFrom4D(float *data, int d1, int d2, int d3, int d4, int i1, int i2, int i3, int i4); 8 | int getIndexFrom3D(int d1, int d2, int d3, int i1, int i2, int i3); 9 | float getDataFrom3D(float *data, int d1, int d2, int 
d3, int i1, int i2, int i3); 10 | 11 | cnn_frame *activate_RAMP(cnn_frame *frame); 12 | cnn_frame *activate_LOGISTIC(cnn_frame *frame); 13 | cnn_frame *activate_RELU(cnn_frame *frame); 14 | cnn_frame *activate_LEAKY(cnn_frame *frame); 15 | cnn_frame *doFeedForward_Activation(cnn_frame *frame, int activation); 16 | 17 | cnn_frame * frame_init(int w, int h, int c); 18 | cnn_frame * frame_init_gpu(int w, int h, int c); 19 | cnn_frame * frame_init_gpu_half(int w, int h, int c); 20 | cnn_frame * frame_clone(cnn_frame *src); 21 | cnn_frame * frame_convert_to_gpu_float(cnn_frame *frame); 22 | cnn_frame * frame_convert_to_gpu_half(cnn_frame *frame); 23 | cnn_frame * frame_convert_to_cpu(cnn_frame *frame); 24 | void frame_free(cnn_frame *frame); 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/classifier.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CLASSIFIER_HPP__ 2 | #define __CLASSIFIER_HPP__ 3 | 4 | #include "deepsense_lib.hpp" 5 | 6 | float * cnn_doClassification(cnn_frame *frame, cnn *model); 7 | #endif 8 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/clio.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CLIO_HPP__ 2 | #define __CLIO_HPP__ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // Commonly-defined shortcuts for LogCat output from native C applications. 9 | #define LOG_TAG PROGRAM_NAME 10 | #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, LOG_TAG, __VA_ARGS__) 11 | #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, LOG_TAG, __VA_ARGS__) 12 | 13 | /* This function helps to create informative messages in 14 | * case when OpenCL errors occur. The function returns a string 15 | * representation for an OpenCL error code. 16 | * For example, "CL_DEVICE_NOT_FOUND" instead of "-1". 
/*
 * OpenCL status-check helpers.
 *
 * Each macro evaluates ERR exactly once; when the value is not CL_SUCCESS it
 * logs the symbolic error name together with the source location and returns
 * from the enclosing function:
 *
 *   SAMPLE_CHECK_ERRORS(ERR)                   -> return;        (void functions)
 *   SAMPLE_CHECK_ERRORS_WITH_RETURN(ERR)       -> return <ERR>;  (status-returning functions)
 *   SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(ERR)  -> return NULL;   (pointer-returning functions)
 *
 * The bodies are wrapped in do { } while(0) so that an invocation followed by
 * ';' is a single statement and can be used safely inside an unbraced
 * if / else. Note that the early return does not release anything the caller
 * allocated before the check.
 */
#define SAMPLE_CHECK_ERRORS(ERR)                                                          \
    do {                                                                                  \
        const cl_int sample_check_err_ = (ERR);                                           \
        if(sample_check_err_ != CL_SUCCESS)                                               \
        {                                                                                 \
            LOGD                                                                          \
            (                                                                             \
                "OpenCL error with code %s happened in file %s at line %d. Exiting.\n",   \
                opencl_error_to_str(sample_check_err_), __FILE__, __LINE__                \
            );                                                                            \
                                                                                          \
            return;                                                                       \
        }                                                                                 \
    } while(0)

#define SAMPLE_CHECK_ERRORS_WITH_RETURN(ERR)                                              \
    do {                                                                                  \
        const cl_int sample_check_err_ = (ERR);                                           \
        if(sample_check_err_ != CL_SUCCESS)                                               \
        {                                                                                 \
            LOGD                                                                          \
            (                                                                             \
                "OpenCL error with code %s happened in file %s at line %d. Exiting.\n",   \
                opencl_error_to_str(sample_check_err_), __FILE__, __LINE__                \
            );                                                                            \
                                                                                          \
            return sample_check_err_;                                                     \
        }                                                                                 \
    } while(0)

#define SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(ERR)                                         \
    do {                                                                                  \
        const cl_int sample_check_err_ = (ERR);                                           \
        if(sample_check_err_ != CL_SUCCESS)                                               \
        {                                                                                 \
            LOGD                                                                          \
            (                                                                             \
                "OpenCL error with code %s happened in file %s at line %d. Exiting.\n",   \
                opencl_error_to_str(sample_check_err_), __FILE__, __LINE__                \
            );                                                                            \
                                                                                          \
            return NULL;                                                                  \
        }                                                                                 \
    } while(0)
/*
 * Parameters of one convolution layer.
 *
 * The output size is computed by the conv code as
 *   out_w = (in_w + pad[0] + pad[1] - w) / stride[0] + 1
 *   out_h = (in_h + pad[2] + pad[3] - h) / stride[1] + 1
 * and the layer produces n output channels.
 */
typedef struct {
    int stride[2];      // [0]: step along the width axis, [1]: step along the height axis
    int pad[4];         // [0],[1]: padding added to the width; [2],[3]: padding added to the height
                        // ([0] and [2] are the offsets subtracted when mapping output to input coordinates)
    int w; //width
    int h; //height
    int c; //channel
    int n; //number of neurons
    int group;          // the loader sizes the weight buffer as w * h * c * n / group
                        // NOTE(review): presumably grouped convolution - confirm against the kernels
    float *W;           // host weights, filled by the loader for non-GPU layers;
                        // stored as [n x h x w x c] (converted from the [n x c x h x w] file layout)
    float *bias;        // host biases, read by the CPU path as bias[output channel]
    cl_mem cl_W;        // OpenCL weight buffer for GPU layers (replaced by a half buffer when useHalf is set)
    cl_mem cl_bias;     // OpenCL bias buffer for GPU layers
} cnn_layer_conv;
cnn_layer; 112 | 113 | typedef struct { 114 | int nLayers; 115 | int useGPU; 116 | int useHalf; 117 | int input_w; 118 | int input_h; 119 | int input_c; 120 | float *averageImage; 121 | cnn_layer *layers; 122 | } cnn; 123 | 124 | cnn * cnn_loadModel(const char *modelDirPath, int useGPU); 125 | void cnn_free(cnn *model); 126 | 127 | #endif 128 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/layers/conv_layer.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __CONV_LAYER__ 2 | #define __CONV_LAYER__ 3 | 4 | #include 5 | 6 | cnn_frame *doFeedForward_CONV(cnn_frame *frame, void *layer); 7 | cnn_frame *doFeedForward_CONV_GPU(cnn_frame *frame, void *layer); 8 | cnn_frame *doFeedForward_CONV_FC_GPU(cnn_frame *frame, void *layer); 9 | 10 | #endif 11 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/layers/fully_connected.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __FULLY_CONNECTED_HPP__ 2 | #define __FULLY_CONNECTED_HPP__ 3 | 4 | #include 5 | 6 | cnn_frame *doFeedForward_FULLY_CONNECTED(cnn_frame *frame, void *layer); 7 | cnn_frame *doFeedForward_FULLY_CONNECTED_GPU(cnn_frame *frame, void *layer); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/layers/lrn.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __LRN_HPP__ 2 | #define __LRN_HPP__ 3 | 4 | #include 5 | 6 | cnn_frame *doFeedForward_LRN(cnn_frame *frame, void *layer); 7 | cnn_frame *doFeedForward_LRN_GPU(cnn_frame *frame, void *layer); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/layers/maxpool.hpp: -------------------------------------------------------------------------------- 1 | #ifndef 
__MAXPOOL_HPP__ 2 | #define __MAXPOOL_HPP__ 3 | 4 | #include 5 | 6 | cnn_frame *doFeedForward_MAXPOOL(cnn_frame *frame, void *layer); 7 | cnn_frame *doFeedForward_MAXPOOL_GPU(cnn_frame *frame, void *layer); 8 | 9 | #endif 10 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/layers/softmax.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __SOFTMAX_HPP__ 2 | #define __SOFTMAX_HPP__ 3 | 4 | #include 5 | 6 | cnn_frame *doFeedForward_SOFTMAX(cnn_frame *frame, void *layer); 7 | 8 | #endif 9 | 10 | -------------------------------------------------------------------------------- /app/src/main/cpp/include/predefine.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __PREDEFINE_H 2 | #define __PREDEFINE_H 3 | 4 | #include 5 | 6 | #define PROGRAM_NAME "DEEPSENSE" 7 | #define PROGRAM_KERNEL_NAME "deepsense.cl" 8 | 9 | typedef struct { 10 | cl_kernel kernel; 11 | size_t kernel_max_workgroup_size; 12 | } lany_kernel; 13 | 14 | struct OpenCLObjects { 15 | // Regular OpenCL objects: 16 | cl_platform_id platform; 17 | cl_device_id device; 18 | cl_context context; 19 | cl_command_queue queue; //we use single queue only 20 | cl_program program; 21 | 22 | //half kernels 23 | lany_kernel conv_kernel; 24 | lany_kernel conv_fc_kernel; 25 | lany_kernel fully_connected_kernel; 26 | lany_kernel maxpool_kernel; 27 | lany_kernel lrn_kernel; 28 | lany_kernel activation_kernel; 29 | 30 | //float kernels 31 | lany_kernel conv_kernel_float; 32 | lany_kernel conv_fc_kernel_float; 33 | lany_kernel fully_connected_kernel_float; 34 | lany_kernel maxpool_kernel_float; 35 | lany_kernel lrn_kernel_float; 36 | lany_kernel activation_kernel_float; 37 | 38 | lany_kernel convert_float_to_half_kernel; 39 | lany_kernel convert_half_to_float_kernel; 40 | }; 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- 
/app/src/main/cpp/include/utilities.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __UTILITIES_HPP__ 2 | #define __UTILITIES_HPP__ 3 | 4 | #include 5 | #include 6 | 7 | void init_OpenCL( 8 | cl_device_type required_device_type, 9 | OpenCLObjects& openCLObjects, 10 | const char *packageName); 11 | 12 | void shutdown_OpenCL (OpenCLObjects& openCLObjects); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /app/src/main/cpp/layers/conv_layer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | cnn_frame *doFeedForward_CONV(cnn_frame *frame, void *layer) { 7 | LOGD("Running function %s", __PRETTY_FUNCTION__); 8 | 9 | frame = frame_convert_to_cpu(frame); 10 | 11 | cnn_layer_conv *conv_layer = ((cnn_layer *)layer)->conv_layer; 12 | 13 | cnn_frame *output = frame_init(\ 14 | (frame->w + conv_layer->pad[0] + conv_layer->pad[1] - conv_layer->w) / conv_layer->stride[0] + 1, \ 15 | (frame->h + conv_layer->pad[2] + conv_layer->pad[3] - conv_layer->h) / conv_layer->stride[1] + 1, \ 16 | conv_layer->n); 17 | 18 | int i, j, k, x, y, z; 19 | for(i = 0 ; i < output->c; i++) { 20 | for(j = 0 ; j < output->h ; j++) { 21 | for(k = 0 ; k < output->w ; k++) { 22 | float result = 0.0f; 23 | for(x = 0 ; x < conv_layer->c; x++) { 24 | for(y = 0 ; y < conv_layer->h; y++) { 25 | for(z = 0 ; z < conv_layer->w ; z++) { 26 | int w = k * conv_layer->stride[0] - conv_layer->pad[0] + z; 27 | int h = j * conv_layer->stride[1] - conv_layer->pad[2] + y; 28 | if(w < 0 || w >= frame->w) 29 | continue; 30 | if(h < 0 || h >= frame->h) 31 | continue; 32 | 33 | float tmp1 = getDataFrom3D(frame->data, frame->h, frame->w, frame->c, h, w, x); 34 | float tmp2 = getDataFrom4D(conv_layer->W, conv_layer->n, conv_layer->h, conv_layer->w, conv_layer->c, i, y, z, x); 35 | result += tmp1 * tmp2; 36 | } 37 | } 38 | } 39 | 40 
| result += conv_layer->bias[i]; 41 | output->data[getIndexFrom3D(output->c, output->h, output->w, i, j, k)] = result; 42 | } 43 | } 44 | } 45 | 46 | frame_free(frame); 47 | 48 | doFeedForward_Activation(output, ((cnn_layer *)layer)->activation); 49 | 50 | return output; 51 | } 52 | 53 | cnn_frame *doFeedForward_CONV_GPU(cnn_frame *frame, void *layer) { 54 | LOGD("Running function %s", __PRETTY_FUNCTION__); 55 | 56 | OpenCLObjects *openCLObjects = getOpenClObject(); 57 | cl_int err = CL_SUCCESS; 58 | 59 | frame = ((cnn_layer *)layer)->useHalf ? frame_convert_to_gpu_half(frame) : frame_convert_to_gpu_float(frame); 60 | 61 | cnn_layer_conv *conv_layer = ((cnn_layer *)layer)->conv_layer; 62 | 63 | int output_w = (frame->w + conv_layer->pad[0] + conv_layer->pad[1] - conv_layer->w) / conv_layer->stride[0] + 1; 64 | int output_h = (frame->h + conv_layer->pad[2] + conv_layer->pad[3] - conv_layer->h) / conv_layer->stride[1] + 1; 65 | int output_c = conv_layer->n; 66 | 67 | cnn_frame *output = ((cnn_layer *)layer)->useHalf ? frame_init_gpu_half(output_w, output_h, output_c) : frame_init_gpu(output_w, output_h, output_c); 68 | 69 | cl_mem cl_frame = frame->cl_data; 70 | cl_mem cl_result = output->cl_data; 71 | 72 | int i = 0; 73 | cl_kernel kernel = ((cnn_layer *)layer)->useHalf ? 
openCLObjects->conv_kernel.kernel : openCLObjects->conv_kernel_float.kernel; 74 | 75 | err = clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_frame); 76 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->w); 77 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->h); 78 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->c); 79 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem),&conv_layer->cl_W); 80 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem),&conv_layer->cl_bias); 81 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->w); 82 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->h); 83 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->c); 84 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->n); 85 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->stride[0]); 86 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->stride[1]); 87 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[0]); 88 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[1]); 89 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[2]); 90 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[3]); 91 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_result); 92 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_w); 93 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_h); 94 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_c); 95 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 96 | 97 | size_t localSize[3] = {8 , 8, 1}; 98 | int global_x = ((output->w - 1) / localSize[0] + 1) * localSize[0]; 99 | int global_y = ((output->h - 1) / localSize[1] + 1) * localSize[1]; 100 | 101 | int didive = 8; 102 | int gs3 = output_c % didive == 0 ? 
output_c / didive : output_c; 103 | 104 | size_t globalSize[3] = {(size_t)global_x, (size_t)global_y, (size_t)gs3}; 105 | 106 | err = clEnqueueNDRangeKernel( 107 | openCLObjects->queue, 108 | kernel, 109 | 3, 110 | 0, 111 | globalSize, 112 | localSize, 113 | 0, 0, 0 114 | ); 115 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 116 | 117 | err |= clFinish(openCLObjects->queue); 118 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 119 | 120 | doFeedForward_Activation(output, ((cnn_layer *)layer)->activation); 121 | 122 | frame_free(frame); 123 | 124 | return output; 125 | } 126 | 127 | cnn_frame *doFeedForward_CONV_FC_GPU(cnn_frame *frame, void *layer) { 128 | LOGD("Running function %s", __PRETTY_FUNCTION__); 129 | 130 | frame = ((cnn_layer *)layer)->useHalf ? frame_convert_to_gpu_half(frame) : frame_convert_to_gpu_float(frame); 131 | 132 | cl_int err = CL_SUCCESS; 133 | 134 | cnn_layer_conv *conv_layer = ((cnn_layer *)layer)->conv_layer; 135 | OpenCLObjects *openCLObjects = getOpenClObject(); 136 | 137 | int output_w = (frame->w + conv_layer->pad[0] + conv_layer->pad[1] - conv_layer->w) / conv_layer->stride[0] + 1; 138 | int output_h = (frame->h + conv_layer->pad[2] + conv_layer->pad[3] - conv_layer->h) / conv_layer->stride[1] + 1; 139 | int output_c = conv_layer->n; 140 | 141 | cnn_frame *output = ((cnn_layer *)layer)->useHalf ? frame_init_gpu_half(output_w, output_h, output_c) : frame_init_gpu(output_w, output_h, output_c); 142 | 143 | cl_mem cl_frame = frame->cl_data; 144 | cl_mem cl_result = output->cl_data; 145 | 146 | int i = 0; 147 | cl_kernel kernel = ((cnn_layer *)layer)->useHalf ? 
openCLObjects->conv_fc_kernel.kernel : openCLObjects->conv_fc_kernel_float.kernel; 148 | err = clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_frame); 149 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->w); 150 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->h); 151 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->c); 152 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &conv_layer->cl_W); 153 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &conv_layer->cl_bias); 154 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->w); 155 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->h); 156 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->c); 157 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->n); 158 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->stride[0]); 159 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->stride[1]); 160 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[0]); 161 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[1]); 162 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[2]); 163 | err |= clSetKernelArg(kernel, i++, sizeof(int), &conv_layer->pad[3]); 164 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_result); 165 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_w); 166 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_h); 167 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_c); 168 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 169 | 170 | //size_t globalSize[1] = {(size_t)output->c}; 171 | size_t globalSize[1] = {(size_t)(output->c % 256 == 0 ? 
256 : output->c)}; 172 | 173 | err = clEnqueueNDRangeKernel( 174 | openCLObjects->queue, 175 | kernel, 176 | 1, 177 | 0, 178 | globalSize, 179 | 0, 180 | 0, 0, 0 181 | ); 182 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 183 | 184 | err |= clFinish(openCLObjects->queue); 185 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 186 | 187 | frame_free(frame); 188 | 189 | doFeedForward_Activation(output, ((cnn_layer *)layer)->activation); 190 | 191 | return output; 192 | } 193 | 194 | -------------------------------------------------------------------------------- /app/src/main/cpp/layers/fully_connected.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | cnn_frame *doFeedForward_FULLY_CONNECTED(cnn_frame *frame, void *layer) { 8 | LOGD("Running function %s", __PRETTY_FUNCTION__); 9 | 10 | frame = frame_convert_to_cpu(frame); 11 | 12 | cnn_layer_fully_connected *connected_layer = ((cnn_layer *)layer)->connected_layer; 13 | 14 | cnn_frame *output = frame_init(1, 1, connected_layer->outputSize); 15 | 16 | for(int n = 0 ; n < connected_layer->outputSize ; n++) { 17 | output->data[n] = 0; 18 | for(int i = 0 ; i < frame->c ; i++) { 19 | for(int j = 0 ; j < frame->h ; j++) { 20 | for(int k = 0 ; k < frame->w ; k++) { 21 | int index = getIndexFrom3D(frame->c, frame->h, frame->w, i , j , k); 22 | output->data[n] += frame->data[index] * connected_layer->W[index * connected_layer->outputSize + n]; 23 | } 24 | } 25 | } 26 | 27 | output->data[n] += connected_layer->bias[n]; 28 | } 29 | 30 | doFeedForward_Activation(output, ((cnn_layer *)layer)->activation); 31 | 32 | frame_free(frame); 33 | 34 | return output; 35 | } 36 | 37 | cnn_frame *doFeedForward_FULLY_CONNECTED_GPU(cnn_frame *frame, void *layer) { 38 | LOGD("Running function %s", __PRETTY_FUNCTION__); 39 | 40 | frame = ((cnn_layer *)layer)->useHalf ? 
frame_convert_to_gpu_half(frame) : frame_convert_to_gpu_float(frame); 41 | 42 | cl_int err; 43 | cnn_layer_fully_connected *connected_layer = ((cnn_layer *)layer)->connected_layer; 44 | OpenCLObjects *openCLObjects = getOpenClObject(); 45 | 46 | cnn_frame *output = ((cnn_layer *)layer)->useHalf ? frame_init_gpu_half(1, 1, connected_layer->outputSize) : frame_init_gpu(1, 1, connected_layer->outputSize); 47 | 48 | int i = 0; 49 | cl_kernel kernel = ((cnn_layer *)layer)->useHalf ? openCLObjects->fully_connected_kernel.kernel : openCLObjects->fully_connected_kernel_float.kernel; 50 | err = clSetKernelArg(kernel, i++, sizeof(cl_mem), &frame->cl_data); 51 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->w); 52 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->h); 53 | err |= clSetKernelArg(kernel, i++, sizeof(int), &frame->c); 54 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &connected_layer->cl_W); 55 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &connected_layer->cl_bias); 56 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &output->cl_data); 57 | err |= clSetKernelArg(kernel, i++, sizeof(int), &connected_layer->outputSize); 58 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 59 | 60 | //size_t globalSize[1] = {(size_t)connected_layer->outputSize}; 61 | size_t globalSize[1] = {(size_t)256}; 62 | 63 | err = clEnqueueNDRangeKernel( 64 | openCLObjects->queue, 65 | kernel, 66 | 1, 67 | 0, 68 | globalSize, 69 | 0, 70 | 0, 0, 0 71 | ); 72 | err |= clFinish(openCLObjects->queue); 73 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 74 | 75 | doFeedForward_Activation(output, ((cnn_layer *)layer)->activation); 76 | 77 | frame_free(frame); 78 | 79 | return output; 80 | } -------------------------------------------------------------------------------- /app/src/main/cpp/layers/lrn.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | cnn_frame 
*doFeedForward_LRN(cnn_frame *frame, void *layer) { 8 | LOGD("Running function %s", __PRETTY_FUNCTION__); 9 | 10 | frame = frame_convert_to_cpu(frame); 11 | 12 | cnn_layer_lrn *lrn_layer = ((cnn_layer *)layer)->lrn_layer; 13 | 14 | int channels = frame->c; 15 | int width = frame->w; 16 | int height = frame->h; 17 | 18 | cnn_frame *output = frame_init(width, height, channels); 19 | 20 | float alpha_over_size = lrn_layer->alpha / lrn_layer->size; 21 | int size = lrn_layer->size; 22 | int k = lrn_layer->k; 23 | float beta = lrn_layer->beta; 24 | 25 | float *in = frame->data; 26 | float *out = output->data; 27 | 28 | for(int w = 0 ; w < width ; w++) { 29 | for(int h = 0 ; h < height ; h++) { 30 | int offset = (h * width + w) * channels; 31 | int head = 0; 32 | int pre_pad = (size - 1) / 2; 33 | int post_pad = size - pre_pad - 1; 34 | float accum_scale = 0; 35 | 36 | while (head < post_pad) { 37 | float data = in[offset + head]; 38 | accum_scale += data * data; 39 | head++; 40 | } 41 | 42 | while (head < size) { 43 | float data = in[offset + head]; 44 | accum_scale += data * data; 45 | float scale = k + accum_scale * alpha_over_size; 46 | out[offset + head - post_pad] = in[offset + head - post_pad] * pow(scale, -beta); 47 | head++; 48 | } 49 | 50 | while (head < channels) { 51 | float data = in[offset + head]; 52 | accum_scale += data * data; 53 | data = in[offset + head - size]; 54 | accum_scale -= data * data; 55 | float scale = k + accum_scale * alpha_over_size; 56 | out[offset + head - post_pad] = in[offset + head - post_pad] * pow(scale, -beta); 57 | head++; 58 | } 59 | 60 | while (head < channels + post_pad) { 61 | float data = in[offset + head - size]; 62 | accum_scale -= data * data; 63 | float scale = k + accum_scale * alpha_over_size; 64 | out[offset + head - post_pad] = in[offset + head - post_pad] * pow(scale, -beta); 65 | head++; 66 | } 67 | } 68 | } 69 | 70 | frame_free(frame); 71 | return output; 72 | } 73 | 74 | cnn_frame *doFeedForward_LRN_GPU(cnn_frame 
*frame, void *layer) { 75 | LOGD("Running function %s", __PRETTY_FUNCTION__); 76 | 77 | cnn_layer_lrn *lrn_layer = ((cnn_layer *)layer)->lrn_layer; 78 | 79 | cl_int err; 80 | OpenCLObjects *openCLObjects = getOpenClObject(); 81 | 82 | int channels = frame->c; 83 | int width = frame->w; 84 | int height = frame->h; 85 | 86 | cnn_frame *output = ((cnn_layer *)layer)->useHalf ? frame_init_gpu_half(width, height, channels) : frame_init_gpu(width, height, channels); 87 | 88 | cl_mem cl_frame = frame->cl_data; 89 | cl_mem cl_result = output->cl_data; 90 | 91 | float alpha_over_size = lrn_layer->alpha / lrn_layer->size; 92 | 93 | int i = 0; 94 | cl_kernel kernel = ((cnn_layer *)layer)->useHalf ? openCLObjects->lrn_kernel.kernel : openCLObjects->lrn_kernel_float.kernel; 95 | 96 | err = clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_frame); 97 | err |= clSetKernelArg(kernel, i++, sizeof(int), &channels); 98 | err |= clSetKernelArg(kernel, i++, sizeof(int), &height); 99 | err |= clSetKernelArg(kernel, i++, sizeof(int), &width); 100 | err |= clSetKernelArg(kernel, i++, sizeof(int), &lrn_layer->k); 101 | err |= clSetKernelArg(kernel, i++, sizeof(int), &lrn_layer->size); 102 | err |= clSetKernelArg(kernel, i++, sizeof(int), &alpha_over_size); 103 | err |= clSetKernelArg(kernel, i++, sizeof(int), &lrn_layer->beta); 104 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_result); 105 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 106 | 107 | size_t globalSize[2] = {(size_t)width, (size_t)height}; 108 | 109 | err = clEnqueueNDRangeKernel( 110 | openCLObjects->queue, 111 | kernel, 112 | 2, 113 | 0, 114 | globalSize, 115 | 0, 116 | 0, 0, 0 117 | ); 118 | err |= clFinish(openCLObjects->queue); 119 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 120 | 121 | frame_free(frame); 122 | 123 | return output; 124 | } -------------------------------------------------------------------------------- /app/src/main/cpp/layers/maxpool.cpp: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | cnn_frame *doFeedForward_MAXPOOL(cnn_frame *frame, void *layer) { 7 | LOGD("Running function %s", __PRETTY_FUNCTION__); 8 | 9 | frame = frame_convert_to_cpu(frame); 10 | 11 | cnn_layer_maxpool *maxpool_layer = ((cnn_layer *)layer)->maxpool_layer; 12 | 13 | int w = 1 + (frame->w + maxpool_layer->pad[0] + maxpool_layer->pad[1] - maxpool_layer->size) / maxpool_layer->stride[0]; 14 | int h = 1 + (frame->h + maxpool_layer->pad[2] + maxpool_layer->pad[3] - maxpool_layer->size) / maxpool_layer->stride[1]; 15 | int d = frame->c; 16 | cnn_frame *output = frame_init(w, h, d); 17 | 18 | for(int k = 0 ; k < output->c ; k++) { 19 | for(int h = 0; h < output->h; h++) { 20 | for(int w = 0; w < output->w ; w++) { 21 | float max = -999999.9f; 22 | for(int x = 0 ; x < maxpool_layer->size ; x++) { 23 | for(int y = 0 ; y < maxpool_layer->size ; y++) { 24 | int x_ = w * maxpool_layer->stride[0] + x - maxpool_layer->pad[0]; 25 | int y_ = h * maxpool_layer->stride[1] + y - maxpool_layer->pad[2]; 26 | int valid = (x_ >= 0 && x_ < frame->w && y_ >= 0 && y_ < frame->h); 27 | float val = (valid != 0) ? frame->data[getIndexFrom3D(frame->h, frame->w, frame->c, y_, x_, k)] : -999999.9f; 28 | max = (val > max) ? val : max; 29 | } 30 | } 31 | output->data[getIndexFrom3D(output->h, output->w, output->c, h, w, k)] = max; 32 | } 33 | } 34 | } 35 | 36 | frame_free(frame); 37 | 38 | return output; 39 | } 40 | 41 | cnn_frame *doFeedForward_MAXPOOL_GPU(cnn_frame *frame, void *layer) { 42 | LOGD("Running function %s", __PRETTY_FUNCTION__); 43 | 44 | frame = ((cnn_layer *)layer)->useHalf ? 
frame_convert_to_gpu_half(frame) : frame_convert_to_gpu_float(frame); 45 | 46 | cnn_layer_maxpool *maxpool_layer = ((cnn_layer *)layer)->maxpool_layer; 47 | 48 | cl_int err; 49 | OpenCLObjects *openCLObjects = getOpenClObject(); 50 | 51 | int input_w = frame->w; 52 | int input_h = frame->h; 53 | int input_d = frame->c; 54 | 55 | //prepare output 56 | int output_w = 1 + (frame->w + maxpool_layer->pad[0] + maxpool_layer->pad[1] - maxpool_layer->size) / maxpool_layer->stride[0]; 57 | int output_h = 1 + (frame->h + maxpool_layer->pad[2] + maxpool_layer->pad[3] - maxpool_layer->size) / maxpool_layer->stride[1]; 58 | int output_c = frame->c; 59 | 60 | cnn_frame *output = ((cnn_layer *)layer)->useHalf ? frame_init_gpu_half(output_w, output_h, output_c) : frame_init_gpu(output_w, output_h, output_c); 61 | 62 | cl_mem cl_frame = frame->cl_data; 63 | cl_mem cl_result = output->cl_data; 64 | 65 | int i = 0; 66 | 67 | cl_kernel kernel = ((cnn_layer *)layer)->useHalf ? openCLObjects->maxpool_kernel.kernel : openCLObjects->maxpool_kernel_float.kernel; 68 | 69 | err = clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_frame); 70 | err |= clSetKernelArg(kernel, i++, sizeof(int), &input_w); 71 | err |= clSetKernelArg(kernel, i++, sizeof(int), &input_h); 72 | err |= clSetKernelArg(kernel, i++, sizeof(int), &input_d); 73 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->size); 74 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->stride[0]); 75 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->stride[1]); 76 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->pad[0]); 77 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->pad[1]); 78 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->pad[2]); 79 | err |= clSetKernelArg(kernel, i++, sizeof(int), &maxpool_layer->pad[3]); 80 | err |= clSetKernelArg(kernel, i++, sizeof(cl_mem), &cl_result); 81 | err |= clSetKernelArg(kernel, i++, sizeof(int), 
&output_w); 82 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_h); 83 | err |= clSetKernelArg(kernel, i++, sizeof(int), &output_c); 84 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 85 | 86 | size_t globalSize[3] = {(size_t)output_w, (size_t)output_h, (size_t)output_c}; 87 | 88 | err = clEnqueueNDRangeKernel( 89 | openCLObjects->queue, 90 | kernel, 91 | 3, 92 | 0, 93 | globalSize, 94 | 0, 95 | 0, 0, 0 96 | ); 97 | err |= clFinish(openCLObjects->queue); 98 | SAMPLE_CHECK_ERRORS_WITH_NULL_RETURN(err); 99 | 100 | frame_free(frame); 101 | 102 | return output; 103 | } 104 | -------------------------------------------------------------------------------- /app/src/main/cpp/layers/softmax.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | cnn_frame *doFeedForward_SOFTMAX(cnn_frame *frame, void *layer) { 8 | LOGD("Running function %s", __PRETTY_FUNCTION__); 9 | 10 | double dsum = 0; 11 | int i; 12 | 13 | frame = frame_convert_to_cpu(frame); 14 | 15 | for(i = 0 ; i < frame->c ; i++) { 16 | dsum += exp((double)frame->data[i]); 17 | } 18 | 19 | for(i = 0 ; i < frame->c ; i++) { 20 | frame->data[i] = (float)(exp((double)frame->data[i]) / dsum); 21 | } 22 | 23 | return frame; 24 | } 25 | -------------------------------------------------------------------------------- /app/src/main/cpp/utilities.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | char packageNameBuf[256]; 13 | 14 | inline char* load_Program(const char *inputPath) { 15 | FILE *fp = fopen(inputPath,"r"); 16 | int fd = fileno(fp); 17 | struct stat buf; 18 | fstat(fd, &buf); 19 | int size = buf.st_size; 20 | 21 | char *buffer = (char *)malloc(size + 1); 22 | buffer[size] = '\0'; 23 | fread(buffer, size, 1, fp); 24 | fclose(fp); 25 | 26 | 
return buffer; 27 | } 28 | 29 | cl_int compile_Program(OpenCLObjects& openCLObjects, const char* kernelFileName) { 30 | 31 | std::string kernelPath; 32 | cl_int err = CL_SUCCESS; 33 | 34 | kernelPath.append("/data/data/"); 35 | kernelPath.append(packageNameBuf); 36 | kernelPath.append("/app_execdir/"); 37 | kernelPath.append(kernelFileName); 38 | 39 | char* tmp = load_Program(kernelPath.c_str()); 40 | std::string tmpStr(tmp); 41 | const char *kernelSource = tmpStr.c_str(); 42 | free(tmp); 43 | 44 | openCLObjects.program = 45 | clCreateProgramWithSource ( 46 | openCLObjects.context, 47 | 1, 48 | &kernelSource, 49 | 0, 50 | &err 51 | ); 52 | SAMPLE_CHECK_ERRORS_WITH_RETURN(err); 53 | 54 | err = clBuildProgram(openCLObjects.program, 0, 0, "-O3 -cl-mad-enable -cl-fast-relaxed-math", 0, 0); 55 | 56 | if(err == CL_BUILD_PROGRAM_FAILURE) { 57 | size_t log_length = 0; 58 | 59 | err = clGetProgramBuildInfo( 60 | openCLObjects.program, 61 | openCLObjects.device, 62 | CL_PROGRAM_BUILD_LOG, 63 | 0, 64 | 0, 65 | &log_length); 66 | SAMPLE_CHECK_ERRORS_WITH_RETURN(err); 67 | 68 | //vector log(log_length); 69 | char* logbuf = (char*)malloc(log_length); 70 | 71 | err = clGetProgramBuildInfo( 72 | openCLObjects.program, 73 | openCLObjects.device, 74 | CL_PROGRAM_BUILD_LOG, 75 | log_length, 76 | (void*)logbuf, 77 | 0); 78 | SAMPLE_CHECK_ERRORS_WITH_RETURN(err); 79 | 80 | LOGE("Error happened during the build of OpenCL program.\nBuild log:%s", logbuf); 81 | 82 | free(logbuf); 83 | } 84 | 85 | //SAMPLE_CHECK_ERRORS_WITH_RETURN(err); 86 | return CL_SUCCESS; 87 | } 88 | 89 | void init_OpenCL( 90 | cl_device_type required_device_type, 91 | OpenCLObjects& openCLObjects, 92 | const char *packageName) { 93 | 94 | using namespace std; 95 | cl_int err = CL_SUCCESS; 96 | 97 | LOGD("init_OpenCL: Initializing GPU\n"); 98 | 99 | strcpy(packageNameBuf, packageName); 100 | 101 | cl_uint num_of_platforms = 0; 102 | err = clGetPlatformIDs(0, 0, &num_of_platforms); 103 | SAMPLE_CHECK_ERRORS(err); 104 | 
105 | vector platforms(num_of_platforms); 106 | // Get IDs for all platforms. 107 | err = clGetPlatformIDs(num_of_platforms, &platforms[0], 0); 108 | SAMPLE_CHECK_ERRORS(err); 109 | 110 | cl_uint i = 0; 111 | size_t platform_name_length = 0; 112 | err = clGetPlatformInfo( 113 | platforms[i], 114 | CL_PLATFORM_NAME, 115 | 0, 116 | 0, 117 | &platform_name_length 118 | ); 119 | SAMPLE_CHECK_ERRORS(err); 120 | 121 | vector platform_name(platform_name_length); 122 | err = clGetPlatformInfo( 123 | platforms[i], 124 | CL_PLATFORM_NAME, 125 | platform_name_length, 126 | &platform_name[0], 127 | 0 128 | ); 129 | SAMPLE_CHECK_ERRORS(err); 130 | 131 | openCLObjects.platform = platforms[0]; 132 | 133 | cl_context_properties context_props[] = { 134 | CL_CONTEXT_PLATFORM, 135 | cl_context_properties(openCLObjects.platform), 136 | 0}; 137 | 138 | openCLObjects.context = clCreateContextFromType( 139 | context_props, 140 | required_device_type, 141 | 0, 142 | 0, 143 | &err 144 | ); 145 | SAMPLE_CHECK_ERRORS(err); 146 | 147 | err = clGetContextInfo( 148 | openCLObjects.context, 149 | CL_CONTEXT_DEVICES, 150 | sizeof(openCLObjects.device), 151 | &openCLObjects.device, 152 | 0); 153 | SAMPLE_CHECK_ERRORS(err); 154 | 155 | openCLObjects.queue = clCreateCommandQueue ( 156 | openCLObjects.context, 157 | openCLObjects.device, 158 | 0, // Creating queue properties, refer to the OpenCL specification for details. 
159 | &err); 160 | SAMPLE_CHECK_ERRORS(err); 161 | 162 | err = compile_Program(openCLObjects, PROGRAM_KERNEL_NAME); 163 | SAMPLE_CHECK_ERRORS(err); 164 | 165 | cl_device_local_mem_type local_mem_type; 166 | clGetDeviceInfo( 167 | openCLObjects.device, 168 | CL_DEVICE_LOCAL_MEM_TYPE, 169 | sizeof(cl_device_local_mem_type), 170 | &local_mem_type, 171 | NULL 172 | ); 173 | LOGD("CL_DEVICE_LOCAL_MEM_TYPE %u", local_mem_type); 174 | 175 | openCLObjects.conv_kernel.kernel = clCreateKernel(openCLObjects.program, "conv_kernel_half", &err); 176 | SAMPLE_CHECK_ERRORS(err); 177 | clGetKernelWorkGroupInfo( 178 | openCLObjects.conv_kernel.kernel, 179 | openCLObjects.device, 180 | CL_KERNEL_WORK_GROUP_SIZE, 181 | sizeof(size_t), 182 | &openCLObjects.conv_kernel.kernel_max_workgroup_size, 183 | NULL 184 | ); 185 | 186 | openCLObjects.conv_kernel_float.kernel = clCreateKernel(openCLObjects.program, "conv_kernel_float", &err); 187 | SAMPLE_CHECK_ERRORS(err); 188 | clGetKernelWorkGroupInfo( 189 | openCLObjects.conv_kernel_float.kernel, 190 | openCLObjects.device, 191 | CL_KERNEL_WORK_GROUP_SIZE, 192 | sizeof(size_t), 193 | &openCLObjects.conv_kernel_float.kernel_max_workgroup_size, 194 | NULL 195 | ); 196 | 197 | openCLObjects.maxpool_kernel.kernel = clCreateKernel(openCLObjects.program, "maxpool_kernel_half", &err); 198 | SAMPLE_CHECK_ERRORS(err); 199 | clGetKernelWorkGroupInfo( 200 | openCLObjects.maxpool_kernel.kernel, 201 | openCLObjects.device, 202 | CL_KERNEL_WORK_GROUP_SIZE, 203 | sizeof(size_t), 204 | &openCLObjects.maxpool_kernel.kernel_max_workgroup_size, 205 | NULL 206 | ); 207 | 208 | openCLObjects.maxpool_kernel_float.kernel = clCreateKernel(openCLObjects.program, "maxpool_kernel_float", &err); 209 | SAMPLE_CHECK_ERRORS(err); 210 | clGetKernelWorkGroupInfo( 211 | openCLObjects.maxpool_kernel_float.kernel, 212 | openCLObjects.device, 213 | CL_KERNEL_WORK_GROUP_SIZE, 214 | sizeof(size_t), 215 | &openCLObjects.maxpool_kernel_float.kernel_max_workgroup_size, 216 | NULL 217 
| ); 218 | 219 | openCLObjects.conv_fc_kernel.kernel = clCreateKernel(openCLObjects.program, "conv_fc_kernel_half", &err); 220 | SAMPLE_CHECK_ERRORS(err); 221 | clGetKernelWorkGroupInfo( 222 | openCLObjects.conv_fc_kernel.kernel, 223 | openCLObjects.device, 224 | CL_KERNEL_WORK_GROUP_SIZE, 225 | sizeof(size_t), 226 | &openCLObjects.conv_fc_kernel.kernel_max_workgroup_size, 227 | NULL 228 | ); 229 | 230 | openCLObjects.conv_fc_kernel_float.kernel = clCreateKernel(openCLObjects.program, "conv_fc_kernel_float", &err); 231 | SAMPLE_CHECK_ERRORS(err); 232 | clGetKernelWorkGroupInfo( 233 | openCLObjects.conv_fc_kernel_float.kernel, 234 | openCLObjects.device, 235 | CL_KERNEL_WORK_GROUP_SIZE, 236 | sizeof(size_t), 237 | &openCLObjects.conv_fc_kernel_float.kernel_max_workgroup_size, 238 | NULL 239 | ); 240 | 241 | openCLObjects.fully_connected_kernel.kernel = clCreateKernel(openCLObjects.program, "fully_connected_kernel_half", &err); 242 | SAMPLE_CHECK_ERRORS(err); 243 | clGetKernelWorkGroupInfo( 244 | openCLObjects.fully_connected_kernel.kernel, 245 | openCLObjects.device, 246 | CL_KERNEL_WORK_GROUP_SIZE, 247 | sizeof(size_t), 248 | &openCLObjects.fully_connected_kernel.kernel_max_workgroup_size, 249 | NULL 250 | ); 251 | 252 | openCLObjects.fully_connected_kernel_float.kernel = clCreateKernel(openCLObjects.program, "fully_connected_kernel_float", &err); 253 | SAMPLE_CHECK_ERRORS(err); 254 | clGetKernelWorkGroupInfo( 255 | openCLObjects.fully_connected_kernel_float.kernel, 256 | openCLObjects.device, 257 | CL_KERNEL_WORK_GROUP_SIZE, 258 | sizeof(size_t), 259 | &openCLObjects.fully_connected_kernel_float.kernel_max_workgroup_size, 260 | NULL 261 | ); 262 | 263 | openCLObjects.lrn_kernel.kernel = clCreateKernel(openCLObjects.program, "cross_channels_lrn_kernel_half", &err); 264 | clGetKernelWorkGroupInfo( 265 | openCLObjects.lrn_kernel.kernel, 266 | openCLObjects.device, 267 | CL_KERNEL_WORK_GROUP_SIZE, 268 | sizeof(size_t), 269 | 
&openCLObjects.lrn_kernel.kernel_max_workgroup_size, 270 | NULL 271 | ); 272 | SAMPLE_CHECK_ERRORS(err); 273 | 274 | openCLObjects.lrn_kernel_float.kernel = clCreateKernel(openCLObjects.program, "cross_channels_lrn_kernel_float", &err); 275 | clGetKernelWorkGroupInfo( 276 | openCLObjects.lrn_kernel_float.kernel, 277 | openCLObjects.device, 278 | CL_KERNEL_WORK_GROUP_SIZE, 279 | sizeof(size_t), 280 | &openCLObjects.lrn_kernel_float.kernel_max_workgroup_size, 281 | NULL 282 | ); 283 | SAMPLE_CHECK_ERRORS(err); 284 | 285 | openCLObjects.activation_kernel.kernel = clCreateKernel(openCLObjects.program, "activation_kernel_half", &err); 286 | SAMPLE_CHECK_ERRORS(err); 287 | 288 | openCLObjects.activation_kernel_float.kernel = clCreateKernel(openCLObjects.program, "activation_kernel_float", &err); 289 | SAMPLE_CHECK_ERRORS(err); 290 | 291 | openCLObjects.convert_float_to_half_kernel.kernel = clCreateKernel(openCLObjects.program, "convertFloatToHalf", &err); 292 | SAMPLE_CHECK_ERRORS(err); 293 | 294 | openCLObjects.convert_half_to_float_kernel.kernel = clCreateKernel(openCLObjects.program, "convertHalfToFloat", &err); 295 | SAMPLE_CHECK_ERRORS(err); 296 | 297 | LOGD("initOpenCL finished successfully"); 298 | } 299 | 300 | void shutdown_OpenCL (OpenCLObjects& openCLObjects) { 301 | cl_int err = CL_SUCCESS; 302 | 303 | err = clReleaseKernel(openCLObjects.conv_kernel.kernel); 304 | SAMPLE_CHECK_ERRORS(err); 305 | 306 | err = clReleaseKernel(openCLObjects.conv_fc_kernel.kernel); 307 | SAMPLE_CHECK_ERRORS(err); 308 | 309 | err = clReleaseKernel(openCLObjects.maxpool_kernel.kernel); 310 | SAMPLE_CHECK_ERRORS(err); 311 | 312 | err = clReleaseKernel(openCLObjects.lrn_kernel.kernel); 313 | SAMPLE_CHECK_ERRORS(err); 314 | 315 | err = clReleaseKernel(openCLObjects.fully_connected_kernel.kernel); 316 | SAMPLE_CHECK_ERRORS(err); 317 | 318 | err = clReleaseKernel(openCLObjects.conv_kernel_float.kernel); 319 | SAMPLE_CHECK_ERRORS(err); 320 | 321 | err = 
clReleaseKernel(openCLObjects.conv_fc_kernel_float.kernel); 322 | SAMPLE_CHECK_ERRORS(err); 323 | 324 | err = clReleaseKernel(openCLObjects.maxpool_kernel_float.kernel); 325 | SAMPLE_CHECK_ERRORS(err); 326 | 327 | err = clReleaseKernel(openCLObjects.lrn_kernel_float.kernel); 328 | SAMPLE_CHECK_ERRORS(err); 329 | 330 | err = clReleaseKernel(openCLObjects.fully_connected_kernel_float.kernel); 331 | SAMPLE_CHECK_ERRORS(err); 332 | 333 | err = clReleaseKernel(openCLObjects.convert_float_to_half_kernel.kernel); 334 | SAMPLE_CHECK_ERRORS(err); 335 | 336 | err = clReleaseKernel(openCLObjects.convert_half_to_float_kernel.kernel); 337 | SAMPLE_CHECK_ERRORS(err); 338 | 339 | err = clReleaseProgram(openCLObjects.program); 340 | SAMPLE_CHECK_ERRORS(err); 341 | 342 | err = clReleaseCommandQueue(openCLObjects.queue); 343 | SAMPLE_CHECK_ERRORS(err); 344 | 345 | err = clReleaseContext(openCLObjects.context); 346 | SAMPLE_CHECK_ERRORS(err); 347 | 348 | LOGD("shutdownOpenCL finished successfully"); 349 | } -------------------------------------------------------------------------------- /app/src/main/java/com/lanytek/deepsensev3/MainActivity.java: -------------------------------------------------------------------------------- 1 | package com.lanytek.deepsensev3; 2 | 3 | import android.app.Activity; 4 | import android.content.Intent; 5 | import android.database.Cursor; 6 | import android.graphics.Bitmap; 7 | import android.graphics.Canvas; 8 | import android.graphics.Color; 9 | import android.graphics.Paint; 10 | import android.net.Uri; 11 | import android.os.AsyncTask; 12 | import android.os.Environment; 13 | import android.provider.MediaStore; 14 | import android.support.v7.app.AppCompatActivity; 15 | import android.os.Bundle; 16 | import android.util.Log; 17 | import android.view.View; 18 | import android.widget.Button; 19 | import android.widget.ImageView; 20 | import android.widget.TextView; 21 | 22 | import com.squareup.picasso.Picasso; 23 | 24 | import java.io.BufferedReader; 
25 | import java.io.File; 26 | import java.io.FileNotFoundException; 27 | import java.io.FileReader; 28 | import java.io.IOException; 29 | import java.util.ArrayList; 30 | import java.util.List; 31 | import java.util.concurrent.ExecutorService; 32 | import java.util.concurrent.Executors; 33 | import java.util.concurrent.TimeUnit; 34 | 35 | public class MainActivity extends AppCompatActivity { 36 | public static String TAG = "DeepSense"; 37 | 38 | private List img_recognition_descriptions = new ArrayList<>(); 39 | private static final String [] yolo_descriptions = {"aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"}; 40 | 41 | private static String model_yolo_tiny = (new File(Environment.getExternalStorageDirectory(), "YoloModels/Yolo-Tiny-New-Format")).getAbsolutePath(); 42 | private static String model_img_recognition = (new File(Environment.getExternalStorageDirectory(), "ImageNetModels/Vgg_F-New-Format")).getAbsolutePath(); 43 | 44 | private Activity activity = this; 45 | 46 | private ImageView iv; 47 | 48 | private Button btn_loadModelGPU; 49 | private Button btn_processImage; 50 | private TextView tv_runtime, tv_desc; 51 | 52 | private static final int SELECT_PICTURE = 9999; 53 | private String selectedImagePath = null; 54 | 55 | // Used to load the 'native-lib' library on application startup. 
56 | static { 57 | System.loadLibrary("deepsense"); 58 | } 59 | 60 | @Override 61 | protected void onCreate(Bundle savedInstanceState) { 62 | super.onCreate(savedInstanceState); 63 | setContentView(R.layout.activity_main); 64 | 65 | iv = (ImageView) findViewById(R.id.iv_image); 66 | btn_loadModelGPU = (Button) findViewById(R.id.btn_loadModelGPU); 67 | btn_processImage = (Button) findViewById(R.id.btn_processImage); 68 | tv_runtime = (TextView) findViewById(R.id.tv_runTime); 69 | tv_desc = (TextView) findViewById(R.id.tv_desc); 70 | 71 | new async_copy_kernel_code().execute("deepsense.cl"); 72 | 73 | btn_loadModelGPU.setOnClickListener(new View.OnClickListener() { 74 | @Override 75 | public void onClick(View v) { 76 | new async_loadModel().execute(); 77 | } 78 | }); 79 | 80 | btn_processImage.setOnClickListener(new View.OnClickListener() { 81 | @Override 82 | public void onClick(View v) { 83 | new async_processImage_yolo().execute(); 84 | //new async_processImage_img_recognition().execute(); 85 | } 86 | }); 87 | 88 | iv.setOnClickListener(new View.OnClickListener() { 89 | @Override 90 | public void onClick(View v) { 91 | Intent intent = new Intent(); 92 | intent.setType("image/*"); 93 | intent.setAction(Intent.ACTION_GET_CONTENT); 94 | startActivityForResult(Intent.createChooser(intent, "Select Picture"), SELECT_PICTURE); 95 | } 96 | }); 97 | } 98 | 99 | private void setButtons(boolean isEnabled) { 100 | //btn_loadModelCPU.setEnabled(isEnabled); 101 | btn_loadModelGPU.setEnabled(isEnabled); 102 | btn_processImage.setEnabled(isEnabled); 103 | } 104 | 105 | private class async_copy_kernel_code extends AsyncTask { 106 | 107 | @Override 108 | protected void onPreExecute() { 109 | super.onPreExecute(); 110 | setButtons(false); 111 | } 112 | 113 | @Override 114 | protected Void doInBackground(String... 
params) { 115 | for(String p : params) { 116 | Utilities.copyFile(activity, p); 117 | } 118 | return null; 119 | } 120 | 121 | @Override 122 | protected void onPostExecute(Void aVoid) { 123 | super.onPostExecute(aVoid); 124 | setButtons(true); 125 | } 126 | } 127 | 128 | private class async_loadModel extends AsyncTask { 129 | 130 | @Override 131 | protected void onPreExecute() { 132 | setButtons(false); 133 | super.onPreExecute(); 134 | } 135 | 136 | @Override 137 | protected Void doInBackground(Void... params) { 138 | if(new File(model_img_recognition + "/description").exists()) { 139 | try { 140 | img_recognition_descriptions.clear(); 141 | BufferedReader br = new BufferedReader(new FileReader(new File(model_img_recognition + "/description"))); 142 | String line; 143 | while((line = br.readLine()) != null) { 144 | img_recognition_descriptions.add(line); 145 | } 146 | br.close(); 147 | } catch (FileNotFoundException e) { 148 | e.printStackTrace(); 149 | } catch (IOException e) { 150 | e.printStackTrace(); 151 | } 152 | } 153 | 154 | InitGPU(model_yolo_tiny, activity.getPackageName()); 155 | return null; 156 | } 157 | 158 | @Override 159 | protected void onPostExecute(Void aVoid) { 160 | super.onPostExecute(aVoid); 161 | setButtons(true); 162 | } 163 | } 164 | 165 | private class async_processImage_img_recognition extends AsyncTask { 166 | 167 | private double t1,t2; 168 | private double cnn_runtime; 169 | private float [] result; 170 | private Bitmap bm = null; 171 | private int best_idx = -1; 172 | 173 | @Override 174 | protected void onPreExecute() { 175 | btn_processImage.setEnabled(false); 176 | tv_runtime.setText("------"); 177 | tv_desc.setText("..."); 178 | t1 = System.currentTimeMillis(); 179 | super.onPreExecute(); 180 | } 181 | 182 | @Override 183 | protected void onPostExecute(Void aVoid) { 184 | super.onPostExecute(aVoid); 185 | t2 = System.currentTimeMillis(); 186 | double runtime = t2 - t1; 187 | btn_processImage.setEnabled(true); 188 | 
tv_runtime.setText(cnn_runtime + " / " + runtime + " ms"); 189 | tv_desc.setText(img_recognition_descriptions.get(best_idx)); 190 | } 191 | 192 | @Override 193 | protected Void doInBackground(Void... voids) { 194 | 195 | if(selectedImagePath != null) { 196 | final int IMG_X = 224; 197 | final int IMG_Y = 224; 198 | final int IMG_C = 3; 199 | 200 | final float [] bitmapArray = new float[IMG_X * IMG_Y * IMG_C]; 201 | 202 | try { 203 | bm = Picasso.with(activity) 204 | .load(new File(selectedImagePath)) 205 | .config(Bitmap.Config.ARGB_8888) 206 | .resize(448,448) 207 | .get(); 208 | } catch (IOException e) { 209 | e.printStackTrace(); 210 | } 211 | 212 | if(bm != null) { 213 | ExecutorService executor = Executors.newFixedThreadPool(8); 214 | 215 | final double scaleX = (double)IMG_X / (double)bm.getWidth(); 216 | final double scaleY = (double)IMG_Y / (double)bm.getHeight(); 217 | 218 | for(int i = 0 ; i < 224 ; i++) { 219 | final int finalI = i; 220 | executor.execute(new Runnable() { 221 | @Override 222 | public void run() { 223 | for(int j = 0 ; j < IMG_Y ; j++) { 224 | int pixel = bm.getPixel((int)Math.ceil(1/scaleX * finalI),(int)Math.ceil(1/scaleY * j)); 225 | float b = (float)(pixel & 0x000000ff); 226 | float g = (float)((pixel >> 8) & 0x000000ff); 227 | float r = (float)((pixel >> 16) & 0x000000ff); 228 | int index = finalI * IMG_Y + j; 229 | bitmapArray[index * 3] = r - 122.803f; 230 | bitmapArray[index * 3 + 1] = g - 114.885f; 231 | bitmapArray[index * 3 + 2] = b - 101.572f; 232 | } 233 | } 234 | }); 235 | } 236 | 237 | executor.shutdown(); 238 | try { 239 | executor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS); 240 | } catch (InterruptedException e) { 241 | e.printStackTrace(); 242 | } 243 | 244 | double x1 = System.currentTimeMillis(); 245 | float [] result = GetInferrence(bitmapArray); 246 | double x2 = System.currentTimeMillis(); 247 | cnn_runtime = x2 - x1; 248 | Log.d(TAG,"CNN RUNTIME: " + cnn_runtime + "ms"); 249 | 250 | //get top-1 251 | 
float best_prob = 0; 252 | for(int i = 0 ; i < 1000 ; i ++) { 253 | if(best_prob < result[i]) { 254 | best_idx = i; 255 | best_prob = result[i]; 256 | } 257 | } 258 | 259 | Log.d(TAG,"Image classified as : " + img_recognition_descriptions.get(best_idx)); 260 | } 261 | } 262 | 263 | return null; 264 | } 265 | } 266 | 267 | private class async_processImage_yolo extends AsyncTask { 268 | 269 | private double t1,t2; 270 | private double cnn_runtime; 271 | private float [] result; 272 | private Bitmap bm = null; 273 | 274 | @Override 275 | protected void onPreExecute() { 276 | btn_processImage.setEnabled(false); 277 | tv_runtime.setText("------"); 278 | t1 = System.currentTimeMillis(); 279 | super.onPreExecute(); 280 | } 281 | 282 | @Override 283 | protected Void doInBackground(Void... params) { 284 | 285 | if(selectedImagePath != null) { 286 | final int IMG_X = 448; 287 | final int IMG_Y = 448; 288 | final int IMG_C = 3; 289 | 290 | final float [] bitmapArray = new float[IMG_X * IMG_Y * IMG_C]; 291 | 292 | try { 293 | bm = Picasso.with(activity) 294 | .load(new File(selectedImagePath)) 295 | .config(Bitmap.Config.ARGB_8888) 296 | .resize(IMG_X,IMG_Y) 297 | .get(); 298 | } catch (IOException e) { 299 | e.printStackTrace(); 300 | } 301 | 302 | if(bm != null) { 303 | /*ExecutorService executor = Executors.newFixedThreadPool(8); 304 | 305 | for(int w = 0 ; w < bm.getWidth() ; w++) { 306 | final int finalW = w; 307 | executor.execute(new Runnable() { 308 | @Override 309 | public void run() { 310 | for(int h = 0 ; h < bm.getHeight() ; h++) { 311 | int pixel = bm.getPixel(finalW, h); 312 | for(int c = 0 ; c < 3 ; c++) { 313 | bitmapArray[h * IMG_X * IMG_C + finalW * IMG_C + c] = getColorPixel(pixel, c); 314 | } 315 | } 316 | } 317 | }); 318 | } 319 | 320 | executor.shutdown(); 321 | try { 322 | executor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS); 323 | } catch (InterruptedException e) { 324 | e.printStackTrace(); 325 | }*/ 326 | 327 | for(int w = 0 ; w < 
bm.getWidth() ; w++) { 328 | for(int h = 0 ; h < bm.getHeight() ; h++) { 329 | int pixel = bm.getPixel(w, h); 330 | for(int c = 0 ; c < 3 ; c++) { 331 | bitmapArray[h * IMG_X * IMG_C + w * IMG_C + c] = Utilities.getColorPixel(pixel, c); 332 | } 333 | } 334 | } 335 | } 336 | 337 | double x1 = System.currentTimeMillis(); 338 | float [] result = GetInferrence(bitmapArray); 339 | double x2 = System.currentTimeMillis(); 340 | cnn_runtime = x2 - x1; 341 | Log.d(TAG,"CNN RUNTIME: " + cnn_runtime + "ms"); 342 | 343 | int classes = 20; 344 | int side = 7; 345 | int num = 2; 346 | float thresh = 0.15f; 347 | 348 | //process result first 349 | float [][] probs = new float[side * side * num][classes]; 350 | Utilities.box[] boxes = new Utilities.box[side * side * num]; 351 | for(int j = 0 ; j < boxes.length ; j++) 352 | boxes[j] = new Utilities.box(); 353 | 354 | Utilities.convert_yolo_detections(result, classes, num, 1, side, 1, 1, thresh, probs, boxes, 0); 355 | Utilities.do_nms_sort(boxes, probs, side * side * num, classes, 0.5f); 356 | 357 | //do box drawing 358 | final Bitmap mutableBitmap = Bitmap.createScaledBitmap( 359 | bm, 512, 512, false).copy(bm.getConfig(), true); 360 | final Canvas canvas = new Canvas(mutableBitmap); 361 | 362 | for(int i = 0; i < side * side * num; ++i){ 363 | 364 | int classid = -1; 365 | float maxprob = -100000.0f; 366 | for(int j = 0 ; j < classes ; j++) { 367 | if(probs[i][j] > maxprob) { 368 | classid = j; 369 | maxprob = probs[i][j]; 370 | } 371 | } 372 | 373 | if(classid < 0) 374 | continue; 375 | 376 | float prob = probs[i][classid]; 377 | if(prob > thresh){ 378 | Utilities.box b = boxes[i]; 379 | 380 | int left = (int) ((b.x-b.w/2.) * mutableBitmap.getWidth()); 381 | int right = (int) ((b.x+b.w/2.) * mutableBitmap.getWidth()); 382 | int top = (int) ((b.y-b.h/2.) * mutableBitmap.getHeight()); 383 | int bot = (int) ((b.y+b.h/2.) 
* mutableBitmap.getHeight()); 384 | 385 | if(left < 0) left = 0; 386 | if(right > mutableBitmap.getWidth() - 1) right = mutableBitmap.getWidth() - 1; 387 | if(top < 0) top = 0; 388 | if(bot > mutableBitmap.getHeight() - 1) bot = mutableBitmap.getHeight() - 1; 389 | 390 | Paint p = new Paint(); 391 | p.setStrokeWidth(p.getStrokeWidth() * 3); 392 | p.setColor(Color.RED); 393 | canvas.drawLine(left, top, right, top, p); 394 | canvas.drawLine(left, top, left, bot, p); 395 | canvas.drawLine(left, bot, right, bot, p); 396 | canvas.drawLine(right, top, right, bot, p); 397 | 398 | p.setTextSize(48f); 399 | p.setColor(Color.BLUE); 400 | canvas.drawText("" + yolo_descriptions[classid],left + (right - left)/2,top + (bot - top)/2,p); 401 | } 402 | } 403 | 404 | activity.runOnUiThread(new Runnable() { 405 | @Override 406 | public void run() { 407 | iv.setImageBitmap(mutableBitmap); 408 | } 409 | }); 410 | } 411 | 412 | return null; 413 | } 414 | 415 | @Override 416 | protected void onPostExecute(Void aVoid) { 417 | super.onPostExecute(aVoid); 418 | t2 = System.currentTimeMillis(); 419 | double runtime = t2 - t1; 420 | btn_processImage.setEnabled(true); 421 | tv_runtime.setText(cnn_runtime + " / " + runtime + " ms"); 422 | } 423 | } 424 | 425 | public String getPath(Uri uri) { 426 | String[] projection = { MediaStore.Images.Media.DATA }; 427 | Cursor cursor = managedQuery(uri, projection, null, null, null); 428 | int column_index = cursor.getColumnIndexOrThrow(MediaStore.Images.Media.DATA); 429 | cursor.moveToFirst(); 430 | return cursor.getString(column_index); 431 | } 432 | 433 | public void onActivityResult(int requestCode, int resultCode, Intent data) { 434 | if (resultCode == RESULT_OK) { 435 | if (requestCode == SELECT_PICTURE) { 436 | Uri selectedImageUri = data.getData(); 437 | selectedImagePath = getPath(selectedImageUri); 438 | if(selectedImagePath != null) 439 | iv.setImageURI(selectedImageUri); 440 | } 441 | } 442 | } 443 | 444 | /** 445 | * A native method that is 
implemented by the 'native-lib' native library, 446 | * which is packaged with this application. 447 | */ 448 | public native void InitGPU(String model_dir_path, String packageName); 449 | public native float [] GetInferrence(float [] input); 450 | 451 | } 452 | -------------------------------------------------------------------------------- /app/src/main/java/com/lanytek/deepsensev3/Utilities.java: -------------------------------------------------------------------------------- 1 | package com.lanytek.deepsensev3; 2 | 3 | import android.app.Activity; 4 | 5 | import java.io.File; 6 | import java.io.FileOutputStream; 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.OutputStream; 10 | import java.util.Arrays; 11 | import java.util.Comparator; 12 | import java.util.concurrent.ExecutorService; 13 | import java.util.concurrent.Executors; 14 | import java.util.concurrent.TimeUnit; 15 | 16 | /** 17 | * Created by JC1DA on 3/16/16. 18 | */ 19 | public class Utilities { 20 | public static void copyFile(Activity activity, final String f) { 21 | InputStream in; 22 | try { 23 | in = activity.getAssets().open(f); 24 | final File of = new File(activity.getDir("execdir", activity.MODE_PRIVATE), f); 25 | 26 | final OutputStream out = new FileOutputStream(of); 27 | 28 | final byte b[] = new byte[65535]; 29 | int sz = 0; 30 | while ((sz = in.read(b)) > 0) { 31 | out.write(b, 0, sz); 32 | } 33 | in.close(); 34 | out.close(); 35 | } catch (IOException e) { 36 | e.printStackTrace(); 37 | } 38 | } 39 | 40 | public static class box { 41 | public float x,y,w,h; 42 | } 43 | 44 | public static class sortable_bbox { 45 | public int index; 46 | public int classid; 47 | public float [][] probs; 48 | } 49 | 50 | public static float getColorPixel(int pixel, int color) { 51 | float value = 0; 52 | 53 | switch (color) { 54 | case 0: 55 | value = (float)((pixel >> 16) & 0x000000ff) / 255.0f; 56 | break; 57 | case 1: 58 | value = (float)((pixel >> 8) & 0x000000ff) / 
255.0f; 59 | break; 60 | case 2: 61 | value = (float)(pixel & 0x000000ff) / 255.0f; 62 | break; 63 | } 64 | 65 | return value; 66 | } 67 | 68 | public static float colors[][] = { {1.0f,0.0f,1.0f} , {0.0f,0.0f,1.0f} , {0.0f,1.0f,1.0f} , {0.0f,1.0f,0.0f} , {1.0f,1.0f,0.0f} , {1.0f,0.0f,0.0f} }; 69 | public static float get_color(int c, int x, int max) 70 | { 71 | float ratio = ((float)x/max)*5; 72 | int i = (int) Math.floor(ratio); 73 | int j = (int) Math.ceil(ratio); 74 | ratio -= i; 75 | float r = (1-ratio) * colors[i][c] + ratio*colors[j][c]; 76 | //printf("%f\n", r); 77 | return r; 78 | } 79 | 80 | public static void convert_yolo_detections(float [] predictions, int classes, int num, int square, int side, int w, int h, float thresh, float [][] probs, box [] boxes, int only_objectness) 81 | { 82 | int i,j,n; 83 | //int per_cell = 5*num+classes; 84 | for (i = 0; i < side * side; ++i){ 85 | int row = i / side; 86 | int col = i % side; 87 | for(n = 0; n < num; ++n){ 88 | int index = i*num + n; 89 | int p_index = side*side*classes + i*num + n; 90 | float scale = predictions[p_index]; 91 | int box_index = side*side*(classes + num) + (i*num + n)*4; 92 | boxes[index].x = (predictions[box_index + 0] + col) / side * w; 93 | boxes[index].y = (predictions[box_index + 1] + row) / side * h; 94 | boxes[index].w = (float) (Math.pow(predictions[box_index + 2], ((square != 0) ? 2 : 1)) * w); 95 | boxes[index].h = (float) (Math.pow(predictions[box_index + 3], ((square != 0) ? 2 : 1)) * h); 96 | for(j = 0; j < classes; ++j){ 97 | int class_index = i*classes; 98 | float prob = scale*predictions[class_index+j]; 99 | probs[index][j] = (prob > thresh) ? 
prob : 0; 100 | } 101 | if(only_objectness != 0){ 102 | probs[index][0] = scale; 103 | } 104 | } 105 | } 106 | } 107 | 108 | public static void convert_yolo_detections_mt(final float [] predictions, final int classes, final int num, final int square, final int side, final int w, final int h, final float thresh, final float [][] probs, final box [] boxes, final int only_objectness) 109 | { 110 | 111 | ExecutorService executor = Executors.newFixedThreadPool(4); 112 | for (int idx = 0; idx < side * side; ++idx){ 113 | final int i = idx; 114 | Runnable worker = new Runnable() { 115 | @Override 116 | public void run() { 117 | int row = i / side; 118 | int col = i % side; 119 | for(int n = 0; n < num; ++n){ 120 | int index = i*num + n; 121 | int p_index = side*side*classes + i*num + n; 122 | float scale = predictions[p_index]; 123 | int box_index = side*side*(classes + num) + (i*num + n)*4; 124 | boxes[index].x = (predictions[box_index + 0] + col) / side * w; 125 | boxes[index].y = (predictions[box_index + 1] + row) / side * h; 126 | boxes[index].w = (float) (Math.pow(predictions[box_index + 2], ((square != 0) ? 2 : 1)) * w); 127 | boxes[index].h = (float) (Math.pow(predictions[box_index + 3], ((square != 0) ? 2 : 1)) * h); 128 | for(int j = 0; j < classes; ++j){ 129 | int class_index = i*classes; 130 | float prob = scale*predictions[class_index+j]; 131 | probs[index][j] = (prob > thresh) ? prob : 0; 132 | } 133 | if(only_objectness != 0){ 134 | probs[index][0] = scale; 135 | } 136 | } 137 | } 138 | }; 139 | executor.execute(worker); 140 | } 141 | 142 | executor.shutdown(); 143 | try { 144 | executor.awaitTermination(Long.MAX_VALUE, TimeUnit.MILLISECONDS); 145 | } catch (InterruptedException e) { 146 | e.printStackTrace(); 147 | } 148 | } 149 | 150 | public static float overlap(float x1, float w1, float x2, float w2) 151 | { 152 | float l1 = x1 - w1/2; 153 | float l2 = x2 - w2/2; 154 | float left = l1 > l2 ? 
l1 : l2; 155 | float r1 = x1 + w1/2; 156 | float r2 = x2 + w2/2; 157 | float right = r1 < r2 ? r1 : r2; 158 | return right - left; 159 | } 160 | 161 | public static float box_intersection(box a, box b) 162 | { 163 | float w = overlap(a.x, a.w, b.x, b.w); 164 | float h = overlap(a.y, a.h, b.y, b.h); 165 | if(w < 0 || h < 0) return 0; 166 | float area = w*h; 167 | return area; 168 | } 169 | 170 | public static float box_union(box a, box b) 171 | { 172 | float i = box_intersection(a, b); 173 | float u = a.w*a.h + b.w*b.h - i; 174 | return u; 175 | } 176 | 177 | public static float box_iou(box a, box b) 178 | { 179 | return box_intersection(a, b)/box_union(a, b); 180 | } 181 | 182 | public static void do_nms_sort(box [] boxes, float [][] probs, int total, int classes, float thresh) 183 | { 184 | int i, j, k; 185 | sortable_bbox [] s = new sortable_bbox[total]; 186 | for(i = 0 ; i < s.length ; i++) 187 | s[i] = new sortable_bbox(); 188 | 189 | for(i = 0; i < total; ++i){ 190 | s[i].index = i; 191 | s[i].classid = 0; 192 | s[i].probs = probs; 193 | } 194 | 195 | for(k = 0; k < classes; ++k){ 196 | for(i = 0; i < total; ++i){ 197 | s[i].classid = k; 198 | } 199 | 200 | Arrays.sort(s, new Comparator() { 201 | @Override 202 | public int compare(sortable_bbox a, sortable_bbox b) { 203 | float diff = a.probs[a.index][a.classid] - b.probs[b.index][b.classid]; 204 | if (diff < 0) return 1; 205 | else if (diff > 0) return -1; 206 | return 0; 207 | } 208 | }); 209 | 210 | for(i = 0; i < total; ++i){ 211 | if(probs[s[i].index][k] == 0) continue; 212 | box a = boxes[s[i].index]; 213 | for(j = i+1; j < total; ++j){ 214 | box b = boxes[s[j].index]; 215 | if (box_iou(a, b) > thresh){ 216 | probs[s[j].index][k] = 0; 217 | } 218 | } 219 | } 220 | } 221 | } 222 | 223 | } 224 | -------------------------------------------------------------------------------- /app/src/main/res/layout/activity_main.xml: -------------------------------------------------------------------------------- 1 | 10 | 
11 | 15 | 16 | 21 | 22 | 23 | 24 | 29 | 30 | 35 | 36 | 42 | 43 | 44 | 45 | 50 | 51 | 56 | 57 | 63 | 64 | 65 | 66 | 72 | 73 |